In [9]:
import datetime
import itertools
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import random
import pickle
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold
import torch
from tqdm import tqdm
from sklearn import preprocessing
# NLP
from sklearn.decomposition import PCA
import transformers
import ssl

In [10]:
df = pd.read_csv('/content/train.csv')

In [11]:
df.head()


Unnamed: 0,LOAN_ID,ORIGINAL_LANGUAGE,DESCRIPTION,DESCRIPTION_TRANSLATED,LOAN_AMOUNT,IMAGE_ID,ACTIVITY_NAME,SECTOR_NAME,LOAN_USE,COUNTRY_CODE,COUNTRY_NAME,TOWN_NAME,CURRENCY_POLICY,CURRENCY_EXCHANGE_COVERAGE_RATE,CURRENCY,TAGS,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL
0,1733169,English,Teodora is a 50-year-old married woman from th...,Teodora is a 50-year-old married woman from th...,100,3115271,Weaving,Arts,"to purchase materials like nipa palm, bamboo ...",PH,Philippines,"Maribojoc, Bohol",shared,0.1,PHP,#Elderly,monthly,field_partner
1,1546998,English,Diego is 32 years old and lives in the municip...,Diego is 32 years old and lives in the municip...,1350,2870403,Barber Shop,Services,"to buy two hair clippers, a new barber chair, ...",CO,Colombia,Apartadó,shared,0.1,COP,"user_favorite, user_favorite",monthly,field_partner
2,1808517,Spanish,"Osman, es un joven de 27 años de edad, soltero...","Osman is a young man, 27 years old, single, an...",225,3215705,Farming,Agriculture,to purchase sacks of fertilizers to care for a...,HN,Honduras,"Nueva Frontera, Santa Barbara.",shared,0.1,HNL,,bullet,field_partner
3,1452940,English,"His name is Nino, 31 years old, married to Che...","His name is Nino, 31 years old, married to Che...",350,2745031,Motorcycle Transport,Transportation,"to pay for fuel, tires and change oil for his ...",PH,Philippines,"Silang, Cavite",shared,0.1,PHP,user_favorite,monthly,field_partner
4,1778420,English,"Pictured above is Teresa, often described as a...","Pictured above is Teresa, often described as a...",625,3083800,Farming,Agriculture,to purchase hybrid seeds and fertilizer to imp...,KE,Kenya,Mumias,shared,0.1,KES,"#Eco-friendly, #Sustainable Ag, #Parent, #Elde...",bullet,field_partner


## BERT for Description Translated

In [12]:
description = df[['DESCRIPTION_TRANSLATED']]
description

Unnamed: 0,DESCRIPTION_TRANSLATED
0,Teodora is a 50-year-old married woman from th...
1,Diego is 32 years old and lives in the municip...
2,"Osman is a young man, 27 years old, single, an..."
3,"His name is Nino, 31 years old, married to Che..."
4,"Pictured above is Teresa, often described as a..."
...,...
91328,"Rider is 20 years old. He lives in San Javier,..."
91329,Carmelita works hard to support four children....
91330,"Orn, 60 years of age, appears in the photo. Sh..."
91331,"At 27 years of age, Walter is in a live-in rel..."


In [13]:
import re
def cleaning(text):
    text = re.sub("\n", " ", text) 
    text = re.sub("[^A-Za-z0-9]", " ", text) 
    text = re.sub("[' ']+", " ", text) 
    return text.lower() 

Creating a class for BERT 

In [14]:
import torch
import transformers

from transformers import BertTokenizer


class BertSequenceVectorizer:
    def __init__(self, model_name="bert-base-uncased", max_len=128):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.array:
        inp = self.tokenizer.encode(sentence)
        len_inp = len(inp)

        if len_inp >= self.max_len:
            inputs = inp[:self.max_len]
            masks = [1] * self.max_len
        else:
            inputs = inp + [0] * (self.max_len - len_inp)
            masks = [1] * len_inp + [0] * (self.max_len - len_inp)

        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        bert_out = self.bert_model(inputs_tensor, masks_tensor)
        seq_out, pooled_out = bert_out['last_hidden_state'], bert_out['pooler_output']

        if torch.cuda.is_available():    
            return seq_out[0][0].cpu().detach().numpy() # 0番目は [CLS] token, 768 dim の文章特徴量
        else:
            return seq_out[0][0].detach().numpy()

In [15]:
BSV = BertSequenceVectorizer(
    model_name="bert-base-uncased",
    max_len=128)
features = np.stack(
    description["DESCRIPTION_TRANSLATED"].fillna("").map(lambda x: BSV.vectorize(x).reshape(-1)).values
)
features

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will

array([[-0.24173544, -0.4984473 ,  0.5118569 , ..., -0.26480898,
         0.09317772,  0.08919012],
       [-0.21131033, -0.1476567 ,  0.5652367 , ..., -0.45297495,
         0.62749416,  0.16616036],
       [-0.2587864 ,  0.18665488,  0.26470667, ..., -0.42402166,
         0.33215922,  0.09634593],
       ...,
       [-0.12291255, -0.2524864 ,  0.04267163, ...,  0.04285635,
         0.2550213 , -0.03334789],
       [-0.12975581, -0.24003473,  0.39712143, ..., -0.15320235,
         0.17523946,  0.12909105],
       [ 0.08368339, -0.18113215, -0.04754119, ...,  0.06178788,
        -0.00276763,  0.7165465 ]], dtype=float32)

In [16]:
features.shape

(91333, 768)

In [17]:
train_description = pd.DataFrame(features)

Although the data tends to be huge and we do not want to use everything, we want to represent a multivariate data table as smaller set of variables (summary indices) in order to observe trends, jumps, clusters and outliers. In order to archive this, PCA will be implemented.

In [18]:
pca = PCA(n_components=0.80,random_state=42)
pca.fit_transform(train_description)
print(pca.explained_variance_ratio_)
print('Cumulative principal variance : ', np.sum(pca.explained_variance_ratio_)) 
print('The number of principal components: ', pca.n_components_) 

[0.39623847 0.05335096 0.04748863 0.02975008 0.0251465  0.02147628
 0.01842006 0.01704253 0.01324411 0.01205773 0.01134471 0.01039008
 0.00907982 0.00848714 0.00814909 0.00755839 0.00691977 0.0065606
 0.00638777 0.00576017 0.00560995 0.00523428 0.00495519 0.00491989
 0.0046147  0.00438272 0.00418341 0.004085   0.00390139 0.00379354
 0.00369959 0.00350213 0.00345559 0.00329167 0.00316319 0.00307948
 0.0030113  0.00282294 0.0027719  0.00271122 0.00262791 0.00254488
 0.00248884 0.00245975]
Cumulative principal variance :  0.8021633657635638
The number of principal components:  44


In [19]:
text_vec = pca.fit_transform(train_description)
output_df = pd.DataFrame(text_vec, columns=[f'bert_pca_vecs={i:03}' for i in range(text_vec.shape[1])])

In [20]:
output_df.head()

Unnamed: 0,bert_pca_vecs=000,bert_pca_vecs=001,bert_pca_vecs=002,bert_pca_vecs=003,bert_pca_vecs=004,bert_pca_vecs=005,bert_pca_vecs=006,bert_pca_vecs=007,bert_pca_vecs=008,bert_pca_vecs=009,bert_pca_vecs=010,bert_pca_vecs=011,bert_pca_vecs=012,bert_pca_vecs=013,bert_pca_vecs=014,bert_pca_vecs=015,bert_pca_vecs=016,bert_pca_vecs=017,bert_pca_vecs=018,bert_pca_vecs=019,bert_pca_vecs=020,bert_pca_vecs=021,bert_pca_vecs=022,bert_pca_vecs=023,bert_pca_vecs=024,bert_pca_vecs=025,bert_pca_vecs=026,bert_pca_vecs=027,bert_pca_vecs=028,bert_pca_vecs=029,bert_pca_vecs=030,bert_pca_vecs=031,bert_pca_vecs=032,bert_pca_vecs=033,bert_pca_vecs=034,bert_pca_vecs=035,bert_pca_vecs=036,bert_pca_vecs=037,bert_pca_vecs=038,bert_pca_vecs=039,bert_pca_vecs=040,bert_pca_vecs=041,bert_pca_vecs=042,bert_pca_vecs=043
0,-4.328352,0.546912,-0.890222,-0.93729,-0.763976,-0.342731,-0.074698,-0.079281,1.132151,-1.518477,-1.957097,1.230229,-0.513835,-0.96834,0.417453,-0.363493,-0.380266,0.622745,-0.195617,-0.764002,0.506976,-0.226333,0.094638,0.533665,0.283496,0.575223,-0.327142,-0.072799,-0.355666,-0.106975,0.797504,0.631558,-0.015597,0.264233,0.028232,0.541706,-0.290693,0.211101,-0.617502,0.39074,-0.033636,-0.115777,-0.267159,-0.414777
1,-4.69985,0.686877,1.969688,-0.876878,-0.37323,-1.609557,-1.54746,1.354966,-1.132529,0.330999,0.97082,0.514369,-0.415915,-0.706702,-0.957297,-0.066193,-0.168774,-0.024419,-0.071052,-0.496604,-0.07905,0.210807,-0.691994,-0.03314,-0.089081,-0.095193,0.146759,-0.266657,-0.162631,0.185473,0.505783,0.108108,-0.26195,-0.252206,0.58147,-0.053316,0.261154,0.119192,-0.073515,0.077395,-0.153976,0.14208,-0.294301,0.293447
2,-5.302714,0.922105,2.197651,-0.869895,-0.820012,-0.744913,-1.939083,-0.029063,-0.064893,0.513086,-1.152323,0.407385,-0.222832,0.039545,0.942697,0.143969,0.106293,0.019555,-0.022076,-0.362315,-0.648385,-0.170088,0.235913,0.079518,-0.017074,-0.621319,-0.073742,-0.011357,0.189271,-0.011693,0.640814,-0.569899,0.143973,0.468014,0.380313,0.324636,-0.309307,0.164466,0.248885,0.008778,0.299232,0.075068,-0.080489,-0.147476
3,-4.48894,0.777891,1.610945,-0.051437,-0.522079,-1.506257,-0.181531,0.48533,-0.365606,1.425,-1.391174,0.410193,0.068125,-0.011631,-0.055643,0.406913,0.350087,-0.138597,-0.696896,-0.303579,-0.118768,0.045542,0.51763,-0.106194,-0.181882,0.283582,0.101641,-0.35487,-0.117997,-0.173235,-0.274617,0.228551,0.15252,-0.350182,-0.122653,0.6038,-0.061521,0.250801,-0.229937,-0.509483,0.146945,-0.170731,-0.67915,0.011661
4,-3.044583,-0.961134,0.596959,0.917527,0.703731,-0.716093,-1.263752,-0.14153,0.46663,1.236232,1.305265,0.790094,0.372695,0.578747,-0.320374,-0.157088,0.025217,-1.067326,-0.080476,-0.761532,0.753249,-0.319859,-0.834491,0.497316,0.239272,0.972834,0.004453,-0.1628,0.125951,0.13643,0.800847,-0.543979,-0.341874,0.468301,0.92494,-0.416327,0.43821,0.001687,0.401776,0.351726,0.114663,-0.538796,-0.021726,-0.681904


Same Process for the test data

In [21]:
test = pd.read_csv('/content/test.csv')

In [22]:
test_description = test[['DESCRIPTION_TRANSLATED']]

In [23]:
test_description

Unnamed: 0,DESCRIPTION_TRANSLATED
0,Marcela is 69 years old and married with ten c...
1,Roselia is 48 years old and has five children....
2,"Ma. Marebil is a single woman, 40 years old wi..."
3,"Good day, lenders! Meet one of KBMI’s clients,..."
4,Rosemarie is a married woman with two children...
...,...
91817,"Marjorie is a resident of Tubigon, Bohol. She ..."
91818,"Hello, Kiva community! Meet Janeth, a mother e..."
91819,Komi is 32 years old and married. He is a reno...
91820,"Kalbubu is 56 years old, a widow, and she has ..."


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [24]:
BSV = BertSequenceVectorizer(
    model_name="bert-base-uncased",
    max_len=128)
features_test = np.stack(
    test["DESCRIPTION_TRANSLATED"].fillna("").map(lambda x: BSV.vectorize(x).reshape(-1)).values
)
features_test

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will

array([[-0.02162604, -0.2814147 ,  0.13555394, ..., -0.33768883,
         0.11497159,  0.11687837],
       [ 0.05312145, -0.7417049 ,  0.21979214, ..., -0.59250534,
         0.1289405 ,  0.31543347],
       [ 0.06533924, -0.14293553, -0.149278  , ..., -0.27435064,
        -0.00088617,  0.07431131],
       ...,
       [-0.45251736,  0.05666868,  0.08293466, ..., -0.15872467,
         0.3543674 ,  0.39797008],
       [-0.11534022, -0.3343771 , -0.02148527, ...,  0.09149812,
         0.4145456 ,  0.3540789 ],
       [-0.06062814, -0.41311967, -0.1160955 , ..., -0.32696483,
         0.14316198, -0.03807072]], dtype=float32)

In [25]:
features_test.shape

(91822, 768)

In [26]:
test_description = pd.DataFrame(features_test)

In [27]:
test_text_vec = pca.transform(test_description)
output_test = pd.DataFrame(test_text_vec, columns=[f'bert_pca_vecs={i:03}' for i in range(test_text_vec.shape[1])])

In [28]:
output_test.head()

Unnamed: 0,bert_pca_vecs=000,bert_pca_vecs=001,bert_pca_vecs=002,bert_pca_vecs=003,bert_pca_vecs=004,bert_pca_vecs=005,bert_pca_vecs=006,bert_pca_vecs=007,bert_pca_vecs=008,bert_pca_vecs=009,bert_pca_vecs=010,bert_pca_vecs=011,bert_pca_vecs=012,bert_pca_vecs=013,bert_pca_vecs=014,bert_pca_vecs=015,bert_pca_vecs=016,bert_pca_vecs=017,bert_pca_vecs=018,bert_pca_vecs=019,bert_pca_vecs=020,bert_pca_vecs=021,bert_pca_vecs=022,bert_pca_vecs=023,bert_pca_vecs=024,bert_pca_vecs=025,bert_pca_vecs=026,bert_pca_vecs=027,bert_pca_vecs=028,bert_pca_vecs=029,bert_pca_vecs=030,bert_pca_vecs=031,bert_pca_vecs=032,bert_pca_vecs=033,bert_pca_vecs=034,bert_pca_vecs=035,bert_pca_vecs=036,bert_pca_vecs=037,bert_pca_vecs=038,bert_pca_vecs=039,bert_pca_vecs=040,bert_pca_vecs=041,bert_pca_vecs=042,bert_pca_vecs=043
0,6.170295,-0.690748,-1.865736,-0.306904,-1.595511,-1.846105,0.36754,0.081063,-0.683653,0.127085,-0.407417,0.929732,0.18264,0.100178,-0.1852,0.255256,-0.318823,-0.228494,-0.157352,0.033441,0.155045,0.169257,-0.50095,0.024516,-0.518127,-0.296274,0.061615,0.13587,0.155312,0.027033,-0.295363,-0.346842,0.218114,-0.035163,-0.049701,0.278694,0.13637,0.396159,0.221567,-0.014382,-0.226553,0.088901,0.023095,0.019331
1,-3.255446,-0.283549,-3.525486,0.052102,-0.278628,-1.646948,-0.388735,0.560824,-0.107734,-1.864558,-0.712423,0.601644,1.197472,0.353959,0.572607,1.211849,-0.624038,-0.194473,0.638242,0.00241,0.843246,0.856276,0.076611,-0.207161,0.542033,0.394053,-0.044305,-0.023075,0.151455,0.155093,-0.489161,-0.052052,-0.502948,0.246793,-0.547196,0.136511,0.346534,0.501351,-0.006699,-0.397846,0.048952,-0.31863,-0.464946,-0.221054
2,6.610266,-0.797629,-0.687071,1.126617,-1.068634,-0.582964,0.70772,-0.266406,0.292716,-0.336308,-0.356667,0.327231,-0.598059,0.903781,-0.56979,0.449259,0.97239,0.211276,0.425722,0.941577,-0.865074,0.791385,0.690819,0.9608,-0.02879,-0.017075,0.34114,0.100032,-0.15366,0.242144,0.150386,0.065117,0.516765,0.103205,-0.082316,0.37163,-0.234286,-0.223301,0.322273,0.29425,0.231131,-0.199324,-0.370217,-0.119013
3,-2.28119,-0.358886,-0.887316,0.412723,0.178742,0.308955,0.774114,-0.60752,1.488114,-1.005726,-0.203751,-0.168378,-0.633581,-0.470987,0.276373,-0.468284,1.111052,-0.524657,-0.194813,0.990015,-0.095974,-0.010105,-0.210703,0.293341,0.017062,-0.026513,-0.140598,0.246092,0.599914,-0.159552,-0.20392,0.688838,0.158829,-0.396127,-0.33676,-0.01662,0.381623,-0.006172,0.689686,-0.0881,0.460922,-0.152149,0.428985,0.527931
4,6.225177,-1.270165,-1.157007,0.268485,-0.659381,-1.341322,0.000325,-0.020471,-0.638572,0.144701,-0.029222,0.301983,-0.199325,0.218927,-0.465866,0.016926,0.432289,-0.21497,0.306702,-0.322111,-0.223349,0.293164,0.03223,0.160952,0.053682,-0.245039,-0.192106,0.058049,0.114024,-0.353104,-0.219628,-0.03467,-0.22451,-0.369351,-0.158818,-0.291249,-0.134087,-0.133217,-0.34322,0.099678,-0.029167,0.33177,0.002951,0.032612


In [29]:
output_df.to_csv('translated_df.csv',index=False)
output_test.to_csv('translated_test.csv',index=False)

## BERT for Loan use

In [32]:
df = pd.read_csv('/content/train.csv')
loan_use = df[['LOAN_USE']]

In [33]:
loan_use.head()

Unnamed: 0,LOAN_USE
0,"to purchase materials like nipa palm, bamboo ..."
1,"to buy two hair clippers, a new barber chair, ..."
2,to purchase sacks of fertilizers to care for a...
3,"to pay for fuel, tires and change oil for his ..."
4,to purchase hybrid seeds and fertilizer to imp...


In [34]:
loan_use.isnull().sum()

LOAN_USE    0
dtype: int64

In [36]:
BSV = BertSequenceVectorizer(
    model_name="bert-base-uncased",
    max_len=128)
features_loan = np.stack(
    loan_use["LOAN_USE"].fillna("").map(lambda x: BSV.vectorize(x).reshape(-1)).values
)
features_loan

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


array([[-0.42625305, -0.18134747, -0.5691531 , ..., -0.35933882,
        -0.01911237,  0.20568702],
       [-0.18472293,  0.14233777, -0.4939079 , ..., -0.01785411,
         0.31023964,  0.41539967],
       [-0.69709086,  0.00822136, -0.69294524, ...,  0.07566786,
         0.2995988 ,  0.62147367],
       ...,
       [-0.1269912 ,  0.16883278, -0.3274002 , ...,  0.0737204 ,
        -0.12253695,  0.28383365],
       [-0.41563365,  0.38399398, -0.7371895 , ..., -0.3353454 ,
         0.66786915,  0.38453695],
       [-0.6315305 ,  0.01077621, -0.06907047, ..., -0.11072415,
         0.24069753,  0.7386902 ]], dtype=float32)

In [37]:
features_loan.shape

(91333, 768)

In [38]:
train_loan = pd.DataFrame(features_loan)

In [39]:
pca = PCA(n_components=0.70,random_state=42)
pca.fit_transform(train_loan)
print(pca.explained_variance_ratio_)
print('Cumulative principal variance : ', np.sum(pca.explained_variance_ratio_)) 
print('The number of principal components: ', pca.n_components_) 

[0.19791166 0.09294052 0.06528146 0.05184975 0.0413109  0.03365116
 0.02788574 0.02556001 0.02139069 0.01980715 0.01884646 0.01620687
 0.01518014 0.01364982 0.0127708  0.01165373 0.01057499 0.00956617
 0.00902119 0.00841581]
Cumulative principal variance :  0.7034750148691867
The number of principal components:  20


In [40]:
text_vec = pca.fit_transform(train_loan)
output_df_loan = pd.DataFrame(text_vec, columns=[f'bert_pca_vecs={i:03}' for i in range(text_vec.shape[1])])

In [41]:
output_df_loan.head()

Unnamed: 0,bert_pca_vecs=000,bert_pca_vecs=001,bert_pca_vecs=002,bert_pca_vecs=003,bert_pca_vecs=004,bert_pca_vecs=005,bert_pca_vecs=006,bert_pca_vecs=007,bert_pca_vecs=008,bert_pca_vecs=009,bert_pca_vecs=010,bert_pca_vecs=011,bert_pca_vecs=012,bert_pca_vecs=013,bert_pca_vecs=014,bert_pca_vecs=015,bert_pca_vecs=016,bert_pca_vecs=017,bert_pca_vecs=018,bert_pca_vecs=019
0,-1.873963,-4.209408,-1.984227,1.945851,-0.655921,0.047117,0.224468,0.894264,0.782604,0.543482,-0.164095,-0.967971,-0.340828,-0.3258,-2.563616,0.503421,0.32743,-0.226971,-0.099085,0.372613
1,-2.106485,1.533609,-0.006375,0.5056,-0.567935,-1.166042,2.849667,-0.927671,-2.052044,-0.424963,1.129651,-0.213084,0.005675,0.544461,0.069029,-0.363557,0.029903,0.208493,0.45944,-0.153268
2,4.581669,-0.668851,0.198025,-1.563793,0.834485,-0.072483,-0.603854,-0.629993,-0.246351,-0.751647,-0.273579,1.10096,0.175702,-0.393267,1.006804,-1.402969,0.398263,0.258996,1.027182,0.033943
3,-2.962453,-0.123048,-0.977478,-0.547293,-0.460527,-1.399536,2.117586,-0.214067,-1.046557,0.844355,-0.337443,0.169384,-0.287711,0.193614,-0.161869,-0.513428,0.29397,-0.169428,-0.499021,0.301853
4,1.680044,-4.007218,-0.291104,-2.347064,-1.643721,1.483778,-1.986727,-1.702743,0.539635,-1.132773,0.438819,-1.402346,-1.428143,0.130683,-2.815745,-0.476651,1.439592,-0.889644,0.554334,-0.943748


In [42]:
test_loan_use = test[['LOAN_USE']]

In [43]:
BSV = BertSequenceVectorizer(
    model_name="bert-base-uncased",
    max_len=128)
features_test_loan = np.stack(
    test_loan_use["LOAN_USE"].fillna("").map(lambda x: BSV.vectorize(x).reshape(-1)).values
)
features_test_loan

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


array([[-0.15475358, -0.22458985, -0.7245282 , ..., -0.07552186,
         0.22399327,  0.32503837],
       [-0.26769018, -0.0563505 , -0.45401907, ..., -0.2470951 ,
         0.14184983,  0.3993165 ],
       [-0.2690688 , -0.09582914, -0.4195484 , ..., -0.22475171,
        -0.08977056,  0.4331357 ],
       ...,
       [-0.36572364, -0.11463489, -0.67984647, ..., -0.00687652,
         0.21289454,  0.648167  ],
       [-0.22462344, -0.33622926, -0.7478062 , ..., -0.00740216,
         0.10952365,  0.4831206 ],
       [-0.32016847, -0.17805609, -0.6668866 , ..., -0.29715493,
         0.30646473,  0.27811107]], dtype=float32)

In [44]:
features_test_loan.shape

(91822, 768)

In [45]:
test_loan = pd.DataFrame(features_test_loan)

In [46]:
test_text_vec = pca.transform(test_loan)
output_test_loan = pd.DataFrame(test_text_vec, columns=[f'bert_pca_vecs={i:03}' for i in range(test_text_vec.shape[1])])

In [47]:
output_test_loan.head()

Unnamed: 0,bert_pca_vecs=000,bert_pca_vecs=001,bert_pca_vecs=002,bert_pca_vecs=003,bert_pca_vecs=004,bert_pca_vecs=005,bert_pca_vecs=006,bert_pca_vecs=007,bert_pca_vecs=008,bert_pca_vecs=009,bert_pca_vecs=010,bert_pca_vecs=011,bert_pca_vecs=012,bert_pca_vecs=013,bert_pca_vecs=014,bert_pca_vecs=015,bert_pca_vecs=016,bert_pca_vecs=017,bert_pca_vecs=018,bert_pca_vecs=019
0,1.464348,2.054109,-1.210934,2.141972,-2.380678,0.437955,-0.320017,-0.0958,0.274478,0.545697,-1.180859,1.098655,0.464951,0.393744,-0.645396,-1.077443,-0.238121,0.224336,0.411929,-0.211481
1,-2.60472,-1.10116,-0.998541,0.653458,1.758448,0.439231,-1.799644,-1.791374,-0.829903,0.561219,-0.291154,-0.33727,-1.208851,0.337044,0.688946,0.001357,0.353752,0.187292,-0.446153,0.721414
2,-1.023414,2.061169,-1.92329,0.78887,0.007787,-1.010087,0.305962,-0.148998,1.699789,0.591855,0.065822,-0.312215,0.608632,1.14469,0.846098,-0.369058,-0.394486,-0.237144,-0.536992,0.080827
3,-0.859172,4.013445,2.70058,-0.085576,-0.550115,-0.055093,-1.640168,0.095442,0.640373,0.334183,1.214276,-0.215234,0.167419,-0.617937,-0.284667,0.183852,0.263447,0.144234,-0.365589,-0.82401
4,-5.878501,-0.574069,-0.860044,1.353584,-0.098561,0.9188,0.140739,-1.484944,-0.428838,0.654548,0.824629,1.127113,0.16112,-0.739084,0.251277,0.184213,-0.228299,0.349612,0.097553,0.453247


In [48]:
output_df_loan.to_csv('loan_use_df.csv',index=False)
output_test_loan.to_csv('loan_use_test.csv',index=False)