In [None]:
from pyspark import init_spark
import pyspark.sql.functions as sf
from pyspark.sql.window import Window as sw
from pyspark.sql.types import StringType, IntegerType, ArrayType
# Create the `spark` handle that the feature-building cell uses for `spark.table(...)`.
# The only setting passed here is the application name.
# NOTE(review): `init_spark` looks like an environment-specific helper rather than stock
# PySpark API - confirm where it comes from before running this elsewhere.
spark = init_spark({"appName": 'fasttext_card_type_features'})

In [1]:
# Features for fasttext card_type vector

%%time
def _collect_card_type_sequences(table_name):
    """Return one row per app_id with its card_type values ordered by transaction_number.

    The result has two columns: `app_id` and `card_type` (an array of strings).

    Sorting the DataFrame and then calling groupBy().agg(collect_list(...)) does not
    guarantee the order of the collected elements, because the aggregation shuffles
    the rows. To make the sequence deterministic, (transaction_number, card_type)
    pairs are collected and sorted inside the aggregate itself.
    """
    events = spark.table(table_name).withColumn(
        'card_type', sf.col('card_type').cast(StringType()))
    # `when` without `otherwise` yields NULL for rows with a NULL card_type, and
    # collect_list skips NULLs - so missing card types are dropped exactly as they
    # were when card_type was collected directly.
    ordered_events = sf.sort_array(sf.collect_list(
        sf.when(sf.col('card_type').isNotNull(),
                sf.struct('transaction_number', 'card_type'))))
    return (events.groupBy('app_id')
            .agg(ordered_events.alias('ordered_events'))
            # Selecting a field of an array<struct> column yields an array of that field.
            .select('app_id', sf.col('ordered_events.card_type').alias('card_type')))


# ---- Train: card_type sequences joined with the target flag ----
train = _collect_card_type_sequences('alfa.andrey_auto_train')
target_train = spark.table('alfa.andrey_auto_target_train')
target_train = target_train.select(['app_id', 'flag'])
train = train.join(target_train, on=['app_id'], how='inner')

train = train.toPandas()
# One space-separated "sentence" of card types per application.
train['card_type'] = train['card_type'].str.join(' ')
train['label'] = train['flag'].map({0: 'good', 1: 'bad'})
# fastText supervised format: "__label__<label> <tokens...>"
train['labels_text'] = '__label__' + train['label']
train.labels_text = train.labels_text.str.cat(train['card_type'], sep=' ')
train = train.sort_values('app_id').reset_index(drop=True)

# `with` guarantees the file is flushed and closed even if writing fails.
with open('./train_ft_card_type.txt', 'w') as training_file:
    training_file.writelines(train.labels_text + '\n')
train.to_pickle('./train_ft_card_type.pkl')


# ---- Test: same sequence construction, no target available ----
test = _collect_card_type_sequences('alfa.andrey_auto_test')

test = test.toPandas()
test['card_type'] = test['card_type'].str.join(' ')
test = test.sort_values('app_id').reset_index(drop=True)
test.to_pickle('./test_ft_card_type.pkl')

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm_notebook

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import re
import fasttext

# GridSearch

In [3]:
def _bad_class_scores(prediction, bad_label='__label__bad'):
    """Convert a fastText top-1 prediction into a score for the "bad" class.

    `prediction` is the `(labels, probabilities)` pair returned by `model.predict`
    for a list of texts. Only the top label and its probability are returned per
    text, so for a two-class model the bad-class score is that probability when the
    top label is `bad_label`, and one minus it otherwise.
    """
    top_labels, top_probs = prediction
    return np.array([prob[0] if labels[0] == bad_label else 1 - prob[0]
                     for labels, prob in zip(top_labels, top_probs)], dtype=float)


def tune(Y, X, YX, Y_int, lr, wordNgrams, epoch, kf, bad_label='__label__bad'):
    """Grid-search fastText hyper-parameters with cross-validated ROC AUC.

    Parameters
    ----------
    Y : unused; kept so existing calls keep working.
    X : sequence of texts passed to `model.predict`.
    YX : sequence of fastText training lines ("__label__<label> <text>").
    Y_int : sequence of integer targets handed to `roc_auc_score`.
    lr, wordNgrams, epoch : iterables of candidate values; every combination is tried.
    kf : splitter exposing `split(X, y)` that yields (train_index, test_index).
    bad_label : fastText label string whose probability is used as the score.

    Returns
    -------
    pandas.DataFrame with columns ['lr', 'wordNgrams', 'epoch', 'mean_acc'].
    Despite its name, 'mean_acc' holds the mean ROC AUC over the folds.

    Side effects: overwrites './train_cv.txt' for every fold and prints one line
    per parameter combination.
    """
    # The splitter yields positional indices, so index plain arrays rather than
    # relying on the Series index labels happening to match positions.
    texts = np.asarray(X, dtype=object)
    training_lines = np.asarray(YX, dtype=object)
    targets = np.asarray(Y_int)

    results = []
    for lr_val in lr:
        for wordNgrams_val in wordNgrams:
            for epoch_val in epoch:
                fold_results = []
                for train_index, test_index in kf.split(X, Y_int):
                    with open('./train_cv.txt', 'w') as training_file:
                        training_file.writelines(
                            line + '\n' for line in training_lines[train_index])
                    model = fasttext.FastText.train_supervised('./train_cv.txt', lr=lr_val, thread=10,
                                                               wordNgrams=wordNgrams_val, epoch=epoch_val)
                    # Previously `1 - top_probability` was used unconditionally, which
                    # inverts the score whenever the model's top label is the bad one.
                    scores = _bad_class_scores(model.predict(texts[test_index].tolist()), bad_label)
                    fold_results.append(roc_auc_score(targets[test_index], scores))
                mean_acc = pd.Series(fold_results).mean()
                print([lr_val, wordNgrams_val, epoch_val, mean_acc])
                results.append([lr_val, wordNgrams_val, epoch_val, mean_acc])
    # Passing `columns` up front also works when the grid is empty.
    return pd.DataFrame(results, columns=['lr', 'wordNgrams', 'epoch', 'mean_acc'])

# kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# results = tune(Y=train['label'], X=train['card_type'], Y_int=train['label_int'], YX=train['labels_text'],
#                lr=[0.1], wordNgrams=[2], epoch=[15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25], kf=kf)

# Main 10 folds

In [4]:
# Reload the pickled train/test feature tables from ./tables_ft_card_type/.
# NOTE(review): the feature-building cell earlier in this notebook pickles to
# './train_ft_card_type.pkl' and './test_ft_card_type.pkl' (no 'tables_ft_card_type/'
# prefix) - presumably the files were moved by hand; confirm before a clean re-run.
train = pd.read_pickle('./tables_ft_card_type/train_ft_card_type.pkl')
test = pd.read_pickle('./tables_ft_card_type/test_ft_card_type.pkl')

# Integer target for stratification and scoring: 1 where label is 'bad', else 0.
train['label_int'] = train['label'].eq('bad').astype(int)

In [6]:
# Out-of-fold scores: for each of the 10 stratified folds, train fastText on the
# other folds and score the held-out rows, then save all scores in original row order.
# (The original first line read `0kf = ...`, which is a syntax error.)
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
dfs_5 = []
fold_splits = kf.split(X=train['card_type'], y=train['label_int'])
for i, (train_index, test_index) in enumerate(tqdm_notebook(fold_splits, total=kf.get_n_splits())):
    # The splitter yields positional indices, hence `.iloc`.
    with open('./tables_ft_card_type/train_cv.txt', 'w') as training_file:
        training_file.writelines(train['labels_text'].iloc[train_index] + '\n')

    model_ft = fasttext.FastText.train_supervised('./tables_ft_card_type/train_cv.txt', lr=0.1, wordNgrams=2,
                                                  epoch=22, seed=42, thread=10)
    top_labels, top_probs = model_ft.predict(train['card_type'].iloc[test_index].tolist())
    # predict() returns only the top label and its probability per text. The bad-class
    # score is that probability when the top label is "bad" and 1 - p otherwise;
    # using `1 - p` unconditionally inverts the score for rows predicted as bad.
    pred = np.array([prob[0] if labels[0] == '__label__bad' else 1 - prob[0]
                     for labels, prob in zip(top_labels, top_probs)], dtype=float)

    # `k` records which fold produced the score; the index keeps the original row position.
    dfs_5.append(pd.DataFrame({'app_id': train['app_id'].iloc[test_index].values,
                               'scores_ft_card_type_2': pred, 'k': i},
                              index=test_index))
dfs_5 = pd.concat(dfs_5).sort_index()
dfs_5.to_csv('./fasttext_result_tables/ft_cv_card_type_fix.csv', index=False)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [7]:
# Final model: train fastText on every training row, then score the test set.
with open('./tables_ft_card_type/train_cv.txt', 'w') as training_file:
    training_file.writelines(train['labels_text'] + '\n')
model_ft = fasttext.FastText.train_supervised('./tables_ft_card_type/train_cv.txt', lr=0.1, wordNgrams=2,
                                              epoch=22, seed=42, thread=10)
top_labels, top_probs = model_ft.predict(test['card_type'].tolist())
# predict() returns only the top label and its probability per text. The bad-class
# score is that probability when the top label is "bad" and 1 - p otherwise;
# using `1 - p` unconditionally inverts the score for rows predicted as bad.
pred = np.array([prob[0] if labels[0] == '__label__bad' else 1 - prob[0]
                 for labels, prob in zip(top_labels, top_probs)], dtype=float)

# `.assign` builds a new frame, so no column is written into a slice of `test`
# (the original assignment triggered pandas' SettingWithCopyWarning).
df_test_5 = test[['app_id']].assign(scores_ft_card_type_2=pred)
df_test_5.to_csv('./fasttext_result_tables/ft_test_card_type_fix.csv', index=False)

______

______

______

______

______

______

______

______

______

______

______

______

______

______