In [1]:
import os.path
import pandas as pd
import numpy as np
import multiprocessing

from time import time
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm.notebook import tqdm_notebook
from time import time
from os import path
from nltk import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from fasttext import train_supervised

In [2]:
def str2dict(string):
    """Parse a serialized ``{'word': count, ...}`` string into a dict.

    Parameters
    ----------
    string : str
        Text such as ``"{'sad': 5, 'komornik': 15}"``. Items are separated
        by ``", "`` and each item is ``key: integer``.

    Returns
    -------
    dict
        Mapping of key (surrounding spaces and single quotes removed) to
        ``int`` count. An empty serialized dict (``"{}"``) yields ``{}``.

    Raises
    ------
    ValueError
        If the text after the last ``:`` of an item is not an integer.
    """
    body = string.strip()
    # Remove only the enclosing braces so that braces inside a key survive.
    if body.startswith("{"):
        body = body[1:]
    if body.endswith("}"):
        body = body[:-1]

    dictionary = {}
    for item in body.split(", "):
        if not item.strip():
            # Nothing to parse (e.g. the serialized dict was empty).
            continue
        # Split on the LAST colon only: the count is always the trailing
        # field, and a key may itself contain ':' characters.
        key, _, value = item.rpartition(":")
        dictionary[key.strip(' \'')] = int(value)
    return dictionary


# Converts per-word counts into each word's percentage share of all words:
#   {'sad': 5, 'komornik': 15} -> {'sad': 25.0, 'komornik': 75.0}
def cowd2powd(dictionary):
    """Return a new dict mapping each key to its percentage of the total.

    Each value is divided by the sum of all values and multiplied by 100.
    The input mapping is left unmodified.
    """
    total = sum(dictionary.values())
    return {word: count / total * 100 for word, count in dictionary.items()}


def get_clfs(clf_names):
    """Build a fresh list of NLTK-wrapped scikit-learn classifiers.

    Recognised names are 'KNN', 'RaF', 'LoR' and 'MLP'; any other name is
    skipped. The result follows the order of ``clf_names``.
    """
    # Factories (not instances) so every call returns new, unfitted estimators.
    factories = {
        'KNN': lambda: KNeighborsClassifier(n_jobs=-1),
        'RaF': lambda: RandomForestClassifier(n_jobs=-1),
        'LoR': lambda: LogisticRegression(n_jobs=-1),
        'MLP': lambda: MLPClassifier(),
    }
    return [SklearnClassifier(factories[name]())
            for name in clf_names
            if name in factories]


def get_probs(class_probs, class_types):
    """Return the probabilities of ``class_types`` as a NumPy array.

    ``class_probs`` must expose ``prob(label)``; the output follows the
    order of ``class_types``.
    """
    return np.array([class_probs.prob(label) for label in class_types])


def get_final_class(preds, id_typ_documents):
    """Combine several classifiers' probability distributions into one label.

    ``preds`` holds one distribution per classifier (each exposing ``max()``
    and, through ``get_probs``, ``prob(label)``). When every classifier's
    most probable label is the same, that label is returned. Otherwise the
    per-label probabilities are summed over the classifiers and the label
    from ``id_typ_documents`` with the highest total wins (the first one on
    ties).
    """
    top_labels = [dist.max() for dist in preds]
    first = top_labels[0]
    if top_labels.count(first) == len(top_labels):
        # Unanimous vote: no need to look at the probabilities.
        return first

    summed = np.zeros(len(id_typ_documents))
    for dist in preds:
        summed += get_probs(dist, id_typ_documents)
    scores = list(summed)
    return id_typ_documents[scores.index(max(scores))]


def test_sklearn_clf(train_fpath, test_fpath, clfs, is_podw=False, type_cname='type'):
    """Train and score each classifier on one train/test split.

    Parameters
    ----------
    train_fpath, test_fpath : str
        ';'-separated CSV files with a 'codw' column (serialized word-count
        dict, parsed with ``str2dict``) and a label column ``type_cname``.
    clfs : list
        Classifiers exposing ``train``, ``classify_many`` and
        ``prob_classify_many``.
    is_podw : bool
        When true, counts are converted with ``cowd2powd`` before training.
    type_cname : str
        Name of the label column.

    Returns
    -------
    (list, list)
        Macro F1 scores and training times in seconds, one entry per
        classifier in order, followed by one extra entry: the F1 of the
        combined prediction from ``get_final_class`` and the sum of all
        training times.
    """
    train_df = pd.read_csv(train_fpath, sep=';')
    test_df = pd.read_csv(test_fpath, sep=';')

    # Parse the serialized dictionaries into feature dicts.
    train_df['x'] = train_df['codw'].apply(str2dict)
    test_df['x'] = test_df['codw'].apply(str2dict)

    if is_podw:
        train_df['x'] = train_df['x'].apply(cowd2powd)
        test_df['x'] = test_df['x'].apply(cowd2powd)

    labeled_train = train_df[['x', type_cname]].values.tolist()
    test_features = test_df['x'].tolist()
    y_true = test_df[type_cname].tolist()

    f_scores, train_times, prob_dists = [], [], []
    for clf in clfs:
        started = time()
        clf.train(labeled_train)
        train_times.append(time() - started)

        prob_dists.append(clf.prob_classify_many(test_features))
        predicted = clf.classify_many(test_features)

        f_scores.append(f1_score(y_true, predicted, average='macro'))

    class_labels = sorted(list(set(train_df[type_cname])))

    # Transpose so each row holds all classifiers' distributions for one document.
    combined_pred = [get_final_class(per_doc, class_labels)
                     for per_doc in np.array(prob_dists).T]

    f_scores.append(f1_score(y_true, combined_pred, average='macro'))
    train_times.append(sum(train_times))

    return f_scores, train_times

In [3]:
# Root of the dataset and the preprocessing variants to evaluate.
data_path = 'data/limit_5K_per_type_order_by_id_desc'
samples = ['cleaned', 'stemmed', 'lemmatized']

# Sub-folder holding the word-count splits and the number of CV folds.
codw_folder = 'codw'
split_num = 5

# get_clfs also recognises 'MLP'; it is left out of this run.
clf_names = ['KNN', 'RaF', 'LoR']

# Result layout: sample name, one F1 column per classifier plus the combined
# (MULTI) one, then the matching training-time columns.
columns = (['sample']
           + [clf_name + '_fsc' for clf_name in clf_names] + ['MULTI_fsc']
           + [clf_name + '_ttime' for clf_name in clf_names] + ['MULTI_ttime'])

# makedirs creates missing parent folders too and is a no-op when the
# directory already exists (no separate existence check, no race).
os.makedirs(f'{data_path}/20_results', exist_ok=True)

# CODW — count of distinct words (raw counts as features)

In [4]:
# Evaluate every classifier (plus the combined MULTI prediction) on raw word
# counts. Rows are collected in a list and the frame is rebuilt from it,
# because DataFrame.append was removed in pandas 2.0.
result_rows = []
df = pd.DataFrame(columns=columns)

for sample in tqdm_notebook(samples, desc='samples'):
    row = {'sample': sample}

    fscs, ttimes = [], []

    for i in tqdm_notebook(range(split_num), desc='folds'):
        train_fpath = f"{data_path}/10_tt_split/{sample}/{codw_folder}/train_{i}.csv"
        test_fpath = f"{data_path}/10_tt_split/{sample}/{codw_folder}/test_{i}.csv"

        fsc, ttime = test_sklearn_clf(train_fpath, test_fpath, get_clfs(clf_names))
        fscs.append(fsc)
        ttimes.append(ttime)

    # Transpose so each row holds one classifier's values across the folds;
    # the last row is the extra (MULTI) entry returned by test_sklearn_clf.
    fscs = np.array(fscs).T
    ttimes = np.array(ttimes).T

    for e, clf_name in enumerate(clf_names):
        row[f'{clf_name}_fsc'] = fscs[e].mean()
        row[f'{clf_name}_ttime'] = ttimes[e].mean()

    row['MULTI_fsc'] = fscs[-1].mean()
    row['MULTI_ttime'] = ttimes[-1].mean()

    result_rows.append(row)
    df = pd.DataFrame(result_rows, columns=columns)

    # Rewritten after every sample so partial results survive an interruption.
    df.to_csv(f'{data_path}/20_results/codw.csv', index=False, sep=';')

samples:   0%|          | 0/3 [00:00<?, ?it/s]

folds:   0%|          | 0/5 [00:00<?, ?it/s]

folds:   0%|          | 0/5 [00:00<?, ?it/s]

folds:   0%|          | 0/5 [00:00<?, ?it/s]

# PODW — percent of distinct words (counts normalised to percentages)

In [5]:
# Same evaluation as for the raw counts, but with is_podw=True so the counts
# read from the codw folder are converted to percentages before training.
# Rows are collected in a list and the frame is rebuilt from it, because
# DataFrame.append was removed in pandas 2.0.
result_rows = []
df = pd.DataFrame(columns=columns)

for sample in tqdm_notebook(samples, desc='samples'):
    row = {'sample': sample}

    fscs, ttimes = [], []

    for i in tqdm_notebook(range(split_num), desc='folds'):
        train_fpath = f"{data_path}/10_tt_split/{sample}/{codw_folder}/train_{i}.csv"
        test_fpath = f"{data_path}/10_tt_split/{sample}/{codw_folder}/test_{i}.csv"

        fsc, ttime = test_sklearn_clf(train_fpath, test_fpath, get_clfs(clf_names), is_podw=True)
        fscs.append(fsc)
        ttimes.append(ttime)

    # Transpose so each row holds one classifier's values across the folds;
    # the last row is the extra (MULTI) entry returned by test_sklearn_clf.
    fscs = np.array(fscs).T
    ttimes = np.array(ttimes).T

    for e, clf_name in enumerate(clf_names):
        row[f'{clf_name}_fsc'] = fscs[e].mean()
        row[f'{clf_name}_ttime'] = ttimes[e].mean()

    row['MULTI_fsc'] = fscs[-1].mean()
    row['MULTI_ttime'] = ttimes[-1].mean()

    result_rows.append(row)
    df = pd.DataFrame(result_rows, columns=columns)

    # Rewritten after every sample so partial results survive an interruption.
    df.to_csv(f'{data_path}/20_results/podw.csv', index=False, sep=';')

samples:   0%|          | 0/3 [00:00<?, ?it/s]

folds:   0%|          | 0/5 [00:00<?, ?it/s]

folds:   0%|          | 0/5 [00:00<?, ?it/s]

folds:   0%|          | 0/5 [00:00<?, ?it/s]

# FastText

In [4]:
def get_avg_results_fasttext(params, path2sample, split_num, label):
    """Train and test a supervised fastText model on every split and average.

    Parameters
    ----------
    params : dict
        Must contain 'epoch', 'lr', 'loss', 'minCount', 'wordNgrams', 'ws',
        'dim' and 'neg'; they are forwarded to ``train_supervised``. Other
        keys are ignored.
    path2sample : str
        Folder containing ``train_{i}.csv`` / ``test_{i}.csv`` for each
        ``i`` in ``range(split_num)``.
    split_num : int
        Number of splits to evaluate.
    label : str
        Label prefix passed to ``train_supervised``.

    Returns
    -------
    (float, float, float)
        Mean F1 over the splits, mean training time in seconds, and the
        standard deviation of the F1 scores.
    """
    hyperparam_names = ('epoch', 'lr', 'loss', 'minCount', 'wordNgrams', 'ws', 'dim', 'neg')
    f_scores, train_times = [], []

    for i in range(split_num):
        train_file_path = f"{path2sample}/train_{i}.csv"
        test_file_path = f"{path2sample}/test_{i}.csv"

        start_time = time()
        ft_model = train_supervised(input=train_file_path,
                                    label=label,
                                    **{name: params[name] for name in hyperparam_names})
        train_times.append(time() - start_time)

        # Only the second and third values (used as precision and recall)
        # are needed; the first one is discarded.
        _, prec, rec = ft_model.test(test_file_path)

        # The harmonic mean is undefined when both terms are zero; score the
        # split as 0 instead of raising ZeroDivisionError.
        f_scores.append(2 * prec * rec / (prec + rec) if (prec + rec) else 0.0)

    f_scores = np.array(f_scores)
    return np.mean(f_scores, axis=0), np.mean(np.array(train_times)), np.std(f_scores, axis=0)

In [5]:
# Dataset root and the preprocessing variants to evaluate.
data_path = 'data/limit_5K_per_type_order_by_id_desc'
samples = ['cleaned', 'stemmed', 'lemmatized']

# Number of splits, the folder with fastText-formatted files, and the label prefix.
split_num, fasttext_folder, label = 5, 'fasttext', '__class__'

# Default fastText hyperparameters.
epoch, lr, loss, wordNgrams = 10, 3, 'ova', 3
minCount, dim, ws, neg = 1, 50, 3, 5

In [7]:
params2test = {'epoch': [5, 10, 20],
               'lr': [1, 2, 3],
               'loss': ['softmax', 'ova']}

results_fpath = f'{data_path}/20_results/fasttext.csv'
os.makedirs(os.path.dirname(results_fpath), exist_ok=True)

# Continue from earlier results when the file exists; start empty on a fresh
# run instead of failing in read_csv.
if os.path.exists(results_fpath):
    records = pd.read_csv(results_fpath, sep=';').to_dict('records')
else:
    records = []
df = pd.DataFrame(records)

# Loop variables carry a *_value suffix so they do not overwrite the
# module-level epoch / lr / loss defaults.
for epoch_value in tqdm_notebook(params2test['epoch'], desc='epoch'):
    for lr_value in tqdm_notebook(params2test['lr'], desc='lr'):
        for loss_value in tqdm_notebook(params2test['loss'], desc='loss'):

            params = {'epoch': epoch_value, 'lr': lr_value, 'loss': loss_value,
                      'wordNgrams': wordNgrams, 'minCount': minCount,
                      'dim': dim, 'ws': ws, 'neg': neg}
            # The result row starts as a copy of the hyperparameters so the
            # dict handed to the trainer is not polluted with result columns.
            row = dict(params)
            for sample in samples:
                fsc, ttime, fsc_std = get_avg_results_fasttext(params, f"{data_path}/10_tt_split/{sample}/{fasttext_folder}", split_num, label)

                row[f'{sample}_fsc'] = fsc
                row[f'{sample}_fsc_std'] = fsc_std
                row[f'{sample}_ttime'] = ttime

            # DataFrame.append was removed in pandas 2.0: rebuild the frame
            # from the accumulated records and persist after every combination.
            records.append(row)
            df = pd.DataFrame(records)
            df.to_csv(results_fpath, index=False, sep=';')

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

lr:   0%|          | 0/3 [00:00<?, ?it/s]

loss:   0%|          | 0/2 [00:00<?, ?it/s]

loss:   0%|          | 0/2 [00:00<?, ?it/s]

loss:   0%|          | 0/2 [00:00<?, ?it/s]

lr:   0%|          | 0/3 [00:00<?, ?it/s]

loss:   0%|          | 0/2 [00:00<?, ?it/s]

loss:   0%|          | 0/2 [00:00<?, ?it/s]

loss:   0%|          | 0/2 [00:00<?, ?it/s]

lr:   0%|          | 0/3 [00:00<?, ?it/s]

loss:   0%|          | 0/2 [00:00<?, ?it/s]

loss:   0%|          | 0/2 [00:00<?, ?it/s]

loss:   0%|          | 0/2 [00:00<?, ?it/s]

# Doc2Vec

In [6]:
def format_ft2pd(path, label_prefix='__class__'):
    """Load a fastText-formatted text file into a DataFrame.

    Every non-blank line is expected to be ``<label_prefix><int> tok tok ...``.
    The file is read as raw lines rather than through ``pd.read_csv``, so a
    line containing ';' or quote characters is not split or merged by CSV
    parsing.

    Parameters
    ----------
    path : str
        File to read (decoded as UTF-8).
    label_prefix : str
        Prefix removed from the first token to obtain the integer class.

    Returns
    -------
    pandas.DataFrame
        Column 'text' holds the list of tokens after the label, column
        'type' the integer class.

    Raises
    ------
    ValueError
        If the first token of a line is not ``label_prefix`` plus an integer.
    """
    labels, token_lists = [], []
    with open(path, encoding='utf-8') as fh:
        for line in fh:
            tokens = line.split()
            if not tokens:
                # Blank line: nothing to load.
                continue
            labels.append(int(tokens[0].replace(label_prefix, '')))
            token_lists.append(tokens[1:])
    return pd.DataFrame({'text': token_lists, 'type': labels})

In [7]:
# Empty frame that the Doc2Vec evaluation fills with result rows.
df_res = pd.DataFrame()

# Plain scikit-learn estimators keyed by the short name used in result columns.
clfs = dict(
    KNN=KNeighborsClassifier(n_jobs=-1),
    RaF=RandomForestClassifier(n_jobs=-1),
    LoR=LogisticRegression(n_jobs=-1),
)

In [10]:
# Doc2Vec document vectors fed to the scikit-learn classifiers in `clfs`,
# averaged over the splits. Rows are collected in a list because
# DataFrame.append was removed in pandas 2.0.
result_rows = []

for sample in tqdm_notebook(samples, desc='samples'):
    row = {'sample': sample, 'vector_size': 300, 'epoch': 100, 'dm': 0}

    d2v_ttime, fsc, ttime = [], [], []
    for i in tqdm_notebook(range(split_num), desc='split'):
        df_train = format_ft2pd(f'{data_path}/10_tt_split/{sample}/{fasttext_folder}/train_{i}.csv')
        df_test = format_ft2pd(f'{data_path}/10_tt_split/{sample}/{fasttext_folder}/test_{i}.csv')

        # The Doc2Vec time covers tagging, training and inferring the vectors
        # of both the train and the test documents.
        start_time = time()
        train_tagged = df_train.apply(lambda r: TaggedDocument(words=r['text'], tags=[r['type']]), axis=1).tolist()
        # gensim's keyword is `epochs`; the previous `epoch=` is not a Doc2Vec
        # parameter, so the configured number of epochs was never applied.
        model = Doc2Vec(train_tagged, vector_size=row['vector_size'], epochs=row['epoch'], dm=row['dm'], workers=multiprocessing.cpu_count())

        df_train['vec'] = df_train['text'].apply(lambda text: model.infer_vector(text))
        df_test['vec'] = df_test['text'].apply(lambda text: model.infer_vector(text))
        d2v_ttime.append(time()-start_time)

        # Materialise the feature/label lists once instead of once per classifier.
        X_train, y_train = df_train['vec'].tolist(), df_train['type'].tolist()
        X_test, y_test = df_test['vec'].tolist(), df_test['type'].tolist()

        fsc_split, ttime_split = [], []
        for clf in clfs.values():
            start_time = time()
            clf.fit(X_train, y_train)
            ttime_split.append(time()-start_time)
            y_pred = clf.predict(X_test)
            fsc_split.append(f1_score(y_test, y_pred, average='macro'))

        ttime.append(ttime_split)
        fsc.append(fsc_split)

    # After transposing, each row holds one classifier's values across splits.
    d2v_ttime = np.mean(np.array(d2v_ttime))
    fsc_std = np.std(np.array(fsc).T, axis=1)
    fsc = np.mean(np.array(fsc).T, axis=1)
    ttime = np.mean(np.array(ttime).T, axis=1)

    for key, f, f_std, tt in zip(clfs.keys(), fsc, fsc_std, ttime):
        # Reported in minutes; includes the mean Doc2Vec time.
        row[f'{key}_ttime'] = (tt + d2v_ttime)/60
        row[f'{key}_fsc'] = f
        row[f'{key}_fsc_std'] = f_std

    result_rows.append(row)
    df_res = pd.DataFrame(result_rows)
    # Rewritten after every sample so partial results survive an interruption.
    df_res.to_csv(f'{data_path}/20_results/doc2vec.csv', index=False, sep=';')

samples:   0%|          | 0/3 [00:00<?, ?it/s]

split:   0%|          | 0/5 [00:00<?, ?it/s]

split:   0%|          | 0/5 [00:00<?, ?it/s]

split:   0%|          | 0/5 [00:00<?, ?it/s]