## Train fastText

1. Install fastText
2. Read the texts and labels into separate lists
3. Create the train and test files for fastText
4. Train the fastText model
5. Save the model
6. Evaluate the model

In [None]:
import fasttext

train_file = 'train.txt'
test_file = 'test.txt'


def _read_labeled_file(path):
    """Read a tab-separated ``labels<TAB>text`` file.

    The first column is split on whitespace into a list of labels; the
    second column is kept as the text. Blank lines are skipped.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded input file.

    Returns
    -------
    tuple of (list of list of str, list of str)
        Per-row label lists and the matching texts, in file order.
    """
    label_rows, text_rows = [], []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # e.g. a trailing empty line at the end of the file
                continue
            fields = stripped.split('\t')
            label_rows.append(fields[0].split())
            text_rows.append(fields[1])
    return label_rows, text_rows


def _write_fasttext_file(path, label_rows, text_rows):
    """Write rows as ``__label__a __label__b text``, one row per line.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded output file (overwritten).
    label_rows : list of list
        Labels of each row.
    text_rows : list of str
        Text of each row, aligned with ``label_rows``.
    """
    with open(path, 'w', encoding='utf-8') as f:
        for row_labels, text in zip(label_rows, text_rows):
            prefixed = ['__label__' + str(label) for label in row_labels]
            f.write(' '.join(prefixed + [text]) + '\n')


# The helpers use their own local names, so `labels` / `testlabels` keep
# holding the full per-row label lists that the later cells rely on.
labels, texts = _read_labeled_file(train_file)
_write_fasttext_file('fasttext.train', labels, texts)

testlabels, testtexts = _read_labeled_file(test_file)
_write_fasttext_file('fasttext.test', testlabels, testtexts)

In [None]:
# Train a supervised fastText classifier on the generated `fasttext.train`
# file. NOTE(review): 'ova' is presumably fastText's one-vs-all loss for
# multi-label data -- confirm against the fastText documentation.
model = fasttext.train_supervised(
    input="fasttext.train",
    lr=0.5,
    epoch=200,
    wordNgrams=2,
    bucket=200000,
    dim=150,
    loss='ova',
)

In [None]:
# Persist the trained fastText model to `fasttext_model.bin`.
model.save_model("fasttext_model.bin")

In [None]:
# One entry per test text: the first element of fastText's prediction result
# (the predicted `__label__...` strings; the second element is not used).
# NOTE(review): k=-1 with threshold=0.5 presumably keeps every label whose
# score reaches 0.5 -- confirm against the fastText documentation.
preds = [model.predict(text, k=-1, threshold=0.5)[0] for text in testtexts]

In [None]:
# Map every label name to a column index, assigned in order of first
# appearance. Gold labels of the test split are indexed as well, so a label
# that occurs only in the test data still gets a column and later lookups of
# test labels cannot fail with KeyError.
label_dic = {}
for label_rows in (labels, testlabels):
    for row in label_rows:
        for lab in row:
            # len(label_dic) is evaluated before insertion, i.e. it is the
            # next free index; existing entries are left untouched.
            label_dic.setdefault(str(lab), len(label_dic))

# Predicted labels carry fastText's `__label__` prefix; strip it so they share
# keys with the gold labels.
for row in preds:
    for lab in row:
        label_dic.setdefault(lab.replace('__label__', ''), len(label_dic))

In [None]:
import numpy as np
# Make sure every gold test label has a column index before looking it up
# (no-op for labels that are already registered).
for row in testlabels:
    for lab in row:
        label_dic.setdefault(str(lab), len(label_dic))

# Ground truth must come from the *test* split: `preds` was computed from
# `testtexts`, so only `testlabels` lines up with it row for row.
y_true = [[label_dic[str(lab)] for lab in row] for row in testlabels]
y_pred = [[label_dic[lab.replace('__label__', '')] for lab in row] for row in preds]

# Multi-hot encode: one row per sample, one column per known label.
n_labels = len(label_dic)
y_true1 = np.asarray([[1 if col in row else 0 for col in range(n_labels)] for row in y_true])
y_pred1 = np.asarray([[1 if col in row else 0 for col in range(n_labels)] for row in y_pred])

In [None]:
from sklearn.metrics import classification_report
# classification_report returns a plain string here (output_dict=False).
# Print it: as a bare expression a notebook shows the escaped repr of the
# string, and a script shows nothing at all.
report = classification_report(
    y_true1,
    y_pred1,
    output_dict=False,
    # Rows are named after the column indices stored in `label_dic`.
    target_names=[str(i) for i in range(len(label_dic))],
)
print(report)

## Train sklearn models

In [None]:
from matplotlib import pyplot as plt
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid
import pandas as pd
from tqdm import tqdm

In [None]:
class ClfSwitcher(BaseEstimator):
    """Pipeline step that delegates to an interchangeable classifier.

    Lets the classifier itself be swapped through ``set_params`` (for
    example ``clf__estimator=...`` when used as the ``clf`` pipeline step).
    Every method forwards to the wrapped estimator.
    """

    def __init__(self, estimator=None):
        """
        Parameters
        ----------
        estimator: sklearn object, the classifier. When omitted, a new
            ``RandomForestClassifier()`` is created for this instance.
        """
        # A default of `RandomForestClassifier()` in the signature would be
        # built once at definition time and shared by every ClfSwitcher
        # created without an argument; build a fresh one per instance instead.
        if estimator is None:
            estimator = RandomForestClassifier()
        self.estimator = estimator

    def fit(self, X, y):
        """Fit the wrapped estimator in place and return ``self``."""
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return the wrapped estimator's ``predict_proba`` output for ``X``."""
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped estimator's ``score`` for ``X`` and ``y``."""
        return self.estimator.score(X, y)

In [None]:
# Two-step pipeline: TF-IDF features feeding a swappable classifier. Both
# steps are reconfigured later through `pipeline.set_params(...)`.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', ClfSwitcher()),
    ]
)

In [None]:
# Seed shared by every estimator that accepts `random_state`.
RS = 47

# Every candidate classifier is combined with every n-gram range.
grid = ParameterGrid(
    {
        'clf__estimator': [
            MultiOutputClassifier(
                LogisticRegression(class_weight='balanced', random_state=RS),
                n_jobs=-1,
            ),
            MultiOutputClassifier(
                SGDClassifier(class_weight='balanced', random_state=RS, loss='modified_huber'),
                n_jobs=-1,
            ),
            MultiOutputClassifier(
                LinearSVC(class_weight='balanced', random_state=RS),
                n_jobs=-1,
            ),
            KNeighborsClassifier(n_jobs=-1),
            RandomForestClassifier(class_weight='balanced', random_state=RS, n_jobs=-1),
        ],
        'tfidf__ngram_range': [(1, 1), (1, 2)],
    }
)

# Display names, matched to the grid entries purely by position when both
# are zipped together in the scoring loop: two names per classifier.
models = [
    f'{prefix}{variant}'
    for prefix in ('logreg', 'sgd', 'svm', 'knn', 'rf')
    for variant in (1, 2)
]

In [None]:
import numpy as np
# Column indices of each training row's labels.
# NOTE(review): assumes `labels` still holds the per-row training label lists
# read from `train_file` -- verify nothing rebinds that name before this cell.
y_true_train = [[label_dic[str(name)] for name in row] for row in labels]

# Multi-hot encode: one row per training sample, one column per known label.
y_true1_train = np.asarray([
    np.asarray([int(col in row) for col in range(len(label_dic))])
    for row in y_true_train
])


In [None]:
from sklearn.metrics import precision_recall_fscore_support
def score(y_true, y_pred, index):
    """Calculate weighted-average precision, recall, and f1 score.

    Parameters
    ----------
    y_true : ground-truth labels, passed to precision_recall_fscore_support
    y_pred : predicted labels, same layout as ``y_true``
    index : row label for the returned frame (e.g. a model name)

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``index`` with the columns ``precision``,
        ``recall`` and ``f1``.
    """
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    performance = {'precision': precision, 'recall': recall, 'f1': f1}
    return pd.DataFrame(performance, index=[index])


# Names and parameter sets are paired by position only; zip() would silently
# drop the surplus entries, so refuse to run on a mismatch.
if len(models) != len(grid):
    raise ValueError(
        f'{len(models)} model names for {len(grid)} parameter combinations'
    )

score_frames = []
# Loop names are chosen so they do not overwrite the fastText `model` or the
# `y_pred` list created by earlier cells.
for model_name, params in tqdm(zip(models, grid), total=len(models)):
    pipeline.set_params(**params)
    pipeline.fit(texts, y_true1_train)
    test_pred = pipeline.predict(testtexts)
    score_frames.append(score(y_true1, test_pred, model_name))

# Concatenate once at the end instead of re-copying the frame per iteration;
# pd.concat rejects an empty list, hence the fallback.
scores = pd.concat(score_frames) if score_frames else pd.DataFrame()

scores