In [11]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
import fasttext
import pandas as pd
from scipy import spatial
from scipy.spatial import distance
from resources.basicIO import InputOutput as IO
from resources.tokTT import CommentTokenizer
from sklearn import svm
from sklearn import metrics

In [12]:
class FastTextTransformer(BaseEstimator, TransformerMixin):
    """Convert texts into their mean fastText word vectors.

    Each text is split on whitespace, every token is looked up with
    ``model[token]`` and the token vectors are averaged element-wise.

    Texts that contain no tokens (empty or whitespace-only strings) are
    mapped to an all-zero vector. Without that guard ``np.mean([], 0)``
    yields a NaN scalar, which either makes ``np.stack`` fail (shapes differ
    from the real vectors) or feeds NaN features to the downstream estimator.

    Parameters
    ----------
    model : object
        Word-vector lookup supporting ``model[word]``. When no text in a
        batch has any token, ``model.get_dimension()`` is used (if present)
        to size the zero vectors.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """Nothing is learned here; return ``self`` so pipelines can chain."""
        return self

    def transform(self, X):
        """Return the stacked mean word vectors, one row per text in ``X``.

        Raises
        ------
        ValueError
            If ``X`` is empty (raised by ``np.stack``), or if no text has any
            token and the vector size cannot be obtained from the model.
        """
        rows = []
        for text in X:
            tokens = text.split()
            if tokens:
                rows.append(np.mean([self.model[w] for w in tokens], axis=0))
            else:
                # Placeholder; replaced below once the vector shape is known.
                rows.append(None)

        if any(row is None for row in rows):
            filler = self._zero_vector(rows)
            rows = [filler if row is None else row for row in rows]
        return np.stack(rows)

    def _zero_vector(self, rows):
        """Build the all-zero stand-in used for texts without tokens."""
        # Prefer copying shape/dtype from a real vector of the same batch.
        reference = next((row for row in rows if row is not None), None)
        if reference is not None:
            return np.zeros_like(reference)

        # Whole batch is token-free: ask the model for its vector size.
        get_dimension = getattr(self.model, "get_dimension", None)
        if get_dimension is None:
            raise ValueError(
                "cannot infer the embedding size: no text contains any token"
            )
        return np.zeros(get_dimension())

In [13]:
def classify(small_model, predictor, lines, Y):
    """Fit and return a text-classification pipeline.

    The pipeline first embeds each text with ``FastTextTransformer`` backed
    by ``small_model`` and then hands the vectors to ``predictor``.

    Parameters
    ----------
    small_model : word-vector model passed to ``FastTextTransformer``.
    predictor : estimator used as the final pipeline step (fitted in place).
    lines : training texts.
    Y : training labels aligned with ``lines``.

    Returns
    -------
    The fitted pipeline.
    """
    embedder = FastTextTransformer(model=small_model)
    pipeline = make_pipeline(embedder, predictor)
    pipeline.fit(lines, Y)
    return pipeline


### Load Models

In [14]:
# Load the two pre-trained unsupervised fastText models (N=2 and N=3).
model_N_2, model_N_3 = (
    fasttext.load_model(model_path)
    for model_path in (
        'models/ft_unsupervised_N_2.bin',
        'models/ft_unsupervised_N_3.bin',
    )
)



In [15]:
# Each dataset is a (tokenized texts, numeric labels) pair read from two
# parallel files; texts go through CommentTokenizer.cleaned, labels through
# IO.load_nums.

# Original seed set.
seed_set, Y = (
    CommentTokenizer.cleaned("datasets/seed_set.txt"),
    IO.load_nums("datasets/seed_set_labels.txt"),
)

# Expanded seed set, N=2.
seed_set_expanded_N_2, Y_N_2 = (
    CommentTokenizer.cleaned("datasets_post/seed_set_expanded_N_2.txt"),
    IO.load_nums("datasets_post/seed_set_expanded_labels_N_2.txt"),
)

# Expanded seed set, N=3.
seed_set_expanded_N_3, Y_N_3 = (
    CommentTokenizer.cleaned("datasets_post/seed_set_expanded_N_3.txt"),
    IO.load_nums("datasets_post/seed_set_expanded_labels_N_3.txt"),
)


In [16]:
# Test set: rows 500..999 of the random sample, comments and integer labels.
# NOTE(review): the seed sets are loaded through CommentTokenizer.cleaned while
# these comments are used as read from the CSV -- confirm both sides receive
# the same preprocessing before comparing scores.
testing_text = IO.load_csv_col('datasets/random_sample.csv', 'comment')[500:1000]
testing_text_labels = [
    int(raw_label)
    for raw_label in IO.load_csv_col('datasets/random_sample.csv', 'label')[500:1000]
]


## Models

### Logistic Regression (LR)

In [17]:
# Logistic-regression pipelines: seed set (ss) and expanded set (es), each
# paired with the fastText model of the matching N.
LR_ss_N_2 = classify(
    small_model=model_N_2, predictor=LogisticRegression(), lines=seed_set, Y=Y
)
LR_ss_N_3 = classify(
    small_model=model_N_3, predictor=LogisticRegression(), lines=seed_set, Y=Y
)
LR_es_N_2 = classify(
    small_model=model_N_2,
    predictor=LogisticRegression(),
    lines=seed_set_expanded_N_2,
    Y=Y_N_2,
)
LR_es_N_3 = classify(
    small_model=model_N_3,
    predictor=LogisticRegression(),
    lines=seed_set_expanded_N_3,
    Y=Y_N_3,
)

### Support Vector Machine (SVM)

In [18]:
# SVM pipelines: same four dataset/model combinations as the LR cell, with a
# default-configured svm.SVC as the final step.
SVM_ss_N_2 = classify(
    small_model=model_N_2, predictor=svm.SVC(), lines=seed_set, Y=Y
)
SVM_ss_N_3 = classify(
    small_model=model_N_3, predictor=svm.SVC(), lines=seed_set, Y=Y
)
SVM_es_N_2 = classify(
    small_model=model_N_2,
    predictor=svm.SVC(),
    lines=seed_set_expanded_N_2,
    Y=Y_N_2,
)
SVM_es_N_3 = classify(
    small_model=model_N_3,
    predictor=svm.SVC(),
    lines=seed_set_expanded_N_3,
    Y=Y_N_3,
)


### Collect all trained models for evaluation

In [19]:
# (display name, fitted pipeline) pairs, in the order they are reported.
models = [
    ('LR Seed Set N=2', LR_ss_N_2),
    ('LR Seed Set N=3', LR_ss_N_3),
    ('LR Expanded Set N=2', LR_es_N_2),
    ('LR Expanded Set N=3', LR_es_N_3),
    ('SVM Seed Set N=2', SVM_ss_N_2),
    ('SVM Seed Set N=3', SVM_ss_N_3),
    ('SVM Expanded Set N=2', SVM_es_N_2),
    ('SVM Expanded Set N=3', SVM_es_N_3),
]

### Evaluate each model on the test set and append the results to a file

In [20]:
# Score every fitted pipeline on the test set and append one report section
# per model to results/output.txt.
# Mode "a" appends: re-running this cell adds another copy of the report
# instead of replacing the previous one.
# The context manager guarantees the file is closed even when a prediction or
# metric call raises part-way through the loop.
with open("results/output.txt", "a") as outfile:
    for model_name, fitted_pipeline in models:
        # Predict once and reuse the result for both metrics.
        predictions = fitted_pipeline.predict(testing_text)
        accuracy = metrics.accuracy_score(testing_text_labels, predictions)
        confusion_matrix = metrics.confusion_matrix(testing_text_labels, predictions)

        print('========= {} Model Test Results ==========='.format(model_name), file=outfile)
        print(' ', file=outfile)
        print("Model Accuracy:\n", accuracy, file=outfile)
        print(' ', file=outfile)
        print("Confusion matrix:\n", confusion_matrix, file=outfile)
        print(' ', file=outfile)
