In [14]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
import fasttext
import pandas as pd
from scipy import spatial
from scipy.spatial import distance
from resources.basicIO import InputOutput as IO
from resources.tokTT import CommentTokenizer
from sklearn import svm

In [15]:
class FastTextTransformer(BaseEstimator, TransformerMixin):
    """Convert texts into their mean fastText vectors.

    Each text is split on whitespace, every token is looked up with
    ``model[token]`` and the resulting vectors are averaged element-wise.

    Parameters
    ----------
    model : object
        Word-vector lookup supporting ``model[word]``. If it also exposes
        ``get_dimension()`` (presumably true for models returned by
        ``fasttext.load_model`` -- confirm against the installed version), that
        is used to size the fallback vector when no text has any tokens.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """No-op fit; present so the transformer can be used in a pipeline."""
        return self

    def _mean_vector(self, text):
        """Return the mean token vector of ``text``, or None if it has no tokens."""
        tokens = text.split()
        if not tokens:
            return None
        return np.mean([self.model[w] for w in tokens], axis=0)

    def _zero_vector(self, vectors):
        """Build the all-zero stand-in used for texts without tokens."""
        # Prefer copying shape/dtype from a real vector of this batch so the
        # stand-in is guaranteed to stack with the others.
        for vec in vectors:
            if vec is not None:
                return np.zeros_like(vec)
        get_dimension = getattr(self.model, "get_dimension", None)
        if callable(get_dimension):
            return np.zeros(get_dimension(), dtype=np.float32)
        raise ValueError(
            "Cannot infer the embedding size: every text is empty and the "
            "model does not expose get_dimension()."
        )

    def transform(self, X):
        """Return one mean vector per text in ``X``, stacked along axis 0.

        Texts that are empty or whitespace-only are mapped to a zero vector.
        Without this, ``np.mean`` of an empty list yields a scalar NaN and
        ``np.stack`` fails because the row shapes differ.
        """
        vectors = [self._mean_vector(text) for text in X]
        if any(vec is None for vec in vectors):
            zero = self._zero_vector(vectors)
            vectors = [zero if vec is None else vec for vec in vectors]
        return np.stack(vectors)

In [16]:
def classify(small_model,predictor,lines,Y):
    """Fit a text classifier on top of mean fastText embeddings.

    ``lines`` are embedded with a ``FastTextTransformer`` wrapping
    ``small_model`` and the resulting vectors are fed to ``predictor``.
    Returns the fitted pipeline (embedding step + predictor).
    """
    embedder = FastTextTransformer(model=small_model)
    pipeline = make_pipeline(embedder, predictor)
    # Pipeline.fit returns the pipeline itself, so fitting in place and
    # returning the same object is equivalent to returning the fit() result.
    pipeline.fit(lines, Y)
    return pipeline


### Load Models

In [17]:
# Load the two unsupervised fastText models from disk.
# NOTE(review): the N_2 / N_3 suffix presumably refers to a training setting of
# the embeddings (it also appears in the expanded seed-set file names) -- confirm.
model_N_2 = fasttext.load_model('models/ft_unsupervised_N_2.bin')
model_N_3 = fasttext.load_model('models/ft_unsupervised_N_3.bin')



In [18]:
# Load the seed set texts.
# NOTE(review): CommentTokenizer.cleaned presumably reads the file and returns
# cleaned/tokenized comments, one per entry -- confirm in resources.tokTT.
seed_set = CommentTokenizer.cleaned("datasets/seed_set.txt")
# Load the seed set labels (used as the training target for seed_set).
Y = IO.load_nums("datasets/seed_set_labels.txt")

# Load the expanded seed set paired with model_N_2, together with its labels.
seed_set_expanded_N_2 = CommentTokenizer.cleaned("datasets_post/seed_set_expanded_N_2.txt")
Y_N_2 = IO.load_nums("datasets_post/seed_set_expanded_labels_N_2.txt")

# Load the expanded seed set paired with model_N_3, together with its labels.
seed_set_expanded_N_3 = CommentTokenizer.cleaned("datasets_post/seed_set_expanded_N_3.txt")
Y_N_3 = IO.load_nums("datasets_post/seed_set_expanded_labels_N_3.txt")


In [19]:
# Load testing set
# Rows of random_sample.csv used for evaluation. One slice object is applied to
# both columns so the comments and their labels cannot drift out of alignment
# if the range is ever edited.
TEST_ROWS = slice(100, 200)

testing_text = IO.load_csv_col('datasets/random_sample.csv', 'comment')[TEST_ROWS]
# Labels are converted to int so they compare equal to the classifiers' output.
testing_text_labels = list(
    map(int, IO.load_csv_col('datasets/random_sample.csv', 'label')[TEST_ROWS])
)

# Every accuracy figure below compares these two element by element.
assert len(testing_text) == len(testing_text_labels), (
    "testing_text and testing_text_labels must have the same length"
)


In [20]:
# Train one logistic-regression classifier per (embedding model, training set)
# combination; each call fits a fresh, default-configured LogisticRegression.

# Original seed set, embedded with each of the two models.
classifier_N_2_seed_set_LR = classify(
    model_N_2, LogisticRegression(), seed_set, Y
)
classifier_N_3_seed_set_LR = classify(
    model_N_3, LogisticRegression(), seed_set, Y
)

# Expanded seed sets, each embedded with its matching model.
classifier_N_2_expanded_set_LR = classify(
    model_N_2, LogisticRegression(), seed_set_expanded_N_2, Y_N_2
)
classifier_N_3_expanded_set_LR = classify(
    model_N_3, LogisticRegression(), seed_set_expanded_N_3, Y_N_3
)

### Accuracy

In [21]:
def accuracy_rate(X, Y):
    """Return the percentage of positions at which ``X`` and ``Y`` agree.

    Parameters
    ----------
    X, Y : sequences of equal length
        Typically the reference labels and the predicted labels; the result
        is symmetric in the two arguments.

    Returns
    -------
    float
        Share of matching positions, scaled to the range 0-100. Two empty
        sequences yield 0.0 instead of raising ZeroDivisionError.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` differ in length. A mismatch would otherwise either
        raise IndexError or silently ignore the surplus elements.
    """
    total = len(X)
    if total != len(Y):
        raise ValueError(
            "X and Y must have the same length, got %d and %d" % (total, len(Y))
        )
    if total == 0:
        return 0.0
    # Count with plain ints so the result stays a built-in float even when
    # one of the inputs is a numpy array.
    count = sum(1 for expected, predicted in zip(X, Y) if expected == predicted)
    return (count/total)*100

In [22]:
# Evaluate the logistic-regression classifier trained on the seed set with
# model_N_2 embeddings.
seed_set_N_2_result = classifier_N_2_seed_set_LR.predict(testing_text)
seed_set_N_2_accuracy = accuracy_rate(
    testing_text_labels,
    seed_set_N_2_result,
)
# Show the predictions next to the reference labels, then the accuracy (in %).
print('Predicted', seed_set_N_2_result, sep='\n')
print('Original', testing_text_labels, sep='\n')
print(seed_set_N_2_accuracy)

Predicted
[0 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 1 1 1 0
 1 0 0 0 0 1 0 1 1 1 1 0 1 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1 0 0
 0 0 1 1 1 1 0 0 1 1 1 0 0 1 0 0 1 0 0 0 1 1 0 1 1 0]
Original
[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0]
60.0


In [23]:
# Evaluate the logistic-regression classifier trained on the seed set with
# model_N_3 embeddings and print its accuracy (in %).
seed_set_N_3_result = classifier_N_3_seed_set_LR.predict(
    testing_text
)
seed_set_N_3_accuracy = accuracy_rate(
    testing_text_labels,
    seed_set_N_3_result,
)
print(seed_set_N_3_accuracy)

61.0


In [24]:
# Evaluate the logistic-regression classifier trained on the expanded seed set
# with model_N_2 embeddings and print its accuracy (in %).
classifier_N_2_expanded_set_result = classifier_N_2_expanded_set_LR.predict(testing_text)
classifier_N_2_expanded_set_accuracy = accuracy_rate(
    testing_text_labels,
    classifier_N_2_expanded_set_result,
)
print(classifier_N_2_expanded_set_accuracy)

67.0


In [25]:
# Evaluate the logistic-regression classifier trained on the expanded seed set
# with model_N_3 embeddings and print its accuracy (in %).
classifier_N_3_expanded_set_result = classifier_N_3_expanded_set_LR.predict(
    testing_text
)
classifier_N_3_expanded_set_accuracy = accuracy_rate(
    testing_text_labels,
    classifier_N_3_expanded_set_result,
)
print(classifier_N_3_expanded_set_accuracy)

67.0


In [26]:
# SVM: same data and embeddings as the N_3 expanded-set experiment, but with a
# default-configured support vector classifier as the predictor.
classifier_N_3_expanded_set_SVM = classify(
    model_N_3, svm.SVC(), seed_set_expanded_N_3, Y_N_3
)
classifier_N_3_expanded_set_result_SVM = classifier_N_3_expanded_set_SVM.predict(testing_text)
classifier_N_3_expanded_set_accuracy_SVM = accuracy_rate(
    testing_text_labels,
    classifier_N_3_expanded_set_result_SVM,
)
print(classifier_N_3_expanded_set_accuracy_SVM)


67.0
