In [37]:
import fasttext
import numpy as np
import pandas as pd
from scipy import spatial
from scipy.spatial import distance
from sklearn import metrics
from sklearn import svm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
#from sklearn.linear_model import LassoLars
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from resources.basicIO import InputOutput as IO
from resources.tokTT import CommentTokenizer


In [38]:
class FastTextTransformer(BaseEstimator, TransformerMixin):
    """Convert texts into their mean fastText vectors.

    Each text is split on whitespace, every token is looked up with
    ``model[token]`` and the token vectors are averaged element-wise.

    Parameters
    ----------
    model : object
        Word-vector lookup supporting ``model[token]``. ``get_dimension()``
        is only called when a batch contains no text with at least one token.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """Nothing is learned from ``X``; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return one mean vector per text, stacked row-wise.

        Parameters
        ----------
        X : iterable of str
            Texts to embed.

        Returns
        -------
        numpy.ndarray
            One row per text. A text without any token (empty or
            whitespace-only) yields an all-zero row instead of the scalar NaN
            that ``np.mean([])`` would produce, which previously made
            ``np.stack`` fail on mixed batches.

        Raises
        ------
        ValueError
            From ``np.stack`` when ``X`` contains no texts at all.
        """
        means = [
            np.mean([self.model[w] for w in tokens], 0) if tokens else None
            for tokens in (text.split() for text in X)
        ]
        if any(m is None for m in means):
            # Borrow shape/dtype from a real row when possible so the filler
            # always matches; only ask the model when every text is empty.
            template = next((m for m in means if m is not None), None)
            if template is None:
                filler = np.zeros(self.model.get_dimension(), dtype=np.float32)
            else:
                filler = np.zeros_like(template)
            means = [filler if m is None else m for m in means]
        return np.stack(means)

In [39]:
def classify(small_model, predictor, lines, Y):
    """Fit ``predictor`` on mean fastText embeddings of ``lines``.

    Builds a two-step pipeline (FastTextTransformer -> predictor), fits it on
    ``lines`` with targets ``Y`` and returns the fitted pipeline.
    """
    embedder = FastTextTransformer(model=small_model)
    fitted_pipeline = make_pipeline(embedder, predictor)
    # Pipeline.fit returns the pipeline itself, so returning the local name
    # is equivalent to returning the result of .fit().
    fitted_pipeline.fit(lines, Y)
    return fitted_pipeline


### Load Models

In [40]:
# Load the two unsupervised fastText models from disk. The N_2 / N_3 suffix of
# each variable mirrors the suffix of the .bin file it is read from.
model_N_2 = fasttext.load_model('models/ft_unsupervised_N_2.bin')
model_N_3 = fasttext.load_model('models/ft_unsupervised_N_3.bin')



In [41]:
# Load the seed set through the project tokenizer/cleaner
seed_set = CommentTokenizer.cleaned("datasets/seed_set.txt")
# Load the seed labels that pair with seed_set (used as Y when fitting)
Y = IO.load_nums("datasets/seed_set_labels.txt")

# Load the expanded (N=2 batch) seed set and its labels
batch_set_expanded_N_2 = CommentTokenizer.cleaned("datasets_post/batch_N_2.txt")
Y_batch_N_2 = IO.load_nums("datasets_post/batch_labels_N_2.txt")


In [42]:
# Read each column of the labelled random sample once and slice it into the
# train/test partitions (previously the same CSV was read four times).
SAMPLE_CSV = 'datasets/random_sample.csv'
TRAIN_END = 800   # rows [0, TRAIN_END) are used for training
TEST_END = 1000   # rows [TRAIN_END, TEST_END) are held out for testing

sample_comments = IO.load_csv_col(SAMPLE_CSV, 'comment')
sample_labels = IO.load_csv_col(SAMPLE_CSV, 'label')

# Load testing set
testing_text = sample_comments[TRAIN_END:TEST_END]
testing_text_labels = list(map(int, sample_labels[TRAIN_END:TEST_END]))

# Load training set
training_text = sample_comments[0:TRAIN_END]
training_text_labels = list(map(int, sample_labels[0:TRAIN_END]))

# NOTE(review): the seed/batch sets are loaded via CommentTokenizer.cleaned,
# while these comments come straight from the CSV column. Confirm whether the
# same cleaning should be applied before they are fed to the fastText models.


In [43]:
# Bag-of-words baseline: token counts -> TF-IDF weights -> logistic regression.
# `Pipeline` is imported in the notebook's import cell with the other
# dependencies rather than in the middle of the analysis.
pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", LogisticRegression()),
    ]
)

# Pipeline.fit returns the pipeline itself, so `model` and `pipeline` are the
# same fitted object.
model = pipeline.fit(training_text, training_text_labels)


In [44]:
# Score the TF-IDF baseline on the held-out comments.
y_pred = model.predict(testing_text)
accuracy = metrics.accuracy_score(
    y_true=testing_text_labels,
    y_pred=y_pred,
)
print(accuracy)

0.615


In [45]:
def transform(model, X):
    """Return the mean word vector of every text in ``X``, stacked row-wise.

    Parameters
    ----------
    model : object
        Word-vector lookup supporting ``model[token]``. ``get_dimension()``
        is only called when no text in ``X`` has at least one token.
    X : iterable of str
        Texts to embed; each is split on whitespace.

    Returns
    -------
    numpy.ndarray
        One row per text. A text without any token (empty or whitespace-only)
        yields an all-zero row instead of the scalar NaN that ``np.mean([])``
        would produce, which previously made ``np.stack`` fail on mixed
        batches.

    Raises
    ------
    ValueError
        From ``np.stack`` when ``X`` contains no texts at all.
    """
    means = [
        np.mean([model[w] for w in tokens], 0) if tokens else None
        for tokens in (text.split() for text in X)
    ]
    if any(m is None for m in means):
        # Borrow shape/dtype from a real row when possible so the filler
        # always matches; only ask the model when every text is empty.
        template = next((m for m in means if m is not None), None)
        if template is None:
            filler = np.zeros(model.get_dimension(), dtype=np.float32)
        else:
            filler = np.zeros_like(template)
        means = [filler if m is None else m for m in means]
    return np.stack(means)

# Sanity check: embed a single sentence with model_N_2. As the last expression
# of the cell, the resulting array is displayed as the cell output.
X = ["I am in support of modiji he is right"]
transform(model_N_2,X)

array([[ 0.07449013,  0.83748037, -0.14195217, -0.1837346 , -0.679526  ,
        -0.52007914,  0.4860282 ,  0.24187115, -0.25130326,  0.17439736,
        -0.7833895 , -0.0797803 , -0.17262478, -0.6947161 , -0.5898485 ,
        -0.13970783,  0.09154332, -0.5251334 , -0.59298986, -0.11000982,
         0.8115195 ,  0.34891734,  0.99510795,  0.15121597, -0.41099253,
         0.09172237, -0.22248405,  0.4202536 ,  0.11127239, -0.03859742,
         0.21756229, -0.8070128 , -0.15790884, -0.05473679, -0.50062215,
        -0.08902712,  0.5610346 ,  0.07410225, -0.40861705,  0.24336845]],
      dtype=float32)

## Models

### LR

In [46]:
# classification
LR_ss_N_2 = classify(model_N_2, LogisticRegression(
    random_state=1), seed_set, Y)
LR_ss_N_3 = classify(model_N_3, LogisticRegression(
    random_state=1), seed_set, Y)
LR_bes_N_2 = classify(model_N_2, LogisticRegression(
    random_state=1), batch_set_expanded_N_2, Y_batch_N_2)


### SVM

In [47]:
# Support-vector classifiers with default settings: two on the seed set, one
# on the expanded batch set.
SVM_ss_N_2 = classify(
    small_model=model_N_2,
    predictor=svm.SVC(),
    lines=seed_set,
    Y=Y,
)
SVM_ss_N_3 = classify(
    small_model=model_N_3,
    predictor=svm.SVC(),
    lines=seed_set,
    Y=Y,
)
SVM_bes_N_2 = classify(
    small_model=model_N_2,
    predictor=svm.SVC(),
    lines=batch_set_expanded_N_2,
    Y=Y_batch_N_2,
)


### SGD

In [48]:
# Linear SVM trained with SGD (hinge loss, L1 penalty) on the seed set.
# SGDClassifier shuffles the training data between epochs, so random_state is
# pinned (same value as the LR / RF cells) to make Restart & Run All
# reproduce the same fitted models.
SGD_ss_N_2 = classify(model_N_2, SGDClassifier(
    loss="hinge", penalty="l1", random_state=1), seed_set, Y)
SGD_ss_N_3 = classify(model_N_3, SGDClassifier(
    loss="hinge", penalty="l1", random_state=1), seed_set, Y)

### LDA

In [49]:
# Linear discriminant analysis with default settings, fitted on the seed set.
LDA_ss_N_2 = classify(
    small_model=model_N_2,
    predictor=LinearDiscriminantAnalysis(),
    lines=seed_set,
    Y=Y,
)
LDA_ss_N_3 = classify(
    small_model=model_N_3,
    predictor=LinearDiscriminantAnalysis(),
    lines=seed_set,
    Y=Y,
)

### Decision Tree

In [50]:
# Decision trees fitted on the seed set. The tree builder permutes candidate
# features at each split, so random_state is pinned (same value as the LR / RF
# cells) to keep the fitted trees identical across re-runs.
DT_ss_N_2 = classify(model_N_2, DecisionTreeClassifier(random_state=1), seed_set, Y)
DT_ss_N_3 = classify(model_N_3, DecisionTreeClassifier(random_state=1), seed_set, Y)

### Gaussian NB

In [51]:
# Gaussian naive Bayes with default settings, fitted on the seed set.
GNB_ss_N_2 = classify(
    small_model=model_N_2,
    predictor=GaussianNB(),
    lines=seed_set,
    Y=Y,
)
GNB_ss_N_3 = classify(
    small_model=model_N_3,
    predictor=GaussianNB(),
    lines=seed_set,
    Y=Y,
)

### Random Forest

In [52]:
# Random forests (50 trees, random_state pinned to 1), fitted on the seed set.
RF_ss_N_2 = classify(
    small_model=model_N_2,
    predictor=RandomForestClassifier(n_estimators=50, random_state=1),
    lines=seed_set,
    Y=Y,
)
RF_ss_N_3 = classify(
    small_model=model_N_3,
    predictor=RandomForestClassifier(n_estimators=50, random_state=1),
    lines=seed_set,
    Y=Y,
)


### Utility for all models

In [53]:
# (display name, fitted pipeline) pairs to be evaluated and reported, in order.
models = [
    ('LR Batch Set N=2', LR_bes_N_2),
    ('SVM Batch Set N=2', SVM_bes_N_2),
]

### Print output to file

In [54]:
# Write accuracy and confusion matrix of every registered model to the report
# file. Mode 'w' truncates any previous report in a single open (replacing the
# earlier open-'w+'/close/re-open-'a' sequence), and the context manager
# closes the file even if a prediction raises part-way through.
with open('results/output.txt', 'w') as outfile:
    for name, fitted in models:
        print(name)
        # Predict once and reuse the result for both metrics.
        predictions = fitted.predict(testing_text)
        accuracy = metrics.accuracy_score(testing_text_labels, predictions)
        confusion_matrix = metrics.confusion_matrix(testing_text_labels, predictions)
        print('========= {} Model Test Results ==========='.format(name), file=outfile)
        print(' ', file=outfile)
        print("Model Accuracy:" "\n", accuracy, file=outfile)
        print(' ', file=outfile)
        print("Confusion matrix:" "\n", confusion_matrix, file=outfile)
        print(' ', file=outfile)


LR Batch Set N=2
SVM Batch Set N=2
