In [1]:
import fasttext
import numpy as np
import pandas as pd
from scipy import spatial
from scipy.spatial import distance
from sklearn import metrics
from sklearn import svm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
#from sklearn.linear_model import LassoLars
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from resources.basicIO import InputOutput as IO
from resources.tokTT import CommentTokenizer


[nltk_data] Downloading package wordnet to C:\Users\AJAY
[nltk_data]     BISWAS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
class FastTextTransformer(BaseEstimator, TransformerMixin):
    """Convert texts into their mean fastText vectors.

    Each text is split on whitespace, every token is looked up with
    ``model[token]`` and the token vectors are averaged. A text without any
    token (empty or whitespace-only string) maps to an all-zero vector, so a
    single blank comment no longer makes the stacking step fail.

    Parameters
    ----------
    model : object
        Word-vector model that can be indexed by token (``model[token]``
        returns a 1-D vector). When no text in a batch has tokens, the vector
        size is taken from ``model.get_dimension()`` if the model offers it.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no fitted state."""
        return self

    def _vector_size(self, known):
        """Return the embedding size, from a computed vector or from the model."""
        if known is not None:
            # Prefer an actual vector: guaranteed to match what gets stacked.
            return known.shape[0]
        get_dimension = getattr(self.model, "get_dimension", None)
        if callable(get_dimension):
            return int(get_dimension())
        raise ValueError(
            "Cannot infer the embedding size: no text contains a token and "
            "the model does not expose get_dimension()."
        )

    def transform(self, X):
        """Return a 2-D array holding one mean word vector per text in ``X``.

        Parameters
        ----------
        X : iterable of str
            Texts to embed.

        Returns
        -------
        numpy.ndarray
            Array with one row per text.

        Raises
        ------
        ValueError
            If ``X`` is empty (raised by ``np.stack``), or if every text is
            token-less and the embedding size cannot be determined.
        """
        means = []
        for text in X:
            tokens = text.split()
            if tokens:
                means.append(np.mean([self.model[w] for w in tokens], 0))
            else:
                # np.mean([]) would yield a scalar NaN and break np.stack.
                means.append(None)

        if any(m is None for m in means):
            known = next((m for m in means if m is not None), None)
            filler = np.zeros(
                self._vector_size(known),
                dtype=np.float32 if known is None else known.dtype,
            )
            means = [filler if m is None else m for m in means]

        return np.stack(means)

In [3]:
def classify(small_model,predictor,lines,Y):
    """Fit and return a pipeline: fastText mean-vector embedding, then `predictor`.

    small_model -- word-vector model handed to FastTextTransformer
    predictor   -- unfitted estimator used as the final pipeline step
    lines       -- training texts
    Y           -- training labels, one per text
    """
    embedder = FastTextTransformer(model=small_model)
    fitted_pipeline = make_pipeline(embedder, predictor)
    # Pipeline.fit returns the pipeline itself, so fitting in place is equivalent.
    fitted_pipeline.fit(lines, Y)
    return fitted_pipeline


### Load Models

In [4]:
# Load the two fastText binaries (the "N_2" and "N_3" variants) in one pass.
model_N_2, model_N_3 = (
    fasttext.load_model(model_path)
    for model_path in (
        'models/ft_unsupervised_N_2.bin',
        'models/ft_unsupervised_N_3.bin',
    )
)



In [5]:
# Each dataset is loaded as a (cleaned texts, numeric labels) pair so that a
# text file and its label file always sit next to each other.

# Seed set
seed_set, Y = (
    CommentTokenizer.cleaned("datasets/seed_set.txt"),
    IO.load_nums("datasets/seed_set_labels.txt"),
)

# Expanded seed set, N=2 variant
seed_set_expanded_N_2, Y_N_2 = (
    CommentTokenizer.cleaned("datasets_post/seed_set_expanded_N_2.txt"),
    IO.load_nums("datasets_post/seed_set_expanded_labels_N_2.txt"),
)

# Expanded seed set, N=3 variant
seed_set_expanded_N_3, Y_N_3 = (
    CommentTokenizer.cleaned("datasets_post/seed_set_expanded_N_3.txt"),
    IO.load_nums("datasets_post/seed_set_expanded_labels_N_3.txt"),
)

# Batch set, N=2 variant (read from batch_N_2.txt, not an expanded seed set file)
batch_set_expanded_N_2, Y_batch_N_2 = (
    CommentTokenizer.cleaned("datasets_post/batch_N_2.txt"),
    IO.load_nums("datasets_post/batch_labels_N_2.txt"),
)


In [6]:
# Load the labelled random sample once and split it by row position:
# rows 0-499 are used for training, rows 500-999 for testing.
# Reading each column a single time avoids parsing the CSV four times and
# avoids temporarily binding the train/test names to the full column.
SAMPLE_CSV = 'datasets/random_sample.csv'
TRAIN_ROWS = slice(0, 500)
TEST_ROWS = slice(500, 1000)

sample_comments = IO.load_csv_col(SAMPLE_CSV, 'comment')
sample_labels = IO.load_csv_col(SAMPLE_CSV, 'label')

# Testing split; labels are converted to int.
testing_text = sample_comments[TEST_ROWS]
testing_text_labels = list(map(int, sample_labels[TEST_ROWS]))

# Training split; labels are converted to int.
training_text = sample_comments[TRAIN_ROWS]
training_text_labels = list(map(int, sample_labels[TRAIN_ROWS]))


In [7]:
# Baseline that does not use fastText embeddings: bag-of-words counts,
# TF-IDF re-weighting, then logistic regression, trained on the training split.
# `Pipeline` is imported in the notebook's import cell with the other imports.
pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", LogisticRegression()),
    ]
)

# Pipeline.fit returns the fitted pipeline, so `model` and `pipeline` are the same object.
model = pipeline.fit(training_text, training_text_labels)


In [8]:
# Score the TF-IDF baseline on the held-out test split.
y_pred = model.predict(testing_text)
accuracy = metrics.accuracy_score(y_true=testing_text_labels, y_pred=y_pred)
print(accuracy)

0.652


In [9]:
def transform(model, X):
    """Return one mean word vector per text in `X`, stacked into a 2-D array.

    Standalone counterpart of FastTextTransformer.transform: every text is
    split on whitespace, each token is looked up as `model[token]`, and the
    token vectors are averaged along axis 0.
    """
    text_means = []
    for text in X:
        token_vectors = [model[token] for token in text.split()]
        text_means.append(np.mean(token_vectors, axis=0))
    return np.stack(text_means)

# Smoke test: embed a single sample sentence with the N=2 model. The call is
# the last expression of the cell, so the resulting array is displayed as the
# cell output (one row for the one input text).
X = ["I am in support of modiji he is right"]
transform(model_N_2,X)

array([[-0.8569376 ,  0.17627913, -0.26320988,  0.6195948 ,  0.40598595,
         0.06675431, -0.1140122 ,  0.11172426, -0.26477864,  0.7313269 ,
         0.26581547,  0.41145974,  0.09487168, -0.442374  , -0.7185457 ,
        -0.27267113,  0.43859398,  0.4779304 , -0.14041455,  0.0945349 ,
         0.39918244, -0.10063259,  0.20902435,  0.2797713 ,  0.93947506,
        -0.09276822,  0.31877443, -0.83058906, -0.8070138 ,  0.24163577,
        -0.70087826,  0.45258236, -0.6021862 , -0.03425203, -0.5246288 ,
         0.07123923, -0.20783737, -0.1182274 ,  0.00873116, -0.47064364]],
      dtype=float32)

## Models

### Logistic Regression (LR)

In [10]:
# Logistic regression (fixed random_state) fitted on each training set, each
# time paired with the fastText model of the matching N variant.
LR_ss_N_2, LR_ss_N_3, LR_es_N_2, LR_es_N_3 = (
    classify(ft_model, LogisticRegression(random_state=1), texts, labels)
    for ft_model, texts, labels in (
        (model_N_2, seed_set, Y),
        (model_N_3, seed_set, Y),
        (model_N_2, seed_set_expanded_N_2, Y_N_2),
        (model_N_3, seed_set_expanded_N_3, Y_N_3),
    )
)


### Support Vector Machine (SVM)

In [11]:
# Default SVC fitted on the seed sets, the expanded seed sets and the N=2
# batch set; a fresh estimator is created for every fit.
SVM_ss_N_2, SVM_ss_N_3, SVM_es_N_2, SVM_es_N_3, SVM_bes_N_2 = (
    classify(ft_model, svm.SVC(), texts, labels)
    for ft_model, texts, labels in (
        (model_N_2, seed_set, Y),
        (model_N_3, seed_set, Y),
        (model_N_2, seed_set_expanded_N_2, Y_N_2),
        (model_N_3, seed_set_expanded_N_3, Y_N_3),
        (model_N_2, batch_set_expanded_N_2, Y_batch_N_2),
    )
)


### Stochastic Gradient Descent (SGD) classifier

In [12]:
# SGD-trained linear classifier with hinge loss and L1 penalty.
# random_state is pinned (same seed as the LR and Random Forest cells) because
# SGDClassifier shuffles the training data; without a seed the fitted models,
# and therefore the reported accuracies, change from run to run.
SGD_ss_N_2 = classify(model_N_2, SGDClassifier(
    loss="hinge", penalty="l1", random_state=1), seed_set, Y)
SGD_ss_N_3 = classify(model_N_3, SGDClassifier(
    loss="hinge", penalty="l1", random_state=1), seed_set, Y)
SGD_es_N_2 = classify(model_N_2, SGDClassifier(
    loss="hinge", penalty="l1", random_state=1), seed_set_expanded_N_2, Y_N_2)
SGD_es_N_3 = classify(model_N_3, SGDClassifier(
    loss="hinge", penalty="l1", random_state=1), seed_set_expanded_N_3, Y_N_3)


### Linear Discriminant Analysis (LDA)

In [13]:
# Linear discriminant analysis with default settings, one fit per training set.
LDA_ss_N_2, LDA_ss_N_3, LDA_es_N_2, LDA_es_N_3 = (
    classify(ft_model, LinearDiscriminantAnalysis(), texts, labels)
    for ft_model, texts, labels in (
        (model_N_2, seed_set, Y),
        (model_N_3, seed_set, Y),
        (model_N_2, seed_set_expanded_N_2, Y_N_2),
        (model_N_3, seed_set_expanded_N_3, Y_N_3),
    )
)

### Decision Tree

In [14]:
# Decision trees, one fit per training set.
# random_state is pinned (same seed as the LR and Random Forest cells): the
# tree builder permutes features at each split, so unseeded trees can differ
# between runs and make the reported results irreproducible.
DT_ss_N_2 = classify(model_N_2, DecisionTreeClassifier(random_state=1), seed_set, Y)
DT_ss_N_3 = classify(model_N_3, DecisionTreeClassifier(random_state=1), seed_set, Y)
DT_es_N_2 = classify(model_N_2, DecisionTreeClassifier(random_state=1),
                     seed_set_expanded_N_2, Y_N_2)
DT_es_N_3 = classify(model_N_3, DecisionTreeClassifier(random_state=1),
                     seed_set_expanded_N_3, Y_N_3)

### Gaussian Naive Bayes (Gaussian NB)

In [15]:
# Gaussian naive Bayes with default settings, one fit per training set.
GNB_ss_N_2, GNB_ss_N_3, GNB_es_N_2, GNB_es_N_3 = (
    classify(ft_model, GaussianNB(), texts, labels)
    for ft_model, texts, labels in (
        (model_N_2, seed_set, Y),
        (model_N_3, seed_set, Y),
        (model_N_2, seed_set_expanded_N_2, Y_N_2),
        (model_N_3, seed_set_expanded_N_3, Y_N_3),
    )
)

### Random Forest

In [16]:
# Random forests (50 trees, fixed random_state), one fit per training set.
RF_ss_N_2, RF_ss_N_3, RF_es_N_2, RF_es_N_3 = (
    classify(
        ft_model,
        RandomForestClassifier(n_estimators=50, random_state=1),
        texts,
        labels,
    )
    for ft_model, texts, labels in (
        (model_N_2, seed_set, Y),
        (model_N_3, seed_set, Y),
        (model_N_2, seed_set_expanded_N_2, Y_N_2),
        (model_N_3, seed_set_expanded_N_3, Y_N_3),
    )
)


### Utility for all models

In [17]:
# Registry of every fitted pipeline as (report title, pipeline) pairs, in the
# order in which they are evaluated and written to the results file.
models = [
    ('Gaussian NB Seed Set N=2', GNB_ss_N_2),
    ('Gaussian NB Seed Set N=3', GNB_ss_N_3),
    ('Gaussian NB Expanded Set N=2', GNB_es_N_2),
    ('Gaussian NB Expanded Set N=3', GNB_es_N_3),
    ('LR Seed Set N=2', LR_ss_N_2),
    ('LR Seed Set N=3', LR_ss_N_3),
    ('LR Expanded Set N=2', LR_es_N_2),
    ('LR Expanded Set N=3', LR_es_N_3),
    ('SVM Seed Set N=2', SVM_ss_N_2),
    ('SVM Seed Set N=3', SVM_ss_N_3),
    ('SVM Expanded Set N=2', SVM_es_N_2),
    ('SVM Expanded Set N=3', SVM_es_N_3),
    ('SVM Batch Set N=2', SVM_bes_N_2),
    ('SGD Seed Set N=2', SGD_ss_N_2),
    ('SGD Seed Set N=3', SGD_ss_N_3),
    ('SGD Expanded Set N=2', SGD_es_N_2),
    ('SGD Expanded Set N=3', SGD_es_N_3),
    ('LDA Seed Set N=2', LDA_ss_N_2),
    ('LDA Seed Set N=3', LDA_ss_N_3),
    ('LDA Expanded Set N=2', LDA_es_N_2),
    ('LDA Expanded Set N=3', LDA_es_N_3),
    ('Decision Trees Seed Set N=2', DT_ss_N_2),
    ('Decision Trees Seed Set N=3', DT_ss_N_3),
    ('Decision Trees Expanded Set N=2', DT_es_N_2),
    ('Decision Trees Expanded Set N=3', DT_es_N_3),
    ('Random Forest Seed Set N=2', RF_ss_N_2),
    ('Random Forest Seed Set N=3', RF_ss_N_3),
    ('Random Forest Expanded Set N=2', RF_es_N_2),
    ('Random Forest Expanded Set N=3', RF_es_N_3),
]


### Print output to file

In [18]:
# Evaluate every registered pipeline on the test split and write a report.
# Mode "w" truncates any previous report (the original truncated the file and
# then re-opened it for appending), and the `with` block closes the file even
# if a model raises while predicting.
with open("results/output.txt", "w") as outfile:
    for name, fitted_model in models:
        print(name)  # progress line in the notebook output
        # Predict once and reuse the result for both metrics.
        predictions = fitted_model.predict(testing_text)
        accuracy = metrics.accuracy_score(testing_text_labels, predictions)
        confusion_matrix = metrics.confusion_matrix(testing_text_labels, predictions)
        print('========= {} Model Test Results ==========='.format(name), file=outfile)
        print(' ', file=outfile)
        print("Model Accuracy:" "\n", accuracy, file=outfile)
        print(' ', file=outfile)
        print("Confusion matrix:" "\n", confusion_matrix, file=outfile)
        print(' ', file=outfile)


Gaussian NB Seed Set N=2
Gaussian NB Seed Set N=3
Gaussian NB Expanded Set N=2
Gaussian NB Expanded Set N=3
LR Seed Set N=2
LR Seed Set N=3
LR Expanded Set N=2
LR Expanded Set N=3
SVM Seed Set N=2
SVM Seed Set N=3
SVM Expanded Set N=2
SVM Expanded Set N=3
SVM Batch Set N=2
SGD Seed Set N=2
SGD Seed Set N=3
SGD Expanded Set N=2
SGD Expanded Set N=3
LDA Seed Set N=2
LDA Seed Set N=3
LDA Expanded Set N=2
LDA Expanded Set N=3
Decision Trees Seed Set N=2
Decision Trees Seed Set N=3
Decision Trees Expanded Set N=2
Decision Trees Expanded Set N=3
Random Forest Seed Set N=2
Random Forest Seed Set N=3
Random Forest Expanded Set N=2
Random Forest Expanded Set N=3
