In [18]:
import numpy as np
import fasttext
import pandas as pd
from resources.tokTT import CommentTokenizer as CT
from resources.basicIO import InputOutput as IO
from resources.basicIO import InputOutput as IO
from resources.filterLang import FilterLanguage as FL
from active_expansion.fasttext_batch_avg import Expander
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
import fasttext
import pandas as pd
from scipy import spatial
from scipy.spatial import distance
from sklearn import svm
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
#from sklearn.linear_model import LassoLars
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score


### Preprocess Corpus and Random Sample

In [19]:
# Read the raw corpus, build its cleaned/tokenized counterpart from the same
# file, and store the tokenized text where the fastText training cell reads it.
corpus_path = 'datasets/corpus.txt'
raw_corpus = IO.load_text(corpus_path)
tokenized_corpus = CT.cleaned(corpus_path)
IO.save_text('datasets/tokenized_corpus.txt', tokenized_corpus)

### Make Fasttext Unsupervised Model

In [20]:
# Train unsupervised fastText embeddings on the tokenized corpus
# (300-dimensional vectors, word n-grams up to 2, 30 epochs) and save the model.
model_2 = fasttext.train_unsupervised(
    input="datasets/tokenized_corpus.txt",
    lr=0.01,
    epoch=30,
    wordNgrams=2,
    dim=300,
)
model_2.save_model("models/ft_unsupervised_N_2.bin")


### Make dataframes

In [21]:
# Corpus table: each raw comment next to its tokenized form, saved as CSV.
df_dict = {
    'raw_comment': raw_corpus,
    'tokenized_comment': tokenized_corpus,
}
df_corpus = pd.DataFrame(df_dict)
df_corpus.to_csv('datasets/corpus_data.csv', index=False)

In [22]:
# Labelled random sample: load comments and labels, normalise each label to the
# string form of its integer value, and tokenize every comment.
sample_path = 'datasets/random_sample.csv'
text = IO.load_csv_col(sample_path, 'comment')
text_labels = [str(int(label)) for label in IO.load_csv_col(sample_path, 'label')]
text_TK = list(map(CT.tokenize, text))

# Assemble the sample table and write it out as CSV.
df_dict = {
    'raw_comment': text,
    'tokenized_comment': text_TK,
    'label': text_labels,
}
df_sample = pd.DataFrame(df_dict)
df_sample.to_csv('datasets/random_sample_data.csv', index=False)


In [23]:
# Keep only the rows used for classification: comments labelled 2 or 3 are
# discarded. The boolean mask is aligned on the index, so the filter does not
# assume a 0..n-1 index and the cell can be re-run safely (the previous version
# looked up positions from range(len(df_sample)) as index labels, which raises
# KeyError once rows have been dropped).
df_sample = df_sample[~df_sample['label'].astype(int).isin([2, 3])]


In [24]:
# Inspect the last rows of the filtered sample as a sanity check.
df_sample.tail()

Unnamed: 0,raw_comment,tokenized_comment,label
2396,Farmers know what is good for them...govt jitn...,farmer know what be good for them govt jitna j...,0
2398,Again I think disputes are really hard to win ...,again i think dispute be really hard to win fo...,0
2399,Only thing which i got to know that Farmers sh...,only thing which i get to know that farmer sho...,1
2400,"When the farmers were happy with this bill, yo...",when the farmer be happy with this bill you ge...,1
2401,fight for farmers!,fight for farmer,0


In [25]:
# Class balance of the remaining labels.
df_sample['label'].value_counts()

0    1003
1     778
Name: label, dtype: int64

### Train test split

In [26]:

# Hold out 20% of the sample for testing; the split is stratified on the label
# and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_sample['tokenized_comment'],
    df_sample['label'],
    test_size=0.2,
    random_state=42,
    stratify=df_sample['label'],
)

In [27]:
# Report how many comments ended up in each split.
print('X_train: ', len(X_train))
print('X_test: ', len(X_test))

X_train:  1424
X_test:  357


### Use Active Learning on Training Data

### Seed Set Expansion

### Classification

In [28]:
class FastTextTransformer(BaseEstimator, TransformerMixin):
    """Convert texts into their mean fastText vectors.

    Each text is split on whitespace, every token is looked up with
    ``model[token]``, and the token vectors are averaged into a single row.

    Parameters
    ----------
    model : object
        Word-vector lookup supporting ``model[word]``. Its
        ``get_dimension()`` method (provided by fastText models) is only
        consulted when every text in a batch is empty.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the transformer can sit in a pipeline."""
        return self

    def _mean_vector(self, text):
        """Return the mean token vector of ``text``, or ``None`` if it has no tokens."""
        tokens = text.split()
        if not tokens:
            return None
        return np.mean([self.model[token] for token in tokens], axis=0)

    def transform(self, X):
        """Return one mean vector per text in ``X``, stacked row-wise.

        A text with no tokens (empty or whitespace-only) is mapped to a zero
        vector. Averaging an empty token list would otherwise yield a scalar
        NaN, which either breaks ``np.stack`` or feeds NaNs to the estimator.

        Raises
        ------
        ValueError
            If ``X`` contains no texts (raised by ``np.stack``).
        """
        doc_vectors = [self._mean_vector(text) for text in X]
        if any(vec is None for vec in doc_vectors):
            # Prefer the shape/dtype of a real vector from this batch; only ask
            # the model for its dimension when no text produced a vector.
            template = next((vec for vec in doc_vectors if vec is not None), None)
            if template is not None:
                filler = np.zeros_like(template)
            else:
                filler = np.zeros(self.model.get_dimension())
            doc_vectors = [filler if vec is None else vec for vec in doc_vectors]
        return np.stack(doc_vectors)


def classify(small_model, predictor, lines, Y):
    """Fit ``predictor`` on mean fastText vectors of ``lines``.

    Builds a pipeline of ``FastTextTransformer(model=small_model)`` followed
    by ``predictor``, fits it on ``lines`` with targets ``Y``, and returns the
    fitted pipeline.
    """
    embedder = FastTextTransformer(model=small_model)
    pipeline = make_pipeline(embedder, predictor)
    return pipeline.fit(lines, Y)

### Classifiers

In [29]:
# Both classifiers are trained on mean fastText vectors taken from model_2.
# Logistic regression (seeded)
LR_Normal = classify(
    small_model=model_2,
    predictor=LogisticRegression(random_state=1),
    lines=X_train,
    Y=y_train,
)
# Support vector machine with scikit-learn's default settings
SVM_Normal = classify(
    small_model=model_2,
    predictor=svm.SVC(),
    lines=X_train,
    Y=y_train,
)


### Models

In [30]:
# (display name, fitted pipeline) pairs consumed by the evaluation cell.
models = [
    ('LR Normal N=2', LR_Normal),
    ('SVM Normal N=2', SVM_Normal),
]


### Evaluate on the test set and write results to `results/output_better.txt`

In [31]:
# Evaluate every fitted model on the held-out split. The model name and its
# accuracy are echoed to the notebook; accuracy and the confusion matrix are
# written to the report file.
# Mode 'w' truncates any previous report, and the `with` block closes the
# handle even if a prediction or metric raises.
with open('results/output_better.txt', 'w') as outfile:
    for model_name, fitted_model in models:
        print(model_name)
        # Predict once and reuse the result for both metrics.
        y_pred = fitted_model.predict(X_test)
        accuracy = metrics.accuracy_score(y_test, y_pred)
        confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
        print('========= {} Model Test Results ==========='.format(model_name), file=outfile)
        print(' ', file=outfile)
        print("Model Accuracy:" "\n", accuracy, file=outfile)
        print(accuracy)
        print(' ', file=outfile)
        print("Confusion matrix:" "\n", confusion_matrix, file=outfile)
        print(' ', file=outfile)


LR Normal N=2
0.7310924369747899
SVM Normal N=2
0.7647058823529411


### TF-IDF Classification

In [32]:
# Bag-of-words baseline: token counts -> TF-IDF weighting -> logistic
# regression, fitted on the same training split as the fastText models.
tfidf_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(random_state=1)),
]
text_clf = Pipeline(tfidf_steps).fit(X_train, y_train)


In [33]:
# Test accuracy of the TF-IDF baseline: the fraction of predictions that match
# the true labels.
predicted = text_clf.predict(X_test)
(predicted == y_test).mean()


0.7450980392156863

### Cross Validate

In [34]:
# 10-fold cross-validation of the TF-IDF pipeline over the whole filtered
# sample; prints the mean score and twice the standard deviation across folds.
scores = cross_val_score(
    text_clf,
    df_sample['tokenized_comment'],
    df_sample['label'],
    cv=10,
)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.73 (+/- 0.07)
