In [95]:
import numpy as np
import fasttext
import pandas as pd
from resources.tokTT import CommentTokenizer as CT
from resources.basicIO import InputOutput as IO
from resources.basicIO import InputOutput as IO
from resources.filterLang import FilterLanguage as FL
from active_expansion.fasttext_batch_avg import Expander
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
import fasttext
import pandas as pd
from scipy import spatial
from scipy.spatial import distance
from sklearn import svm
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
#from sklearn.linear_model import LassoLars
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfTransformer


### Preprocess Corpus and Random Sample

In [2]:
# Read the unprocessed corpus and build its tokenized counterpart.
raw_corpus = IO.load_text('datasets/corpus.txt')
# NOTE(review): CT.cleaned is given the file path rather than raw_corpus, so it
# presumably re-reads the file itself -- confirm against resources.tokTT.
tokenized_corpus = CT.cleaned('datasets/corpus.txt')
# Persist the tokenized text; the fastText training cell reads this file by path.
IO.save_text('datasets/tokenized_corpus.txt',tokenized_corpus)

### Make Fasttext Unsupervised Model

In [3]:
# Train 100-dimensional unsupervised fastText embeddings on the tokenized corpus
# file, then save the binary model so it can be reloaded without retraining.
# NOTE(review): no seed or thread count is passed, so the learned vectors may
# differ between runs -- confirm whether reproducibility matters here.
# NOTE(review): wordNgrams looks like a supervised-mode option in fastText;
# verify that it has any effect for train_unsupervised.
model_2 = fasttext.train_unsupervised(input="datasets/tokenized_corpus.txt", lr=0.01, epoch=10, wordNgrams=2, dim=100)
model_2.save_model("models/ft_unsupervised_N_2.bin")


### Make dataframes

In [4]:
# corpus: one row per entry, raw text next to its tokenized form
# NOTE(review): pandas needs both columns to have the same length; this assumes
# CT.cleaned yields exactly one entry per entry of raw_corpus -- confirm.
df_dict = {'raw_comment': raw_corpus, 'tokenized_comment': tokenized_corpus}
df_corpus = pd.DataFrame(df_dict)
# index=False keeps the row index out of the CSV
df_corpus.to_csv('datasets/corpus_data.csv', index=False)

In [114]:
# random sample: load comments and labels, normalise labels to integer strings,
# tokenize every comment, then store everything side by side
text = IO.load_csv_col('datasets/random_sample.csv', 'comment')
text_labels = IO.load_csv_col('datasets/random_sample.csv', 'label')
text_labels = [str(int(raw_label)) for raw_label in text_labels]
text_TK = list(map(CT.tokenize, text))

df_dict = dict(raw_comment=text, tokenized_comment=text_TK, label=text_labels)
df_sample = pd.DataFrame(data=df_dict)
df_sample.to_csv('datasets/random_sample_data.csv', index=False)


In [115]:
# Work with the first 1500 rows only (explicit positional slice).
df_sample = df_sample.iloc[:1500]

In [116]:
# Display the last rows of the truncated sample as a quick sanity check.
df_sample.tail()

Unnamed: 0,raw_comment,tokenized_comment,label
1495,This govt is pro corporate in every sense..and...,this govt be pro corporate in every sense and ...,0
1496,"Not to stir up a hornet's nest here, but can s...",not to stir up a hornet be nest here but can s...,1
1497,Lets make 1 million likes on this video for sa...,let make 1 million like on this video for samd...,2
1498,This is actually a great video where you are t...,this be actually a great video where you be te...,2
1499,They all are not farmers ...I know how behaves...,they all be not farmer i know how behave farme...,1


In [118]:
# Summarise the index range, column dtypes and non-null counts.
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   raw_comment        1500 non-null   object
 1   tokenized_comment  1500 non-null   object
 2   label              1500 non-null   object
dtypes: object(3)
memory usage: 35.3+ KB


In [120]:
# Drop every row whose label is 2.
# A boolean mask does not rely on the frame holding exactly the index labels
# 0..1499, so it works for any sample size and can be re-run safely; looking
# labels up one by one raises KeyError once a row has already been removed.
# The surviving rows keep their original index labels.
df_sample = df_sample[df_sample['label'].astype(int) != 2]

In [121]:
# Display the last rows again to confirm that label-2 rows are gone.
df_sample.tail()

Unnamed: 0,raw_comment,tokenized_comment,label
1492,Rakesh diker congress ka chamcha,rakesh diker congress ka chamcha,1
1493,Farmers of Punjab get free water and electrici...,farmer of punjab get free water and electricit...,1
1495,This govt is pro corporate in every sense..and...,this govt be pro corporate in every sense and ...,0
1496,"Not to stir up a hornet's nest here, but can s...",not to stir up a hornet be nest here but can s...,1
1499,They all are not farmers ...I know how behaves...,they all be not farmer i know how behave farme...,1


### Train test split

In [136]:

# Hold out 20% of the filtered sample for testing. Features are the tokenized
# comments and targets are the string labels. stratify keeps the label
# proportions equal in both splits; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(df_sample['tokenized_comment'], 
                                                    df_sample['label'], test_size=0.2, 
                                                    random_state=42,
                                                    stratify=df_sample['label'])

### Use Active Learning on Training Data

### Seed Set Expansion

### Classification

In [137]:
class FastTextTransformer(BaseEstimator, TransformerMixin):
    """Convert whitespace-tokenized texts into their mean fastText word vectors.

    Parameters
    ----------
    model : object
        Word-vector model. It is indexed as ``model[word]`` for every token
        and, for texts without tokens, asked for ``model.get_dimension()``.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """Nothing to learn here; return ``self`` so the step fits in a pipeline."""
        return self

    def transform(self, X):
        """Return one row per text in ``X``: the mean of its token vectors.

        A text without tokens (empty or whitespace-only) maps to a zero vector.
        Averaging an empty list would yield a scalar NaN, which cannot be
        stacked with the vectors of the other texts.
        """
        rows = []
        for text in X:
            tokens = text.split()
            if tokens:
                rows.append(np.mean([self.model[w] for w in tokens], 0))
            else:
                # float32 zeros never widen the stacked array on their own.
                rows.append(np.zeros(self.model.get_dimension(), dtype=np.float32))
        return np.stack(rows)


def classify(small_model, predictor, lines, Y):
    """Fit and return a pipeline that embeds texts and then applies ``predictor``.

    ``small_model`` is handed to ``FastTextTransformer`` as the embedding
    model; ``lines`` are the training texts and ``Y`` their labels.
    """
    embedding_step = FastTextTransformer(model=small_model)
    pipeline = make_pipeline(embedding_step, predictor)
    # Pipeline.fit returns the pipeline itself, so the fitted object is returned.
    return pipeline.fit(lines, Y)

### Classifiers

In [138]:
# Logistic Regression on mean fastText vectors from model_2 (random_state fixed at 1)
LR_Normal = classify(model_2, LogisticRegression(random_state=1), X_train, y_train)
# SVM (SVC with its default settings) on the same embeddings
SVM_Normal = classify(model_2, svm.SVC(), X_train, y_train)


### Models

In [139]:
# (display name, fitted pipeline) pairs, in the order they are evaluated
models = [
    ('LR Normal N=2', LR_Normal),
    ('SVM Normal N=2', SVM_Normal),
]


### Output.txt

In [148]:
# Write each model's test accuracy and confusion matrix to the results file.
# Mode 'w' creates or truncates the file in one step, and the context manager
# closes the handle even if prediction or a metric raises part-way through.
with open('results/output_better.txt', 'w') as outfile:
    for i, v in models:
        print(i)
        # Predict once and reuse the labels for both metrics.
        predictions = v.predict(X_test)
        accuracy = metrics.accuracy_score(y_test, predictions)
        confusion_matrix = metrics.confusion_matrix(y_test, predictions)
        print(confusion_matrix.shape)
        print('========= {} Model Test Results ==========='.format(i), file=outfile)
        print(' ', file=outfile)
        print("Model Accuracy:" "\n", accuracy, file=outfile)
        print(' ', file=outfile)
        print("Confusion matrix:" "\n", confusion_matrix, file=outfile)
        print(' ', file=outfile)


LR Normal N=2
(2, 2)
SVM Normal N=2
(2, 2)


In [149]:
# Bag-of-words baseline: token counts -> tf-idf weights -> logistic regression.
baseline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(random_state=1)),
]
# Pipeline.fit returns the pipeline itself, so text_clf is the fitted object.
text_clf = Pipeline(baseline_steps).fit(X_train, y_train)


In [150]:
# Accuracy of the tf-idf baseline: share of test labels predicted correctly.
predicted = text_clf.predict(X_test)
(predicted == y_test).mean()


0.722007722007722