In [1]:
import numpy as np
import fasttext
import pandas as pd
import random
from resources.tokTT import CommentTokenizer as CT
from resources.basicIO import InputOutput as IO
from resources.basicIO import InputOutput as IO
from resources.filterLang import FilterLanguage as FL
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
import fasttext
import pandas as pd
from scipy import spatial
from scipy.spatial import distance
from sklearn import svm
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
#from sklearn.linear_model import LassoLars
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
import copy
import scipy

[nltk_data] Downloading package wordnet to C:\Users\AJAY
[nltk_data]     BISWAS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Preprocess Corpus and Random Sample

In [2]:
raw_corpus = IO.load_text('datasets/corpus.txt')
tokenized_corpus = CT.cleaned('datasets/corpus.txt')
IO.save_text('datasets/tokenized_corpus.txt',tokenized_corpus)

### Make Fasttext Unsupervised Model

In [3]:
model_2 = fasttext.train_unsupervised(input="datasets/tokenized_corpus.txt", lr=0.01, epoch=30, wordNgrams=2, dim=300)
model_2.save_model("models/ft_unsupervised_N_2.bin")

### Make dataframes

In [4]:
# corpus
df_dict = {'raw_comment': raw_corpus, 'tokenized_comment': tokenized_corpus}
df_corpus = pd.DataFrame(df_dict)
df_corpus.to_csv('datasets/corpus_data.csv', index=False)

In [5]:
# random sample
text = IO.load_csv_col('datasets/random_sample.csv', 'comment')
text_labels = IO.load_csv_col('datasets/random_sample.csv', 'label')
text_labels = list(map(str, map(int, text_labels)))
text_TK = [CT.tokenize(x) for x in text]

df_dict = {'raw_comment': text, 'tokenized_comment': text_TK, 'label': text_labels}
df_sample = pd.DataFrame(df_dict)
df_sample.to_csv('datasets/random_sample_data.csv', index=False)


In [6]:
# remove unnecessary comments
df_sample = df_sample.drop([x for x in range(len(df_sample)) if int(
    df_sample['label'][x]) != 0 and int(df_sample['label'][x]) != 1])


In [7]:
df_sample.tail()

Unnamed: 0,raw_comment,tokenized_comment,label
3358,Farmers are rights,farmer be right,0
3360,Spineless modi now this will boost confidence...,spineless modi now this will boost confidence ...,0
3362,These three laws are very beneficial to farmer...,these three law be very beneficial to farmer o...,1
3363,Farmers are not terrorist,farmer be not terrorist,0
3419,"We stand with Indian farmers, down with all o...",we stand with indian farmer down with all oppr...,0


In [8]:
df_sample['label'].value_counts()

0    1378
1    1040
Name: label, dtype: int64

### Train test split

In [9]:

X_train, X_test, y_train, y_test = train_test_split(df_sample['tokenized_comment'], 
                                                    df_sample['label'], test_size=0.2, 
                                                    random_state=42,
                                                    stratify=df_sample['label'])

In [10]:
print('X_train: ' ,len(X_train))
print('X_test: ' ,len(X_test))

X_train:  1934
X_test:  484


### Classification

In [11]:
class FastTextTransformer(BaseEstimator, TransformerMixin):
    """ Convert texts into their mean fastText vectors """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.stack([np.mean([self.model[w] for w in text.split()], 0) for text in X])


def classify(small_model, predictor, lines, Y):
    classifier = make_pipeline(
        FastTextTransformer(model=small_model),
        predictor
    ).fit(
        lines,
        Y
    )
    return classifier

### Classifiers

In [12]:
# Logistic Regression
LR_Normal = classify(model_2, LogisticRegression(random_state=1), X_train, y_train)
# SVM
SVM_Normal = classify(model_2, svm.SVC(), X_train, y_train)


### Models

In [13]:
models = []
models.append(('LR Normal N=2', LR_Normal))
models.append(('SVM Normal N=2', SVM_Normal))


### Output.txt

In [14]:
file = open('results/output_better.txt', 'w+')
file.close()

outfile = open("results/output_better.txt", "a")
for i, v in models:
    print(i)
    accuracy = metrics.accuracy_score(y_test, v.predict(X_test))
    confusion_matrix = metrics.confusion_matrix(y_test, v.predict(X_test))
    print('========= {} Model Test Results ==========='.format(i), file=outfile)
    print(' ', file=outfile)
    print("Model Accuracy:" "\n", accuracy, file=outfile)
    print(accuracy)
    print(' ', file=outfile)
    print("Confusion matrix:" "\n", confusion_matrix, file=outfile)
    print(' ', file=outfile)
outfile.close()


LR Normal N=2
0.731404958677686
SVM Normal N=2
0.731404958677686


### TF-IDF Classification

In [15]:
text_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', LogisticRegression(random_state=1)),
                    ])
text_clf = text_clf.fit(X_train, y_train)


In [16]:
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)


0.731404958677686

### Cross Validate

In [17]:
scores = cross_val_score(text_clf, df_sample.tokenized_comment, df_sample.label, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.74 (+/- 0.05)


### Seed Set + AVG + Batch + Random Sampling

In [18]:
X_seed, X_expand, y_seed, y_expand = train_test_split(X_train,
                                                    y_train, test_size=0.99,
                                                    random_state=41,
                                                    stratify=y_train)


In [19]:
print('X_seed: ',len(X_seed))
print('X_expand: ',len(X_expand))

X_seed:  19
X_expand:  1915


In [20]:
y_seed.value_counts()

0    11
1     8
Name: label, dtype: int64

### Cosine Similarity and Nearest Neighbor Scores

In [21]:
def score(model, line, k):
    # words contains all the words in the corpus
    lst1 = model.get_nearest_neighbors(line, k)
    v1 = []
    l1 = [x[1] for x in lst1]
    l10 = [x[0] for x in lst1]
    for i in range(len(model.words)):
        try:
            v1.append(l10[l1.index(model.words[i])])
        except:
            v1.append(0)
    return v1

def NN(model, line, K):
        return model.get_nearest_neighbors(line, k=K)

def get_NN(model, lines_TK, k):
    scores = []
    for line in lines_TK:
        scores.append(score(model, line, k))
    return scores

def cos_sim(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if(norm_a * norm_b == 0.0):
        return dot_product / (norm_a * norm_b + 0.001)
    return dot_product / (norm_a * norm_b)

def sim(x, y, sim_type):
    if(sim_type == 'cosine_sim'):
        return cos_sim(x, y)

# find similarity score matrix between A and B
# pass transpose of B
def sim_matrix(A, B, sim_type):
    m, p = A.shape
    p, n = B.shape
    C = np.zeros((m, n))
    for i in range(m):
        for j in range(n):
            C[i][j] = sim(A[i, :], B[:, j], sim_type)
    return C


### Expansion Code (Random Sampling)

In [22]:
def Expand_R(model, seed_set_TK, seed_set_label, expansion_TK, expansion_text_labels, batch_size, count, k, random_rate):
    seed_TK = copy.deepcopy(seed_set_TK)
    seed_labels = copy.deepcopy(seed_set_label)
    M = np.arange(0, count, batch_size)
    cnt = int(random_rate * batch_size)
    count2 = [0]

    for i in range(1, len(M)):

        print(M[i], end=' ')

        exp_TK = expansion_TK[M[i-1]:M[i]]
        exp_labels = expansion_text_labels[M[i-1]:M[i]]

        seed_NN = get_NN(model, seed_TK, k)
        exp_NN = get_NN(model, exp_TK, k)

        A = np.array(seed_NN)
        B = np.array(exp_NN).T
        C = sim_matrix(A, B, "cosine_sim")

        Y_ind = np.argmax(C, axis=0)
        Y = [seed_labels[x] for x in Y_ind]

        if(random_rate == 0.0):
            # no random sampling
            pass
        else:
            # random sampling
            Y_r = random.sample(range(0,len(Y)), cnt)
            for j in Y_r:
                if(Y[j] == exp_labels[j]):
                    count2[0] += 1
                Y[j] = exp_labels[j]

        seed_labels.extend(Y)
        seed_TK.extend(exp_TK)

    return seed_TK, seed_labels, count2


In [23]:
X_train_exp, y_train_exp, count2 = Expand_R(model_2, 
                                  X_seed.to_list(), 
                                  y_seed.to_list(), 
                                  X_expand.to_list(), 
                                  y_expand.to_list(),
                                  20,
                                  len(y_expand),
                                  20,
                                  0.2
                                  )

20 40 60 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360 380 400 420 440 460 480 500 520 540 560 580 600 620 640 660 680 700 720 740 760 780 800 820 840 860 880 900 920 940 960 980 1000 1020 1040 1060 1080 1100 1120 1140 1160 1180 1200 1220 1240 1260 1280 1300 1320 1340 1360 1380 1400 1420 1440 1460 1480 1500 1520 1540 1560 1580 1600 1620 1640 1660 1680 1700 1720 1740 1760 1780 1800 1820 1840 1860 1880 1900 

In [24]:
print(count2[0])

250


In [25]:
# Logistic Regression
LR_Normal_e = classify(model_2, LogisticRegression(
    random_state=1), X_train_exp, y_train_exp)
# SVM
SVM_Normal_e = classify(model_2, svm.SVC(), X_train_exp, y_train_exp)


models_e = []
models_e.append(('LR Normal N=2', LR_Normal_e))
models_e.append(('SVM Normal N=2', SVM_Normal_e))


In [26]:
file = open('results/output_better_exp.txt', 'w+')
file.close()

outfile = open("results/output_better_exp.txt", "a")
for i, v in models_e:
    print(i)
    accuracy = metrics.accuracy_score(y_test, v.predict(X_test))
    confusion_matrix = metrics.confusion_matrix(y_test, v.predict(X_test))
    print('========= {} Model Test Results ==========='.format(i), file=outfile)
    print(' ', file=outfile)
    print("Model Accuracy:" "\n", accuracy, file=outfile)
    print(accuracy)
    print(' ', file=outfile)
    print("Confusion matrix:" "\n", confusion_matrix, file=outfile)
    print(' ', file=outfile)
outfile.close()


LR Normal N=2
0.6756198347107438
SVM Normal N=2
0.6363636363636364


### Expansion Code (Active Learning)

In [27]:
def Expand_A(model, seed_set_TK, seed_set_label, expansion_TK, 
             expansion_text_labels, batch_size, count, k, sim_threshold):
    
    seed_TK = copy.deepcopy(seed_set_TK)
    seed_labels = copy.deepcopy(seed_set_label)
    M = np.arange(0, count, batch_size)

    count2 = [0]

    for i in range(1, len(M)):

        print(M[i], end=' ')

        exp_TK = expansion_TK[M[i-1]:M[i]]
        exp_labels = expansion_text_labels[M[i-1]:M[i]]

        seed_NN = get_NN(model, seed_TK, k)
        exp_NN = get_NN(model, exp_TK, k)

        A = np.array(seed_NN)
        B = np.array(exp_NN).T
        C = sim_matrix(A, B, "cosine_sim")

        Y_ind = np.argmax(C, axis=0)
        Y_val = np.amax(C, axis=0)

        #Y = [seed_labels[x] if y >= sim_threshold else exp_labels[x] for x,y in zip(Y_ind, Y_val)]
        Y = []
        for ii in range(len(Y_ind)):
            if(Y_val[ii] >= sim_threshold):
                Y.append(seed_labels[Y_ind[ii]])
            else:
                Y.append(exp_labels[ii])
                count2[0] += 1

        seed_labels.extend(Y)
        seed_TK.extend(exp_TK)

    return seed_TK, seed_labels, count2


In [28]:
X_train_exp_a, y_train_exp_a, count2 = Expand_A(model_2,
                                            X_seed.to_list(),
                                            y_seed.to_list(),
                                            X_expand.to_list(),
                                            y_expand.to_list(),
                                            20,
                                            len(y_expand),
                                            20,
                                            0.2
                                            )

20 40 60 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360 380 400 420 440 460 480 500 520 540 560 580 600 620 640 660 680 700 720 740 760 780 800 820 840 860 880 900 920 940 960 980 1000 1020 1040 1060 1080 1100 1120 1140 1160 1180 1200 1220 1240 1260 1280 1300 1320 1340 1360 1380 1400 1420 1440 1460 1480 1500 1520 1540 1560 1580 1600 1620 1640 1660 1680 1700 1720 1740 1760 1780 1800 1820 1840 1860 1880 1900 

In [29]:
print(count2)

[14]


In [30]:
# Logistic Regression
LR_Normal_a = classify(model_2, LogisticRegression(
    random_state=1), X_train_exp_a, y_train_exp_a)
# SVM
SVM_Normal_a = classify(model_2, svm.SVC(), X_train_exp_a, y_train_exp_a)


models_a = []
models_a.append(('LR Normal N=2', LR_Normal_a))
models_a.append(('SVM Normal N=2', SVM_Normal_a))


In [31]:
file = open('results/output_better_active.txt', 'w+')
file.close()

outfile = open("results/output_better_active.txt", "a")
for i, v in models_a:
    print(i)
    accuracy = metrics.accuracy_score(y_test, v.predict(X_test))
    confusion_matrix = metrics.confusion_matrix(y_test, v.predict(X_test))
    print('========= {} Model Test Results ==========='.format(i), file=outfile)
    print(' ', file=outfile)
    print("Model Accuracy:" "\n", accuracy, file=outfile)
    print(accuracy)
    print(' ', file=outfile)
    print("Confusion matrix:" "\n", confusion_matrix, file=outfile)
    print(' ', file=outfile)
outfile.close()


LR Normal N=2
0.5888429752066116
SVM Normal N=2
0.5867768595041323


### Expansion Code (Uncertainty Sampling)

In [39]:
def Expand_U(model, algorithm, seed_set_TK, seed_set_label, expansion_TK,
             expansion_text_labels, batch_size, count, max_threshold):

    seed_TK = copy.deepcopy(seed_set_TK)
    seed_labels = copy.deepcopy(seed_set_label)
    M = np.arange(0, count, batch_size)

    # exp_TK_certain will be the list of comments having high proba score
    exp_TK_certain = []
    exp_TK_certain_labels = []

    for i in range(1, len(M)):

        print(M[i], end=' ')

        exp_TK = expansion_TK[M[i-1]:M[i]]
        exp_labels = expansion_text_labels[M[i-1]:M[i]]

        # take A as training and B as test and store probs in C
        v = classify(model, algorithm, seed_TK, seed_labels)
        C = v.predict_proba(exp_TK)
        C_abs_diff = [(abs(x[0] - x[1])) for x in C]

        # do sorting
        # [x for _, x in sorted(zip(Y, X))]
        exp_labels = [x for _, x in sorted(zip(C_abs_diff, exp_labels))]
        exp_TK = [x for _, x in sorted(zip(C_abs_diff, exp_TK))]
        C_sorted = [x for _, x in sorted(zip(C_abs_diff, C))]

        Y_uncertain = []
        exp_TK_uncertain = []
        for j in range(len(C_sorted)):
            max_value = max(C_sorted[j])
            max_index = str(np.argmax(C_sorted[j]))

            if(max_value <= max_threshold):
                Y_uncertain.append(max_index)
                exp_TK_uncertain.append(exp_TK[j])
            else:
                exp_TK_certain.append(exp_TK[j])
                exp_TK_certain_labels.append(exp_labels[j])

        seed_labels.extend(Y_uncertain)
        seed_TK.extend(exp_TK_uncertain)

    return seed_TK, seed_labels, exp_TK_certain, exp_TK_certain_labels


In [40]:
X_train_exp_u, y_train_exp_u, X_certain_exp_u, y_certain_exp_u = Expand_U(model_2,
                                                              LogisticRegression(random_state=1),
                                                                X_seed.to_list(),
                                                                y_seed.to_list(),
                                                                X_expand.to_list(),
                                                                y_expand.to_list(),
                                                                20,
                                                                len(y_expand),
                                                                0.8
                                                                )


20 40 60 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360 380 400 420 440 460 480 500 520 540 560 580 600 620 640 660 680 700 720 740 760 780 800 820 840 860 880 900 920 940 960 980 1000 1020 1040 1060 1080 1100 1120 1140 1160 1180 1200 1220 1240 1260 1280 1300 1320 1340 1360 1380 1400 1420 1440 1460 1480 1500 1520 1540 1560 1580 1600 1620 1640 1660 1680 1700 1720 1740 1760 1780 1800 1820 1840 1860 1880 1900 

In [47]:
# Logistic Regression
LR_Normal_u = classify(model_2, LogisticRegression(
    random_state=1), X_train_exp_u, y_train_exp_u)
# SVM
SVM_Normal_u = classify(model_2, svm.SVC(), X_train_exp_u, y_train_exp_u)

models_u = []
models_u.append(('LR Normal N=2', LR_Normal_u))
models_u.append(('SVM Normal N=2', SVM_Normal_u))


In [48]:
file = open('results/output_better_uncertain.txt', 'w+')
file.close()

outfile = open("results/output_better_uncertain.txt", "a")
for i, v in models_u:
    print(i)
    accuracy = metrics.accuracy_score(y_test, v.predict(X_test))
    confusion_matrix = metrics.confusion_matrix(y_test, v.predict(X_test))
    print('========= {} Model Test Results ==========='.format(i), file=outfile)
    print(' ', file=outfile)
    print("Model Accuracy:" "\n", accuracy, file=outfile)
    print(accuracy)
    print(' ', file=outfile)
    print("Confusion matrix:" "\n", confusion_matrix, file=outfile)
    print(' ', file=outfile)
outfile.close()


LR Normal N=2
0.5702479338842975
SVM Normal N=2
0.5723140495867769
