In [42]:
import numpy as np
import fasttext
import pandas as pd
import random
from resources.tokTT import CommentTokenizer as CT
from resources.basicIO import InputOutput as IO
from resources.basicIO import InputOutput as IO
from resources.filterLang import FilterLanguage as FL
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
import fasttext
import pandas as pd
from scipy import spatial
from scipy.spatial import distance
from sklearn import svm
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
#from sklearn.linear_model import LassoLars
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import copy
import scipy


### Preprocess Corpus and Random Sample

In [2]:
raw_corpus = IO.load_text('datasets/corpus.txt')
tokenized_corpus = CT.cleaned('datasets/corpus.txt')
IO.save_text('datasets/tokenized_corpus.txt',tokenized_corpus)

### Make Fasttext Unsupervised Model

In [3]:
model_2 = fasttext.train_unsupervised(
    input="datasets/tokenized_corpus.txt", lr=0.01, epoch=30, wordNgrams=2, dim=300)
model_2.save_model("models/ft_unsupervised_N_2.bin")


### Make dataframes

In [4]:
# corpus
df_dict = {'raw_comment': raw_corpus, 'tokenized_comment': tokenized_corpus}
df_corpus = pd.DataFrame(df_dict)
df_corpus.to_csv('datasets/corpus_data.csv', index=False)

# random sample
text = IO.load_csv_col('datasets/random_sample.csv', 'comment')
text_labels = IO.load_csv_col('datasets/random_sample.csv', 'label')
text_labels = list(map(str, map(int, text_labels)))
text_TK = [CT.tokenize(x) for x in text]

df_dict = {'raw_comment': text,
           'tokenized_comment': text_TK, 'label': text_labels}
df_sample = pd.DataFrame(df_dict)
df_sample.to_csv('datasets/random_sample_data.csv', index=False)


### Remove Unnecessary Comments

In [5]:
# remove unnecessary comments
df_sample = df_sample.drop([x for x in range(len(df_sample)) if int(
    df_sample['label'][x]) != 0 and int(df_sample['label'][x]) != 1])

In [6]:
df_sample.tail()


Unnamed: 0,raw_comment,tokenized_comment,label
3448,These new agri. laws are not feasible in india...,these new agri law be not feasible in india fi...,0
3450,"If this is Farmer protest , why we see only pu...",if this be farmer protest why we see only punj...,1
3451,UP police is trying to clear their image in th...,up police be try to clear their image in the w...,0
3452,Support farmers,support farmer,0
3453,"I agree to many points you made, but MSP is a ...",i agree to many point you make but msp be a do...,1


In [7]:
df_sample['label'].value_counts()


0    1408
1    1074
Name: label, dtype: int64

### Train test split

In [8]:

X_train, X_test, y_train, y_test = train_test_split(df_sample['tokenized_comment'],
                                                    df_sample['label'], test_size=0.2,
                                                    random_state=42,
                                                    stratify=df_sample['label'])


In [9]:
print('X_train: ' ,len(X_train))
print('X_test: ' ,len(X_test))

X_train:  1985
X_test:  497


### Classification

In [10]:
class FastTextTransformer(BaseEstimator, TransformerMixin):
    """ Convert texts into their mean fastText vectors """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.stack([np.mean([self.model[w] for w in text.split()], 0) for text in X])


def classify(small_model, predictor, lines, Y):
    classifier = make_pipeline(
        FastTextTransformer(model=small_model),
        predictor
    ).fit(
        lines,
        Y
    )
    return classifier


### Classifier

In [11]:
# Logistic Regression
LR_Normal = classify(model_2, LogisticRegression(
    random_state=1), X_train, y_train)
# SVM
SVM_Normal = classify(model_2, svm.SVC(), X_train, y_train)

models = []
models.append(('LR Normal N=2', LR_Normal))
models.append(('SVM Normal N=2', SVM_Normal))


In [12]:
file = open('results/output_better.txt', 'w+')
file.close()

outfile = open("results/output_better.txt", "a")
for i, v in models:
    print(i)
    accuracy = metrics.accuracy_score(y_test, v.predict(X_test))
    confusion_matrix = metrics.confusion_matrix(y_test, v.predict(X_test))
    print('========= {} Model Test Results ==========='.format(i), file=outfile)
    print(' ', file=outfile)
    print("Model Accuracy:" "\n", accuracy, file=outfile)
    print(accuracy)
    print(' ', file=outfile)
    print("Confusion matrix:" "\n", confusion_matrix, file=outfile)
    print(' ', file=outfile)
outfile.close()


LR Normal N=2
0.7505030181086519
SVM Normal N=2
0.7484909456740443


### Active Learning

In [13]:
X_seed, X_expand, y_seed, y_expand = train_test_split(X_train,
                                                      y_train, test_size=0.99,
                                                      random_state=41,
                                                      stratify=y_train)


In [14]:
print('X_seed: ',len(X_seed))
print('X_expand: ',len(X_expand))

X_seed:  19
X_expand:  1966


In [15]:
y_seed.value_counts()


0    11
1     8
Name: label, dtype: int64

### Cosine Similarity And Nearest Neighbors

In [16]:
def score(model, line, k):
    # words contains all the words in the corpus
    lst1 = model.get_nearest_neighbors(line, k)
    v1 = []
    l1 = [x[1] for x in lst1]
    l10 = [x[0] for x in lst1]
    for i in range(len(model.words)):
        try:
            v1.append(l10[l1.index(model.words[i])])
        except:
            v1.append(0)
    return v1


def NN(model, line, K):
    return model.get_nearest_neighbors(line, k=K)


def get_NN(model, lines_TK, k):
    scores = []
    for line in lines_TK:
        scores.append(score(model, line, k))
    return scores


def cos_sim(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if(norm_a * norm_b == 0.0):
        return dot_product / (norm_a * norm_b + 0.001)
    return dot_product / (norm_a * norm_b)


def sim(x, y, sim_type):
    if(sim_type == 'cosine_sim'):
        return cos_sim(x, y)

# find similarity score matrix between A and B
# pass transpose of B


def sim_matrix(A, B, sim_type):
    m, p = A.shape
    p, n = B.shape
    C = np.zeros((m, n))
    for i in range(m):
        for j in range(n):
            C[i][j] = sim(A[i, :], B[:, j], sim_type)
    return C


### Expansion Code (Random Sampling)

In [46]:
def Expand_R(model: fasttext.FastText._FastText, 
             seed_set_tokenised:list, 
             seed_set_label:list, 
             expansion_tokenised:list, 
             expansion_set_labels:list, 
             batch_size:int, 
             k_neighbors:int, 
             random_rate:float):
             
    seed_TK = copy.deepcopy(seed_set_tokenised)
    seed_labels = copy.deepcopy(seed_set_label)
    count = len(expansion_set_labels)
    M = np.arange(0, count, batch_size)
    cnt = int(random_rate * batch_size)
    count2 = [0]
    
    expansion_predicted_labels = []
    expansion_true_labels = []
    expansion_mse = []

    for i in range(1, len(M)):

        print(M[i], end=' ')

        exp_TK = expansion_tokenised[M[i-1]:M[i]]
        exp_labels = expansion_set_labels[M[i-1]:M[i]]

        seed_NN = get_NN(model, seed_TK, k_neighbors)
        exp_NN = get_NN(model, exp_TK, k_neighbors)

        A = np.array(seed_NN)
        B = np.array(exp_NN).T
        C = sim_matrix(A, B, "cosine_sim")

        Y_ind = np.argmax(C, axis=0)
        Y = [seed_labels[x] for x in Y_ind]

        if(random_rate == 0.0):
            # no random sampling
            pass
        else:
            # random sampling
            Y_r = random.sample(range(0,len(Y)), cnt)
            for j in Y_r:
                if(Y[j] == exp_labels[j]):
                    count2[0] += 1
                Y[j] = exp_labels[j]

        expansion_predicted_labels.extend(Y)
        expansion_true_labels.extend(exp_labels)

        expansion_mse.append(mean_squared_error(exp_labels, Y))

        seed_labels.extend(Y)
        seed_TK.extend(exp_TK)

    return seed_TK, seed_labels, expansion_true_labels, expansion_predicted_labels, expansion_mse


In [47]:
X_train_exp, y_train_exp, et,ep, expansion_mse = Expand_R(model_2,
                                            X_seed.to_list(),
                                            y_seed.to_list(),
                                            X_expand.to_list(),
                                            y_expand.to_list(),
                                            800,
                                            len(y_expand),
                                            20,
                                            0.2
                                            )


800 

TypeError: mean_squared_error() got an unexpected keyword argument 'dtype'

In [None]:
print(mean_squared_error(et, ep, dtype='numeric'))
print(expansion_mse)

  y_true = check_array(y_true, ensure_2d=False, dtype=dtype)


0.304375
[0.3175, 0.29125]


  y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)


In [19]:
# Logistic Regression
LR_Normal_r = classify(model_2, LogisticRegression(
    random_state=1), X_train_exp, y_train_exp)
# SVM
SVM_Normal_r = classify(model_2, svm.SVC(), X_train_exp, y_train_exp)

models_r = []
models_r.append(('LR Normal N=2', LR_Normal_r))
models_r.append(('SVM Normal N=2', SVM_Normal_r))


In [20]:
file = open('results/output_random.txt', 'w+')
file.close()

outfile = open("results/output_random.txt", "a")
for i, v in models_r:
    print(i)
    accuracy = metrics.accuracy_score(y_test, v.predict(X_test))
    confusion_matrix = metrics.confusion_matrix(y_test, v.predict(X_test))
    print('========= {} Model Test Results ==========='.format(i), file=outfile)
    print(' ', file=outfile)
    print("Model Accuracy:" "\n", accuracy, file=outfile)
    print(accuracy)
    print(' ', file=outfile)
    print("Confusion matrix:" "\n", confusion_matrix, file=outfile)
    print(' ', file=outfile)
outfile.close()


LR Normal N=2
0.6539235412474849
SVM Normal N=2
0.6438631790744467


### Uncertainty Sampling

In [21]:
def Expand_U(model, algorithm, seed_set_TK, seed_set_label, expansion_TK,
             expansion_text_labels, batch_size, count, max_threshold):

    seed_TK = copy.deepcopy(seed_set_TK)
    seed_labels = copy.deepcopy(seed_set_label)
    M = np.arange(0, count, batch_size)

    # exp_TK_certain will be the list of comments having high proba score
    exp_TK_certain = []
    exp_TK_certain_labels = []

    try:

        for i in range(1, len(M)):

            print(M[i], end=' ')

            exp_TK = expansion_TK[M[i-1]:M[i]]
            exp_labels = expansion_text_labels[M[i-1]:M[i]]

            # take A as training and B as test and store probs in C
            v = classify(model, algorithm, seed_TK, seed_labels)
            C = v.predict_proba(exp_TK)
            C_abs_diff = [(abs(x[0] - x[1])) for x in C]

            # do sorting
            # [x for _, x in sorted(zip(Y, X))]
            exp_labels = [x for _, x in sorted(zip(C_abs_diff, exp_labels))]
            exp_TK = [x for _, x in sorted(zip(C_abs_diff, exp_TK))]
            C_sorted = [x for _, x in sorted(zip(C_abs_diff, C))]

            Y_uncertain = []
            exp_TK_uncertain = []
            for j in range(len(C_sorted)):
                max_value = max(C_sorted[j])
                max_index = str(np.argmax(C_sorted[j]))

                if(max_value <= max_threshold):
                    Y_uncertain.append(max_index)
                    exp_TK_uncertain.append(exp_TK[j])
                else:
                    exp_TK_certain.append(exp_TK[j])
                    exp_TK_certain_labels.append(exp_labels[j])

            seed_labels.extend(Y_uncertain)
            seed_TK.extend(exp_TK_uncertain)

    except:
        pass

    return seed_TK, seed_labels, exp_TK_certain, exp_TK_certain_labels


In [22]:
X_train_exp_u, y_train_exp_u, X_certain_exp_u, y_certain_exp_u = Expand_U(model_2,
                                                                          LogisticRegression(
                                                                              random_state=1),
                                                                          X_seed.to_list(),
                                                                          y_seed.to_list(),
                                                                          X_expand.to_list(),
                                                                          y_expand.to_list(),
                                                                          40,
                                                                          len(y_expand),
                                                                          0.8
                                                                          )


40 80 120 160 200 240 280 320 360 400 440 480 520 560 600 640 680 720 760 800 840 880 920 960 1000 1040 1080 

In [23]:
print(y_certain_exp_u)

982


In [24]:
# Logistic Regression
LR_Normal_u = classify(model_2, LogisticRegression(
    random_state=1), X_train_exp_u, y_train_exp_u)
# SVM
SVM_Normal_u = classify(model_2, svm.SVC(), X_train_exp_u, y_train_exp_u)

models_u = []
models_u.append(('LR Normal N=2', LR_Normal_u))
models_u.append(('SVM Normal N=2', SVM_Normal_u))


In [25]:
file = open('results/output_uncertain.txt', 'w+')
file.close()

outfile = open("results/output_uncertain.txt", "a")
for i, v in models_u:
    print(i)
    accuracy = metrics.accuracy_score(y_test, v.predict(X_test))
    confusion_matrix = metrics.confusion_matrix(y_test, v.predict(X_test))
    print('========= {} Model Test Results ==========='.format(i), file=outfile)
    print(' ', file=outfile)
    print("Model Accuracy:" "\n", accuracy, file=outfile)
    print(accuracy)
    print(' ', file=outfile)
    print("Confusion matrix:" "\n", confusion_matrix, file=outfile)
    print(' ', file=outfile)
outfile.close()


LR Normal N=2
0.5674044265593562
SVM Normal N=2
0.5674044265593562


### Certainty Sampling

In [26]:
X_train_certain, y_train_certain, count2 = Expand_R(model_2,
                                            X_train_exp_u,
                                            y_train_exp_u,
                                            X_certain_exp_u,
                                            y_certain_exp_u,
                                            20,
                                            len(y_certain_exp_u),
                                            20,
                                            0.2
                                            )


20 40 60 80 100 120 140 160 180 200 220 240 260 280 300 320 340 360 380 400 420 440 460 480 500 520 540 560 580 600 620 640 660 680 700 720 740 760 780 800 820 840 860 880 900 920 940 960 980 

In [27]:
# Logistic Regression
LR_Normal_cc = classify(model_2, LogisticRegression(
    random_state=1), X_train_certain, y_train_certain)
# SVM
SVM_Normal_cc = classify(model_2, svm.SVC(), X_train_certain, y_train_certain)

models_cc = []
models_cc.append(('LR Normal N=2', LR_Normal_cc))
models_cc.append(('SVM Normal N=2', SVM_Normal_cc))


In [28]:
file = open('results/output_certain.txt', 'w+')
file.close()

outfile = open("results/output_certain.txt", "a")
for i, v in models_cc:
    print(i)
    accuracy = metrics.accuracy_score(y_test, v.predict(X_test))
    confusion_matrix = metrics.confusion_matrix(y_test, v.predict(X_test))
    print('========= {} Model Test Results ==========='.format(i), file=outfile)
    print(' ', file=outfile)
    print("Model Accuracy:" "\n", accuracy, file=outfile)
    print(accuracy)
    print(' ', file=outfile)
    print("Confusion matrix:" "\n", confusion_matrix, file=outfile)
    print(' ', file=outfile)
outfile.close()


LR Normal N=2
0.5653923541247485
SVM Normal N=2
0.5674044265593562
