In [1]:
import numpy as np
import fasttext
import pandas as pd
import random
import itertools
from resources.tokTT import CommentTokenizer as CT
from resources.basicIO import InputOutput as IO
from resources.basicIO import InputOutput as IO
from resources.filterLang import FilterLanguage as FL
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
import fasttext
import pandas as pd
from scipy import spatial
from scipy.spatial import distance
from sklearn import svm
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
#from sklearn.linear_model import LassoLars
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import copy
import scipy


[nltk_data] Downloading package wordnet to C:\Users\AJAY
[nltk_data]     BISWAS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Preprocess Corpus and Random Sample

In [2]:
raw_corpus = IO.load_text('datasets/corpus.txt')
tokenized_corpus = CT.cleaned('datasets/corpus.txt')
IO.save_text('datasets/tokenized_corpus.txt',tokenized_corpus)

### Make Fasttext Unsupervised Model

In [3]:
model_2 = fasttext.train_unsupervised(
    input="datasets/tokenized_corpus.txt", lr=0.01, epoch=40, wordNgrams=2, dim=300)
model_2.save_model("models/ft_unsupervised_N_2.bin")


### Make dataframes

In [4]:
# corpus
df_dict = {'raw_comment': raw_corpus, 'tokenized_comment': tokenized_corpus}
df_corpus = pd.DataFrame(df_dict)
df_corpus.to_csv('datasets/corpus_data.csv', index=False)

# random sample
text = IO.load_csv_col('datasets/random_sample.csv', 'comment')
text_labels = IO.load_csv_col('datasets/random_sample.csv', 'label')
text_labels = list(map(str, map(int, text_labels)))
text_TK = [CT.tokenize(x) for x in text]

df_dict = {'raw_comment': text,
           'tokenized_comment': text_TK, 'label': text_labels}
df_sample = pd.DataFrame(df_dict)
df_sample.to_csv('datasets/random_sample_data.csv', index=False)


### Remove Unnecessary Comments

In [5]:
# remove unnecessary comments
df_sample = df_sample.drop([x for x in range(len(df_sample)) if int(
    df_sample['label'][x]) != 0 and int(df_sample['label'][x]) != 1])

In [6]:
df_sample.tail()


Unnamed: 0,raw_comment,tokenized_comment,label
3448,These new agri. laws are not feasible in india...,these new agri law be not feasible in india fi...,0
3450,"If this is Farmer protest , why we see only pu...",if this be farmer protest why we see only punj...,1
3451,UP police is trying to clear their image in th...,up police be try to clear their image in the w...,0
3452,Support farmers,support farmer,0
3453,"I agree to many points you made, but MSP is a ...",i agree to many point you make but msp be a do...,1


In [7]:
df_sample['label'].value_counts()


0    1408
1    1074
Name: label, dtype: int64

### Train test split

In [8]:

X_train, X_test, y_train, y_test = train_test_split(df_sample['tokenized_comment'],
                                                    df_sample['label'], test_size=0.2,
                                                    random_state=42,
                                                    stratify=df_sample['label'])


In [9]:
print('X_train: ' ,len(X_train))
print('X_test: ' ,len(X_test))

X_train:  1985
X_test:  497


### Classification

In [10]:
class FastTextTransformer(BaseEstimator, TransformerMixin):
    """ Convert texts into their mean fastText vectors """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.stack([np.mean([self.model[w] for w in text.split()], 0) for text in X])


def classify(small_model, predictor, lines, Y):
    classifier = make_pipeline(
        FastTextTransformer(model=small_model),
        predictor
    ).fit(
        lines,
        Y
    )
    return classifier


### Classifier

In [11]:
# Logistic Regression
LR_Normal = classify(model_2, LogisticRegression(
    random_state=1), X_train, y_train)
# SVM
SVM_Normal = classify(model_2, svm.SVC(), X_train, y_train)

models = []
models.append(('LR Normal N=2', LR_Normal))
models.append(('SVM Normal N=2', SVM_Normal))


In [12]:
file = open('results/output_better.txt', 'w+')
file.close()

outfile = open("results/output_better.txt", "a")
for i, v in models:
    print(i)
    accuracy = metrics.accuracy_score(y_test, v.predict(X_test))
    confusion_matrix = metrics.confusion_matrix(y_test, v.predict(X_test))
    print('========= {} Model Test Results ==========='.format(i), file=outfile)
    print(' ', file=outfile)
    print("Model Accuracy:" "\n", accuracy, file=outfile)
    print(accuracy)
    print(' ', file=outfile)
    print("Confusion matrix:" "\n", confusion_matrix, file=outfile)
    print(' ', file=outfile)
outfile.close()


LR Normal N=2
0.7464788732394366
SVM Normal N=2
0.7645875251509054


### Active Learning

In [13]:
X_seed, X_expand, y_seed, y_expand = train_test_split(X_train,
                                                      y_train, test_size=0.99,
                                                      random_state=41,
                                                      stratify=y_train)


In [14]:
print('X_seed: ',len(X_seed))
print('X_expand: ',len(X_expand))

X_seed:  19
X_expand:  1966


In [15]:
y_seed.value_counts()


0    11
1     8
Name: label, dtype: int64

In [16]:
df_dict = {'seed_tokenized': X_seed,
           'label': y_seed}
df_seed = pd.DataFrame(df_dict)
df_seed.to_csv('datasets/seed_data.csv', index=False)

df_dict = {'expansion_tokenized': X_expand,
           'label': y_expand}
df_expand = pd.DataFrame(df_dict)
df_seed.to_csv('datasets/expand_data.csv', index=False)


### Cosine Similarity And Nearest Neighbors

In [17]:
def score(model, line, k):
    """ Returns a vector containing nearest neighbor scores w.r.t. all
        words in the model """
    # words contains all the words in the corpus
    lst1 = model.get_nearest_neighbors(line, k)
    v1 = []
    l1 = [x[1] for x in lst1]
    l10 = [x[0] for x in lst1]
    for i in range(len(model.words)):
        try:
            v1.append(l10[l1.index(model.words[i])])
        except:
            v1.append(0)
    return v1


def NN(model: fasttext.FastText._FastText, line: str, K):
    """ Returns k fasttext nearest neighbors of a given string """
    return model.get_nearest_neighbors(line, k=K)


def get_NN(model: fasttext.FastText._FastText, lines: list, k: int):
    """ Returns k nearest neighbor scores of multiple strings"""
    scores = []
    for line in lines:
        scores.append(score(model, line, k))
    return scores

def cos_sim(a: np.array, b: np.array):
    """ Returns cosine similarity of two 1d arrays """
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if(norm_a * norm_b == 0.0):
        return dot_product / (norm_a * norm_b + 0.001)
    return dot_product / (norm_a * norm_b)


def sim(x: np.array, y: np.array, sim_type: str):
    if(sim_type == 'cosine_sim'):
        return cos_sim(x, y)


def sim_matrix(A: np.array, B: np.array, sim_type: str):
    """ find similarity score matrix between A and B. 
        A,B: 2d matrix of embeddings/nearest neighbor scores.
        sim_type: String denoting type of similarity.
    """
    m, p = A.shape
    p, n = B.shape
    C = np.zeros((m, n))
    for i in range(m):
        for j in range(n):
            C[i][j] = sim(A[i, :], B[:, j], sim_type)
    return C


### Expansion Code (Random Sampling)

In [18]:
def Expand_R(model: fasttext.FastText._FastText, 
             seed_set_tokenised: list, 
             seed_set_label: list, 
             expansion_tokenised: list, 
             expansion_set_labels: list, 
             batch_size: int, 
             k_neighbors: int, 
             random_rate: float):

    """
    Takes seed set and and expands the set using expansion_tokenised.
    Batch size: no. of texts to be inserted in one go
    k_neighbors: no. of neighbors for getting cosine similarity
    random_rate: fraction of amount taken for random sampling
    """
             
    seed_TK = copy.deepcopy(seed_set_tokenised)
    seed_labels = copy.deepcopy(seed_set_label)
    count = len(expansion_set_labels)
    M = np.arange(0, count, batch_size)
    cnt = int(random_rate * batch_size)
    
    expansion_predicted_labels = []
    expansion_true_labels = []
    expansion_accuracy = []

    for i in range(1, len(M)):

        print(M[i], end=' ')

        # select batchwise expansion text
        exp_TK = expansion_tokenised[M[i-1]:M[i]]
        exp_labels = expansion_set_labels[M[i-1]:M[i]]

        # nearest neighbors
        seed_NN = get_NN(model, seed_TK, k_neighbors)
        exp_NN = get_NN(model, exp_TK, k_neighbors)

        A = np.array(seed_NN)
        B = np.array(exp_NN).T
        C = sim_matrix(A, B, "cosine_sim")

        # find rowwise (seed) index of highest similarity
        Y_ind = np.argmax(C, axis=0)
        # get labels
        Y = [seed_labels[x] for x in Y_ind]

        if(random_rate == 0.0):
            # no random sampling
            pass
        else:
            # random sampling
            Y_r = random.sample(range(0,len(Y)), cnt)
            for j in Y_r:
                Y[j] = exp_labels[j]

        # calc. expansion accuracy
        expansion_predicted_labels.extend(Y)
        expansion_true_labels.extend(exp_labels)
        expansion_accuracy.append(metrics.accuracy_score(exp_labels, Y))

        # expand seed set
        seed_labels.extend(Y)
        seed_TK.extend(exp_TK)

    return seed_TK, seed_labels, expansion_true_labels, expansion_predicted_labels, expansion_accuracy


### Uncertainty Sampling

In [19]:
def Expand_U(model: fasttext.FastText._FastText, 
             algorithm: object, 
             seed_set_tokenised: list, 
             seed_set_label: list, 
             expansion_tokenised: list,
             expansion_set_labels: list, 
             batch_size: int,
             countMax: int):
    """ Uncertainty sampling.
    Expand seed set using expansion_set based on lowest confidance scores.
    max_threshold: max. probability for uncertainty selection """

    seed_TK = copy.deepcopy(seed_set_tokenised)
    seed_labels = copy.deepcopy(seed_set_label)
    count = len(expansion_set_labels)
    M = np.arange(0, count, batch_size)
    confusion_matrices = []

    # exp_TK_certain will be the list of comments having high proba score
    exp_TK_certain = []
    exp_TK_certain_labels = []


    for i in range(1, len(M)):

        #print(M[i], end=' ')

        exp_TK = expansion_tokenised[M[i-1]:M[i]]
        exp_labels = expansion_set_labels[M[i-1]:M[i]]

        # take A as training and B as test and store probs in C
        small_model = classify(model, algorithm, seed_TK, seed_labels)
        # store classwise prob. scores
        C = small_model.predict_proba(exp_TK)
        # Uncertainty sampling scores
        C_abs_diff = [(abs(x[0] - x[1])) for x in C]

        # store accuracies
        confusion_matrices.append(metrics.confusion_matrix(exp_labels, small_model.predict(exp_TK)))

        # Sort lists in ascending order of probabilities from C_abs_diff
        sorted_lists = sorted(zip(exp_labels, exp_TK, C, C_abs_diff), key=lambda x: x[3])
        exp_labels, exp_TK, C_sorted, score = [[x[i] for x in sorted_lists] for i in range(4)]

        Y_uncertain = []
        exp_TK_uncertain = []
        for j in range(len(C_sorted)):
            max_value = max(C_sorted[j])
            max_index = str(np.argmax(C_sorted[j]))

            # label the comments whose score is less than threshold
            if(j < countMax):

                exp_TK_uncertain.append(exp_TK[j])
                Y_uncertain.append(exp_labels[j])
            else:
                exp_TK_certain.append(exp_TK[j])
                exp_TK_certain_labels.append(exp_labels[j])

        # expand the seed set
        seed_labels.extend(Y_uncertain)
        seed_TK.extend(exp_TK_uncertain)


    return seed_TK, seed_labels, exp_TK_certain, exp_TK_certain_labels, confusion_matrices


In [20]:
X_uncertain_exp_u, y_uncertain_exp_u, X_certain_exp_u, y_certain_exp_u, confusion_matrices = Expand_U(model_2,
                                                                          LogisticRegression(random_state=1),
                                                                          list(X_seed),
                                                                          list(y_seed),
                                                                          list(X_expand),
                                                                          list(y_expand),
                                                                          20,
                                                                          1
                                                                          )


In [21]:
print(len(X_seed))
print(len(X_uncertain_exp_u))
print(len(X_certain_exp_u))

19
117
1862


In [22]:
# Logistic Regression
LR_Normal_u = classify(model_2, LogisticRegression(
    random_state=1), X_uncertain_exp_u, y_uncertain_exp_u)
# SVM
SVM_Normal_u = classify(model_2, svm.SVC(), X_uncertain_exp_u, y_uncertain_exp_u)

# GNB
GNB_Normal_u = classify(model_2, GaussianNB(), X_uncertain_exp_u, y_uncertain_exp_u)

# DT
DT_Normal_u = classify(model_2, DecisionTreeClassifier(),X_uncertain_exp_u, y_uncertain_exp_u)

models_u = []
models_u.append(('LR Normal N=2', LR_Normal_u))
models_u.append(('SVM Normal N=2', SVM_Normal_u))
models_u.append(('GNB Normal N=2', GNB_Normal_u))
models_u.append(('DT Normal N=2', DT_Normal_u))


In [23]:
file = open('results/output_uncertain.txt', 'w+')
file.close()

outfile = open("results/output_uncertain.txt", "a")
for i, v in models_u:
    print(i)
    accuracy = metrics.accuracy_score(y_test, v.predict(X_test))
    confusion_matrix = metrics.confusion_matrix(y_test, v.predict(X_test))
    print('========= {} Model Test Results ==========='.format(i), file=outfile)
    print(' ', file=outfile)
    print("Model Accuracy:" "\n", accuracy, file=outfile)
    print(accuracy)
    print(' ', file=outfile)
    print("Confusion matrix:" "\n", confusion_matrix, file=outfile)
    print(' ', file=outfile)
outfile.close()


LR Normal N=2
0.7283702213279678
SVM Normal N=2
0.7082494969818913
GNB Normal N=2
0.7142857142857143
DT Normal N=2
0.6016096579476862


### Automatic 

In [24]:
from joblib import Parallel, delayed

def process(i,j):
    return i * j


results = Parallel(n_jobs=2)(delayed(process)(i,j) for i in range(10) for j in range(10))
print(results)  # prints [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 0, 7, 14, 21, 28, 35, 42, 49, 56, 63, 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 0, 9, 18, 27, 36, 45, 54, 63, 72, 81]


In [25]:
from joblib import Parallel, delayed

models = [('fasttext', 'models/ft_unsupervised_N_2_gen.bin')]
algorithms = ['lr', 'svm']
batch_sizes = [10, 20, 30, 40, 50, 100]
best_amounts = [1, 2, 5]
outp_algorithms = ['lr', 'svm']

def process(models, algorithms,batch_sizes,best_amounts,outp_algorithms):

    big_confusion = []
    big_results = []
    
    for md in models:
        mdl = None
        if(md[0] == 'fasttext'):
            mdl = fasttext.load_model(md[1])

        for algorithm in algorithms:
            alg = None
            if(algorithm == 'lr'):
                alg = LogisticRegression()
            elif(algorithm == 'svm'):
                alg = svm.SVC()

            for batch_size in batch_sizes:
                for best_amount in best_amounts:
                    for outp_algorithm in outp_algorithms:

                        X_uncertain_exp_u,
                        y_uncertain_exp_u,
                        X_certain_exp_u,
                        y_certain_exp_u,
                        confusion_matrices = Expand_U(mdl,
                                                    alg,
                                                    list(X_seed),
                                                    list(y_seed),
                                                    list(X_expand),
                                                    list(y_expand),
                                                    batch_size,
                                                    best_amount
                                                    )

                        alg2 = None
                        if(outp_algorithm == 'lr'):
                            alg2 = LogisticRegression()
                        elif(outp_algorithm == 'svm'):
                            alg2 = svm.SVC()

                        # classify
                        c_model = classify(mdl, alg2,X_uncertain_exp_u, y_uncertain_exp_u)
                        c_accuracy = metrics.accuracy_score(y_test, c_model.predict(X_test))

                        big_results.append((algorithm,batch_size,best_amount,outp_algorithm,c_accuracy))
                        #print(algorithm, ',', batch_size, ',', best_amount,',', outp_algorithm, ',', c_accuracy)
                        big_confusion.append(confusion_matrices)
    return big_results, big_confusion



results = Parallel(n_jobs=4)(delayed(process)(m,i,j,k,l) for m in models for i in algorithms for j in batch_sizes for k in best_amounts for l in outp_algorithms )
print(results)


TypeError: 'int' object is not iterable

In [26]:
print('algorithm',',','batch_size',', ','best_amount',', ','outp_algorithm',',','c_accuracy')

models = [('fasttext', 'models/ft_unsupervised_N_2.bin')]
algorithms = ['lr', 'svm']
batch_sizes = [10, 20, 30, 40, 50, 100]
best_amounts = [1, 2, 5]
outp_algorithms = ['lr', 'svm']

big_confusion = []
big_results = []

for md in models:
    mdl = None
    if(md[0] == 'fasttext'):
        mdl = fasttext.load_model(md[1])

    for algorithm in algorithms:
        alg = None
        if(algorithm == 'lr'):
            alg = LogisticRegression()
        elif(algorithm == 'svm'):
            alg = svm.SVC(probability=True)

        for batch_size in batch_sizes:
            for best_amount in best_amounts:
                 for outp_algorithm in outp_algorithms:

                    X_uncertain_exp_u,
                    y_uncertain_exp_u,
                    X_certain_exp_u,
                    y_certain_exp_u,
                    confusion_matrices = Expand_U(mdl,
                                                    alg,
                                                    list(X_seed),
                                                    list(y_seed),
                                                    list(X_expand),
                                                    list(y_expand),
                                                    batch_size,
                                                    best_amount
                                                    )

                    alg2 = None
                    if(outp_algorithm == 'lr'):
                        alg2 = LogisticRegression()
                    elif(outp_algorithm == 'svm'):
                        alg2 = svm.SVC()

                    # classify
                    c_model = classify(
                        mdl, alg2, X_uncertain_exp_u, y_uncertain_exp_u)
                    c_accuracy = metrics.accuracy_score(
                        y_test, c_model.predict(X_test))

                    big_results.append((algorithm, batch_size, best_amount, outp_algorithm, c_accuracy))
                    print(algorithm, ',', batch_size, ',', best_amount,',', outp_algorithm, ',', c_accuracy)
                    big_confusion.append(confusion_matrices)


algorithm , batch_size ,  best_amount ,  outp_algorithm , c_accuracy




lr , 10 , 1 , lr , 0.7283702213279678
lr , 10 , 1 , svm , 0.7082494969818913
lr , 10 , 2 , lr , 0.7283702213279678
lr , 10 , 2 , svm , 0.7082494969818913
lr , 10 , 5 , lr , 0.7283702213279678
lr , 10 , 5 , svm , 0.7082494969818913
lr , 20 , 1 , lr , 0.7283702213279678
lr , 20 , 1 , svm , 0.7082494969818913
lr , 20 , 2 , lr , 0.7283702213279678
lr , 20 , 2 , svm , 0.7082494969818913
lr , 20 , 5 , lr , 0.7283702213279678
lr , 20 , 5 , svm , 0.7082494969818913
lr , 30 , 1 , lr , 0.7283702213279678
lr , 30 , 1 , svm , 0.7082494969818913
lr , 30 , 2 , lr , 0.7283702213279678
lr , 30 , 2 , svm , 0.7082494969818913
lr , 30 , 5 , lr , 0.7283702213279678
lr , 30 , 5 , svm , 0.7082494969818913
lr , 40 , 1 , lr , 0.7283702213279678
lr , 40 , 1 , svm , 0.7082494969818913
lr , 40 , 2 , lr , 0.7283702213279678
lr , 40 , 2 , svm , 0.7082494969818913
lr , 40 , 5 , lr , 0.7283702213279678
lr , 40 , 5 , svm , 0.7082494969818913
lr , 50 , 1 , lr , 0.7283702213279678
lr , 50 , 1 , svm , 0.7082494969818913