In [1]:
import fasttext
import numpy as np
import pandas as pd
import os
from scipy import spatial
from scipy.spatial import distance
from resources.basicIO import InputOutput as IO
from resources.filterLang import FilterLanguage as FL
from resources.tokTT import CommentTokenizer as CT
import copy

In [2]:
def NN(model, line, K):
    """Return the ``K`` nearest neighbours of ``line`` as reported by ``model``.

    Thin wrapper: forwards to ``model.get_nearest_neighbors(line, k=K)`` and
    hands back whatever that call returns, unchanged.
    """
    neighbours = model.get_nearest_neighbors(line, k=K)
    return neighbours

In [3]:
# Takes 2 vectors a, b and returns the cosine similarity according
# to the definition of the dot product
def cos_sim(a, b):
    """Cosine similarity ``dot(a, b) / (|a| * |b|)`` of two vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors accepted by ``np.dot`` and ``np.linalg.norm``.

    Returns
    -------
    float
        The cosine similarity, or ``0.0`` when either vector has zero norm.
        The zero-norm case would otherwise evaluate 0/0 and yield NaN (plus a
        RuntimeWarning), which breaks any ranking built on the score.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # The angle is undefined for a zero vector; report "no similarity".
        return 0.0
    return dot_product / norm_product

In [4]:
# finds similarity score between two lists
def intersection_score(words, lst1, lst2, score_type):
    """Score how similar two neighbour lists are over a shared vocabulary.

    Each list is projected onto ``words``: slot ``i`` of the projection holds
    the value the list pairs with ``words[i]``, or 0 when the word does not
    occur in the list. The two projections are then compared.

    Parameters
    ----------
    words : sequence
        Vocabulary defining the projection axes (all the words in the corpus).
    lst1, lst2 : sequence of pairs
        Entries are indexed as ``entry[0]`` (value) and ``entry[1]`` (word).
        Words are used as dict keys here, so they are assumed to be hashable
        (e.g. strings) - TODO confirm for non-fastText callers.
    score_type : str
        Only ``'cosine_sim'`` is supported.

    Returns
    -------
    The result of ``cos_sim`` on the two projections when ``score_type`` is
    ``'cosine_sim'``; otherwise ``None``.
    """
    def _project(neighbours):
        # Build the word -> value lookup once instead of scanning the list
        # for every vocabulary word. setdefault keeps the first occurrence of
        # a repeated word, which is what list.index() used to select.
        value_of = {}
        for entry in neighbours:
            value_of.setdefault(entry[1], entry[0])
        return [value_of.get(word, 0) for word in words]

    v1 = _project(lst1)
    v2 = _project(lst2)

    if score_type == 'cosine_sim':
        return cos_sim(np.array(v1), np.array(v2))
    return None

### Load Models

In [5]:
# Load the two pre-trained fastText models from their binary files.
# NOTE(review): the N_2 / N_3 suffix presumably names a training setting
# (e.g. an n-gram size) - confirm against the training notebook.
model_N_2 = fasttext.load_model('models/ft_unsupervised_N_2.bin')
model_N_3 = fasttext.load_model('models/ft_unsupervised_N_3.bin')



### Load Expansion Text and Labels

In [6]:
# Keep the first 500 rows of the random sample: the raw comment text and,
# for the same rows, the label cast to int and rendered as a string.
expansion_text = IO.load_csv_col('datasets/random_sample.csv', 'comment')[0:500]
expansion_text_labels = [
    str(int(raw_label))
    for raw_label in IO.load_csv_col('datasets/random_sample.csv', 'label')[0:500]
]

### Tokenize Expansion text

In [7]:
# One tokenized entry per expansion comment, in the same order.
expansion_TK = list(map(CT.tokenize, expansion_text))

### Load Seed Set

In [8]:
# Read the seed-set comments and their labels from two separate text files.
# NOTE(review): the files are presumably line-aligned (one label per comment) - confirm.
seed_set_text = IO.load_text('datasets/seed_set.txt')
seed_set_labels = IO.load_text('datasets/seed_set_labels.txt')

### Tokenize Seed Set

In [9]:
# Build the tokenized seed set by handing the seed-set file path to CommentTokenizer.cleaned.
# NOTE(review): assumed to yield one entry per seed comment, in the same order as seed_set_text - confirm.
seed_set_TK = CT.cleaned('datasets/seed_set.txt')

### Oracle Intervention

In [10]:
def oracleHelp(classdata):
    """Tell whether any class's share of the total falls in a flagged band.

    Each entry of ``classdata`` is divided by the sum of all entries. The
    result is True as soon as one share lies in [0.40, 0.60] or in
    [0.85, 1]; otherwise False. An empty ``classdata`` gives False.

    Raises
    ------
    ZeroDivisionError
        When ``classdata`` is non-empty but its entries sum to zero.
    """
    total = sum(classdata)
    for votes in classdata:
        share = votes / total
        if 0.40 <= share <= 0.60 or 0.85 <= share <= 1:
            return True
    return False

### Expand Seed Set

In [11]:
# expands seed set, seed set labels and NN_seed_set based on scores and also verifies with user labels
def expand(words, d1, l2, g_seed_set, g_corpus_comment, g_corpus_comment_label, Y, score_type, to_check='F',k=11):
    """Label one corpus comment by k-NN vote and insert it into the seed set.

    The comment's neighbour list ``l2`` is scored against every seed
    neighbour list in ``d1``; the ``k`` best-scoring seeds vote with their
    labels, and the comment is inserted (in place) into ``d1``,
    ``g_seed_set`` and ``Y`` at the position of the first seed that already
    carries the chosen label.

    Parameters
    ----------
    words : sequence
        Vocabulary passed through to ``intersection_score`` (model.words).
    d1 : list
        Neighbour lists of the current seed set (NN_seed_set). Mutated.
    l2 : list
        Neighbour list of the corpus comment being considered.
    g_seed_set : list
        Seed-set comments, parallel to ``d1`` and ``Y``. Mutated.
    g_corpus_comment
        The corpus comment to insert.
    g_corpus_comment_label : str
        Reference ("oracle") label of the corpus comment.
    Y : list of str
        Seed labels, parallel to ``d1``. Only '0', '1' and '2' take part in
        the vote. Mutated.
    score_type : str
        Forwarded to ``intersection_score``.
    to_check : str
        'F': always use the voted label. 'T': use the oracle label when
        ``oracleHelp`` flags the vote distribution, else the voted label.
        Any other value leaves the lists untouched.
    k : int
        Number of best-scoring seeds that vote.

    Returns
    -------
    None

    Notes
    -----
    The comment is skipped (nothing is inserted) when no neighbour casts a
    vote or when the chosen label does not occur in ``Y`` yet. Earlier
    revisions reached the same outcome through a blanket ``except: pass``,
    which also hid unrelated errors and keyboard interrupts.

    Earlier revisions also looked the voted label up as ``Y[<number of seeds
    in the winning class>]``, i.e. they used a class count as a list index,
    which returns whatever label happens to sit at that position. The voted
    label is now derived directly from the winning class.
    """
    scores = [intersection_score(words, seed_neighbours, l2, score_type)
              for seed_neighbours in d1]
    scores_array = np.array(scores)
    # argsort places NaN last, so after the reversal below a NaN score would
    # be ranked as the best match; push such scores to the bottom instead.
    scores_array = np.where(np.isnan(scores_array), -np.inf, scores_array)

    # knn: indices of the k highest-scoring seeds
    nearest = scores_array.argsort()[::-1][:k]

    class_count = [0, 0, 0]
    for seed_pos in nearest:
        seed_label = Y[seed_pos]
        if seed_label in ('0', '1', '2'):
            class_count[int(seed_label)] += 1

    if sum(class_count) == 0:
        # No usable votes (empty seed set or unrecognised labels): skip.
        return

    # Ties resolve to the lowest class number, as list.index(max(...)) does.
    predicted_label = str(class_count.index(max(class_count)))

    if to_check == 'F':
        final_label = predicted_label
    elif to_check == 'T':
        # When the oracle is consulted its label wins, whether or not it
        # agrees with the vote.
        final_label = g_corpus_comment_label if oracleHelp(class_count) else predicted_label
    else:
        return

    if final_label not in Y:
        # There is no existing group of this label to insert into: skip.
        return

    # Insert at the head of the label's group so the three lists stay parallel.
    insert_at = Y.index(final_label)
    d1.insert(insert_at, l2)
    g_seed_set.insert(insert_at, g_corpus_comment)
    Y.insert(insert_at, final_label)


In [12]:
def expand_seed_set(model,
                    seed_set_text,
                    seed_set_labels,
                    seed_set_TK,
                    expansion_text,
                    expansion_TK,
                    neighbors=40,
                    expand_limit=500,
                    score_type='cosine_sim',
                    expansion_labels=None,
                    to_check='T'):
    """Grow a labelled seed set with comments from an expansion corpus.

    Nearest neighbours are computed with ``NN`` for every tokenized seed
    comment and for the expansion comments that will be processed; each
    expansion comment is then offered to ``expand`` in order.

    Parameters
    ----------
    model
        Object passed to ``NN``; its ``words`` attribute is passed to ``expand``.
    seed_set_text, seed_set_labels
        Seed comments and labels. They are deep-copied, not modified.
    seed_set_TK
        Tokenized seed comments, one per seed.
    expansion_text, expansion_TK
        Raw and tokenized expansion comments, index-aligned.
    neighbors : int
        Number of nearest neighbours requested per comment.
    expand_limit : int
        Maximum number of expansion comments to process. It is clamped to the
        amount of data actually supplied, so a short corpus no longer raises
        IndexError.
    score_type : str
        Forwarded to ``expand``.
    expansion_labels : sequence, optional
        Labels aligned with ``expansion_text``. When omitted, the
        notebook-level ``expansion_text_labels`` is used, which is what this
        function always read before the parameter existed.
    to_check : str
        Forwarded to ``expand``; the default 'T' is the previously
        hard-coded value.

    Returns
    -------
    tuple
        ``(seed_text_expanded, Y_expanded)``: the expanded comments and labels.
    """
    if expansion_labels is None:
        # Backward-compatible fallback to the notebook global.
        expansion_labels = expansion_text_labels

    # Never index past the shortest of the aligned inputs.
    limit = max(0, min(expand_limit,
                       len(expansion_text),
                       len(expansion_TK),
                       len(expansion_labels)))

    # nearest neighbors of seed set (a fresh list, so expand() may grow it)
    NN_seed_set_expanded = [NN(model, comment, neighbors) for comment in seed_set_TK]

    # nearest neighbors for the part of the unlabeled corpus that will be used
    NN_exp_corpus = [NN(model, comment, neighbors) for comment in expansion_TK[:limit]]

    # copies of the seed set that expand() mutates in place
    seed_text_expanded = copy.deepcopy(seed_set_text)
    Y_expanded = copy.deepcopy(seed_set_labels)

    # Look the vocabulary up once rather than on every iteration.
    words = model.words

    for i in range(limit):
        expand(words,
               NN_seed_set_expanded,
               NN_exp_corpus[i],
               seed_text_expanded,
               expansion_text[i],
               expansion_labels[i],
               Y_expanded,
               score_type,
               to_check)

    return seed_text_expanded, Y_expanded

### N=2

In [13]:
# Expand the seed set with the N=2 model, then write the expanded comments
# and their labels to datasets_post/.
seed_text_expanded_N_2, Y_expanded_N_2 = expand_seed_set(
    model=model_N_2,
    seed_set_text=seed_set_text,
    seed_set_labels=seed_set_labels,
    seed_set_TK=seed_set_TK,
    expansion_text=expansion_text,
    expansion_TK=expansion_TK,
)

IO.save_text('datasets_post/seed_set_expanded_N_2.txt', seed_text_expanded_N_2)
IO.save_text('datasets_post/seed_set_expanded_labels_N_2.txt', map(str, Y_expanded_N_2))


.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.


KeyboardInterrupt: 

### N=3

In [None]:

# Expand the seed set with the N=3 model, then write the expanded comments
# and their labels to datasets_post/.
seed_text_expanded_N_3, Y_expanded_N_3 = expand_seed_set(
    model=model_N_3,
    seed_set_text=seed_set_text,
    seed_set_labels=seed_set_labels,
    seed_set_TK=seed_set_TK,
    expansion_text=expansion_text,
    expansion_TK=expansion_TK,
)

IO.save_text('datasets_post/seed_set_expanded_N_3.txt', seed_text_expanded_N_3)
IO.save_text('datasets_post/seed_set_expanded_labels_N_3.txt', map(str, Y_expanded_N_3))


### Demo

In [None]:
# Nearest Neighbors: 20 neighbours from the N=2 model for three example
# sentences, printed with a blank line between the lists.
c1, c2, c3 = (
    NN(model_N_2, sentence, 20)
    for sentence in (
        'I support the farmers protest',
        'I am against the farmers protest',
        'the farm bills are actually good the govt is doing right',
    )
)
print(c1, c2, c3, sep='\n\n')

In [None]:
# Cosine similarity between the neighbour lists c1 and c2, projected onto
# the N=2 model's vocabulary; the value is displayed as the cell output.
intersection_score(model_N_2.words,c1,c2,'cosine_sim')

In [None]:
# Cosine similarity between the neighbour lists c2 and c3, projected onto
# the N=2 model's vocabulary; the value is displayed as the cell output.
intersection_score(model_N_2.words,c2,c3,'cosine_sim')