In [137]:
import fasttext
import numpy as np
import pandas as pd
import os
from scipy import spatial
from scipy.spatial import distance
from resources.basicIO import InputOutput as IO
from resources.filterLang import FilterLanguage as FL
from resources.tokTT import CommentTokenizer as CT
import copy

In [138]:
def NN(model, line, K):
    """Return the K nearest neighbours of `line` as reported by `model`.

    Thin wrapper: forwards to ``model.get_nearest_neighbors(line, k=K)`` and
    hands back whatever that call returns.
    """
    neighbours = model.get_nearest_neighbors(line, k=K)
    return neighbours

In [139]:
def cos_sim(a, b):
    """Return the cosine similarity of two vectors.

    Follows the definition of the dot product:
    ``dot(a, b) / (norm(a) * norm(b))``.

    Parameters
    ----------
    a, b : array-like
        Vectors accepted by ``np.dot`` and ``np.linalg.norm``.

    Returns
    -------
    float
        The cosine similarity, or 0.0 when either vector has zero length.
        The similarity is undefined in that case; returning 0.0 avoids the
        division by zero that would otherwise produce NaN (and a
        RuntimeWarning), which breaks later ``max()`` comparisons.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # At least one vector is all zeros: nothing in common by convention.
        return 0.0
    return dot_product / norm_product

In [140]:
def intersection_score(words, lst1, lst2, score_type):
    """Score how similar two neighbour lists are over a shared vocabulary.

    Each neighbour list is turned into a vector with one component per entry
    of `words`: the component is the value stored with that word in the list,
    or 0 when the word does not occur in the list. The two vectors are then
    compared with the requested metric.

    Parameters
    ----------
    words : sequence
        Vocabulary that defines the vector components (one per word).
    lst1, lst2 : sequence
        Neighbour lists whose entries are indexed as ``entry[0]`` (value) and
        ``entry[1]`` (word) -- presumably the ``(score, word)`` pairs returned
        by the model's nearest-neighbour query; confirm against the caller.
    score_type : str
        Only ``'cosine_sim'`` is supported.

    Returns
    -------
    The result of ``cos_sim`` for ``'cosine_sim'``; ``None`` for any other
    `score_type`.
    """
    def _to_vector(neighbours):
        # Map word -> value once instead of scanning the list for every
        # vocabulary word. setdefault keeps the FIRST value seen for a
        # duplicated word, which is what list.index would have selected.
        value_by_word = {}
        for entry in neighbours:
            value_by_word.setdefault(entry[1], entry[0])
        return [value_by_word.get(word, 0) for word in words]

    v1 = _to_vector(lst1)
    v2 = _to_vector(lst2)

    if score_type == 'cosine_sim':
        return cos_sim(np.array(v1), np.array(v2))
    return None

### Load Models

In [141]:
# Load the two fastText model files used below (one per N setting).
model_N_2, model_N_3 = (
    fasttext.load_model(model_path)
    for model_path in ('models/ft_unsupervised_N_2.bin',
                       'models/ft_unsupervised_N_3.bin')
)



### Load Expansion Text

In [142]:
# Unlabeled pool used for expansion: entries 200..499 of the 'comment' column
# of the random sample (i.e. at most 300 comments, starting at index 200).
EXPANSION_START, EXPANSION_STOP = 200, 500
expansion_text = IO.load_csv_col('datasets/random_sample.csv', 'comment')[EXPANSION_START:EXPANSION_STOP]

### Tokenize Expansion text

In [143]:
# Run every expansion comment through CT.tokenize, keeping the original order.
expansion_TK = list(map(CT.tokenize, expansion_text))

### Load Seed Set

In [144]:
# Seed comments and their labels come from two separate text files
# (presumably line-aligned: entry i of the labels belongs to comment i).
seed_set_text, seed_set_labels = (
    IO.load_text('datasets/seed_set.txt'),
    IO.load_text('datasets/seed_set_labels.txt'),
)

### Tokenize Seed Set

In [145]:
# Seed-set counterpart of expansion_TK, built by CT.cleaned straight from the
# seed-set file rather than from the already loaded seed_set_text.
# NOTE(review): the expansion comments are processed with CT.tokenize instead;
# confirm both helpers produce comparable tokenization and preserve line order.
seed_set_TK = CT.cleaned('datasets/seed_set.txt')

### Expand Seed Set

In [146]:
# words: model.words, d1: NN_seed_set, l2: NN_exp_corpus_line, seed_set: seed_set, corpus_comment: exp_line, Y: labels
def expand(words, d1, l2, g_seed_set, g_corpus_comment, Y, score_type):
    """Add one unlabeled comment to the seed set under its best match's label.

    `l2` is scored against every neighbour list in `d1` with
    ``intersection_score``; the label stored in `Y` at the best-scoring
    position is copied to the new comment. `d1`, `g_seed_set` and `Y` are
    mutated in place and kept aligned with each other:

    * label ``'0'``: the new entry is inserted at the front;
    * label ``'2'``: the new entry is appended at the end;
    * label ``'1'``: the new entry is inserted at the first ``'1'`` in `Y`;
    * any other label: nothing is changed.

    (This placement presumably keeps the lists grouped by label when `Y`
    starts out sorted -- confirm against the seed-set files.)

    Parameters
    ----------
    words : sequence
        Vocabulary forwarded to ``intersection_score``.
    d1 : list
        Neighbour lists of the current seed set, one per seed comment.
    l2 : sequence
        Neighbour list of the candidate comment.
    g_seed_set : list
        Seed comments, aligned with `d1` and `Y`.
    g_corpus_comment
        The candidate comment to add.
    Y : list
        Labels (the strings ``'0'``, ``'1'``, ``'2'``), aligned with `d1`.
    score_type : str
        Metric name forwarded to ``intersection_score``.

    Returns
    -------
    None

    Notes
    -----
    Nothing is changed when `d1` is empty or when the best position has no
    label in `Y`. NaN scores are ranked below every real score, because a NaN
    in first position would otherwise be returned by ``max()`` and select a
    seed comment that has no overlap with the candidate.
    """
    scores = [intersection_score(words, i, l2, score_type) for i in d1]
    if not scores:
        # No seed entries to compare against.
        return

    def _rank(position):
        value = scores[position]
        # NaN is the only value that differs from itself.
        return float('-inf') if value != value else value

    # First position holding the highest score (ties resolve to the earliest).
    maxpos = max(range(len(scores)), key=_rank)

    if maxpos >= len(Y):
        # Labels and neighbour lists are out of step; skip instead of guessing.
        return

    label = Y[maxpos]
    if label == '0':
        Y.insert(0, '0')
        d1.insert(0, l2)
        g_seed_set.insert(0, g_corpus_comment)
    elif label == '2':
        Y.append('2')
        d1.append(l2)
        g_seed_set.append(g_corpus_comment)
    elif label == '1':
        # Resolve the position once, before Y itself is modified.
        first_one = Y.index('1')
        d1.insert(first_one, l2)
        g_seed_set.insert(first_one, g_corpus_comment)
        Y.insert(first_one, '1')


In [147]:
def expand_seed_set(model,
                    seed_set_text,
                    seed_set_labels,
                    seed_set_TK,
                    expansion_text,
                    expansion_TK,
                    neighbors=20,
                    expand_limit=10,
                    score_type='cosine_sim'):
    """Grow a labeled seed set with comments from an unlabeled corpus.

    Nearest neighbours are queried from `model` for every tokenized seed
    comment and for the candidate comments. Each candidate is then handed to
    ``expand``, which adds it to the seed set under the label of its
    best-matching seed comment.

    Parameters
    ----------
    model
        Object exposing ``get_nearest_neighbors`` (used via ``NN``) and a
        ``words`` attribute.
    seed_set_text, seed_set_labels : list
        Seed comments and their labels. Both are deep-copied, so the caller's
        lists are left untouched.
    seed_set_TK : iterable
        Tokenized seed comments, one per entry of `seed_set_text`.
    expansion_text, expansion_TK : sequence
        Candidate comments and their tokenized form, index-aligned.
    neighbors : int
        Number of nearest neighbours requested per comment.
    expand_limit : int
        Maximum number of candidates to process, taken from the start of the
        expansion sequences. Values larger than the available data are
        clamped rather than raising IndexError.
    score_type : str
        Metric name forwarded to ``expand``.

    Returns
    -------
    (list, list)
        The expanded seed comments and the expanded labels.
    """
    # Never index past the data that is actually available (and treat a
    # negative limit as "no candidates", as range() did).
    n_candidates = max(0, min(expand_limit, len(expansion_text), len(expansion_TK)))

    # nearest neighbors of seed set
    NN_seed_set = [NN(model, comment, neighbors) for comment in seed_set_TK]

    # nearest neighbors for the unlabeled corpus; only the candidates that
    # will be processed are queried, the rest would be thrown away
    NN_exp_corpus = [NN(model, comment, neighbors)
                     for comment in expansion_TK[:n_candidates]]

    # seed set to be expanded; NN_seed_set is local, so it needs no copy
    seed_text_expanded = copy.deepcopy(seed_set_text)
    Y_expanded = copy.deepcopy(seed_set_labels)

    for i in range(n_candidates):
        expand(model.words,
               NN_seed_set,
               NN_exp_corpus[i],
               seed_text_expanded,
               expansion_text[i],
               Y_expanded,
               score_type)

    return seed_text_expanded, Y_expanded

### N=2

In [148]:
# Expand the seed set with the N=2 model, using the function's default
# neighbors / expand_limit / score_type.
seed_text_expanded_N_2, Y_expanded_N_2 = expand_seed_set(
    model=model_N_2,
    seed_set_text=seed_set_text,
    seed_set_labels=seed_set_labels,
    seed_set_TK=seed_set_TK,
    expansion_text=expansion_text,
    expansion_TK=expansion_TK,
)

### N=3

In [149]:
# Expand the seed set with the N=3 model, using the function's default
# neighbors / expand_limit / score_type.
seed_text_expanded_N_3, Y_expanded_N_3 = expand_seed_set(
    model=model_N_3,
    seed_set_text=seed_set_text,
    seed_set_labels=seed_set_labels,
    seed_set_TK=seed_set_TK,
    expansion_text=expansion_text,
    expansion_TK=expansion_TK,
)


### Save the expanded seed set and labels

In [150]:
# Make sure the output directory exists so a fresh Restart & Run All does not
# depend on it having been created by hand.
os.makedirs('datasets_post', exist_ok=True)

# Labels are passed as real lists (not single-use map iterators), the same
# kind of object as the comment lists handed to IO.save_text.
IO.save_text('datasets_post/seed_set_expanded_N_2.txt', seed_text_expanded_N_2)
IO.save_text('datasets_post/seed_set_expanded_labels_N_2.txt',
             [str(label) for label in Y_expanded_N_2])

IO.save_text('datasets_post/seed_set_expanded_N_3.txt', seed_text_expanded_N_3)
IO.save_text('datasets_post/seed_set_expanded_labels_N_3.txt',
             [str(label) for label in Y_expanded_N_3])
