In [402]:
import string
import fasttext
import math
import re
import numpy as np
import pandas as pd
import os
from nltk import TweetTokenizer
from nltk.tokenize.stanford import StanfordTokenizer
from scipy import spatial
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally (only downloads when missing or outdated).
nltk.download('stopwords')
# English stop-word list, loaded once at import time and reused by tokenize().
cachedStopWords = stopwords.words("english")
# Non-greedy pattern for anything shaped like an HTML tag: '<' ... '>'. Used by remove_html_tags().
CLEANR = re.compile('<.*?>')


[nltk_data] Downloading package stopwords to C:\Users\AJAY
[nltk_data]     BISWAS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Tokenization

In [403]:
def remove_punctuation(text, punct_list):
    """Blank out punctuation in ``text``.

    Arguments:
        - text: the string to clean
        - punct_list: iterable of substrings; every occurrence of each one
          is replaced by a single space

    Returns:
        The cleaned string with leading/trailing whitespace stripped.
    """
    cleaned = text
    for mark in punct_list:
        # str.replace leaves the string untouched when ``mark`` is absent
        cleaned = cleaned.replace(mark, ' ')
    return cleaned.strip()


def remove_html_tags(raw_html):
    """Return ``raw_html`` with every match of the module-level ``CLEANR`` pattern deleted."""
    return CLEANR.sub('', raw_html)

def tokenize(sentence, to_lower=True, tknzr=TweetTokenizer()):
    """Clean a raw comment and return it as a single space-separated string.

    Processing order: tokenize with ``tknzr`` (bracket placeholders are mapped
    back through ``format_token``), optionally lowercase, delete URLs and
    @mentions, drop one-character words, delete HTML tags, replace
    punctuation with spaces, and finally drop English stop words.

    Arguments:
        - sentence: a string to be tokenized
        - to_lower: lowercasing or not
        - tknzr: a tokenizer implementing the NLTK tokenizer interface

    Returns:
        The cleaned sentence as a string of words joined by single spaces.
    """
    sentence = sentence.strip()
    sentence = ' '.join([format_token(x) for x in tknzr.tokenize(sentence)])
    if to_lower:
        sentence = sentence.lower()
    # remove urls (the match is deleted, not replaced by a placeholder);
    # raw strings keep the regex escapes from being parsed as string escapes
    sentence = re.sub(
        r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '', sentence)
    # remove @user mentions
    sentence = re.sub(r'(\@[^\s]+)', '', sentence)

    # remove single letter words
    sentence = ' '.join([w for w in sentence.split() if len(w) > 1])

    sentence = remove_html_tags(sentence)
    sentence = remove_punctuation(sentence, list(string.punctuation))
    # stop-word match is exact, so it is case-sensitive when to_lower is False
    sentence = ' '.join([word for word in sentence.split()
                        if word not in cachedStopWords])
    return sentence

def format_token(token):
    """Translate a bracket placeholder token back to its bracket character.

    ``-LRB-``/``-RRB-`` become ``(``/``)``, ``-LSB-``/``-RSB-`` become
    ``[``/``]`` and ``-LCB-``/``-RCB-`` become ``{``/``}``. Any other token
    is returned unchanged.
    """
    brackets = {
        '-LRB-': '(',
        '-RRB-': ')',
        '-RSB-': ']',
        '-LSB-': '[',
        '-LCB-': '{',
        '-RCB-': '}',
    }
    return brackets.get(token, token)

## Nearest neighbor

In [404]:
def NN(model, line, K):
    """Return the ``K`` nearest neighbours of ``line`` according to ``model``.

    Thin wrapper: the result of ``model.get_nearest_neighbors(line, k=K)``
    is passed through unchanged.
    """
    neighbours = model.get_nearest_neighbors(line, k=K)
    return neighbours


### Cosine Similarity

In [405]:
def cosine_similarity(v1, v2):
    """Cosine of the angle between two numeric sequences.

    Only the first ``len(v1)`` entries of ``v2`` are read, so ``v2`` must be
    at least as long as ``v1``.

    Returns:
        ``dot(v1, v2) / sqrt(|v1|^2 * |v2|^2)``, or ``0`` when that
        denominator is zero (including the empty-input case).
    """
    dot = norm1_sq = norm2_sq = 0
    for idx, a in enumerate(v1):
        b = v2[idx]
        norm1_sq += a * a
        norm2_sq += b * b
        dot += a * b

    denominator = math.sqrt(norm1_sq * norm2_sq)
    if denominator == 0:
        return 0
    return dot / denominator


### Reference: introduction to cosine similarity — https://paulminogue.com/index.php/2019/09/29/introduction-to-cosine-similarity/

## Main

### Load data

In [406]:
# Load the fastText model from its binary file
MODEL_PATH = 'fp_bigrams_unsupervised.bin'
model = fasttext.load_model(MODEL_PATH)



In [407]:
# Load seed set
# The context manager closes the file even if reading fails.
# NOTE(review): UTF-8 is assumed so the text round-trips with the UTF-8
# file written for the expanded seed set — confirm the input encoding.
with open("seed_set.txt", "r", encoding="utf-8") as text_file:
    no_str = text_file.read()
# make a list: one seed comment per line
seed_set = no_str.split("\n")


In [408]:
# Load corpus
# The context manager closes the file even if reading fails.
# NOTE(review): UTF-8 is assumed so the text round-trips with the UTF-8
# file written for the expanded seed set — confirm the input encoding.
with open("corpus.txt", "r", encoding="utf-8") as text_file2:
    no_str2 = text_file2.read()
# make a list: one corpus comment per line
corpus = no_str2.split("\n")

### Nearest Neighbor sets

In [409]:
# nearest neighbors for each tokenized seed comment, setting k = 20
NN_seed_set = [NN(model, tokenize(comment), 20) for comment in seed_set]

In [410]:
# nearest neighbors for each tokenized comment of the unlabeled corpus, setting k = 20
NN_corpus = [NN(model, tokenize(comment), 20) for comment in corpus]

### NN Sampling by finding distance

In [411]:
# Sanity check: show the (similarity, word) neighbour pairs computed for the first seed comment.
print(NN_seed_set[0])

[(0.8419418334960938, 'farmer'), (0.7533829808235168, 'bill'), (0.747085452079773, 'let'), (0.7365711331367493, 'win'), (0.7077261209487915, 'protesting'), (0.687737226486206, 'back'), (0.686690628528595, 'without'), (0.6720230579376221, 'mandi'), (0.6683642268180847, 'middle'), (0.6678685545921326, 'fighting'), (0.6671098470687866, 'ask'), (0.6629432439804077, 'face'), (0.6585047245025635, 'terrorists'), (0.6526790857315063, 'months'), (0.6507577300071716, 'cannot'), (0.6430784463882446, 'farm'), (0.6409282684326172, 'lol'), (0.6402221918106079, 'protest'), (0.6371271014213562, 'violence'), (0.636122465133667, 'take')]


### Seed Set Expansion

In [414]:
def intersection_score(lst1, lst2):
    """Compare two nearest-neighbour lists on the words they have in common.

    Arguments:
        - lst1, lst2: sequences of ``(similarity, word)`` pairs, e.g. the
          output of ``NN``

    Returns:
        The cosine similarity between the two similarity vectors restricted
        to the shared words, where position ``k`` of both vectors refers to
        the same word. ``cosine_similarity`` yields 0 when no word is shared.
    """
    # word -> similarity in lst2; the first occurrence wins if a word repeats
    scores2 = {}
    for pair in lst2:
        scores2.setdefault(pair[1], pair[0])

    v1 = []
    v2 = []
    seen = set()
    # Walk lst1 once and pull the matching lst2 score by word, so both
    # vectors stay aligned on the same word and have equal length.
    for pair in lst1:
        word = pair[1]
        if word in scores2 and word not in seen:
            seen.add(word)
            v1.append(pair[0])
            v2.append(scores2[word])

    return cosine_similarity(v1, v2)

In [415]:
def expand_label(d1, l2, seed_set, corpus_comment, Y):
    """Label one corpus comment after its most similar seed and add it to the seed data.

    Arguments:
        - d1: list of neighbour lists of the labelled seeds (mutated in place)
        - l2: neighbour list of the corpus comment being labelled
        - seed_set: list of labelled comments (mutated in place)
        - corpus_comment: the comment text being labelled
        - Y: list of labels parallel to ``d1`` (mutated in place)

    The comment inherits the label of the entry of ``d1`` with the highest
    ``intersection_score`` (first one on ties). Label-0 items are appended at
    the end of the three lists; any other label is stored as 1 at the front.
    Nothing is returned.
    """
    similarities = [intersection_score(neighbours, l2) for neighbours in d1]
    # first index holding the maximum similarity
    nearest = max(range(len(similarities)), key=similarities.__getitem__)

    if Y[nearest] == 0:
        for target, item in ((Y, 0), (d1, l2), (seed_set, corpus_comment)):
            target.append(item)
    else:
        for target, item in ((Y, 1), (d1, l2), (seed_set, corpus_comment)):
            target.insert(0, item)



In [416]:
# labels for 10 seed set
Y = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

# Work on shallow copies: expand_label mutates its list arguments, and
# aliasing the originals would make seed_set / NN_seed_set grow on every
# re-run of this cell while Y is reset to 10 labels, misaligning them.
seed_set_expanded = list(seed_set)
Y_expanded = list(Y)
NN_seed_set_expanded = list(NN_seed_set)

# number of corpus comments to label, and the corpus index of the first one
N_EXPAND = 20
CORPUS_OFFSET = 1000

# expand by N_EXPAND comments
for i in range(N_EXPAND):
    expand_label(NN_seed_set_expanded, NN_corpus[i + CORPUS_OFFSET],
                 seed_set_expanded, corpus[i + CORPUS_OFFSET], Y_expanded)

In [417]:
# save the expanded comments as txt, one comment per line
with open('seed_set_expanded.txt', mode='wt', encoding='utf-8') as comments_out:
    comments_text = '\n'.join(seed_set_expanded)
    comments_out.write(comments_text)

# save the matching labels, one per line, in the same order as the comments
Y_expanded_string = map(str, Y_expanded)
with open('seed_set_expanded_labels.txt', mode='wt', encoding='utf-8') as labels_out:
    labels_text = '\n'.join(Y_expanded_string)
    labels_out.write(labels_text)
