In [15]:
import fasttext
import re
import numpy as np
import pandas as pd
import os
from nltk import TweetTokenizer
from nltk.tokenize.stanford import StanfordTokenizer


## Tokenization

In [16]:
def tokenize(tknzr, sentence, to_lower=True):
    """Tokenize a sentence and replace URLs and @-mentions by placeholders.

    The sentence is stripped, split with ``tknzr``, every token is passed
    through ``format_token``, and the tokens are re-joined with single
    spaces. After optional lowercasing, URLs are replaced by ``<url>`` and
    any ``@`` followed by non-whitespace characters by ``<user>``.

    Arguments:
        - tknzr: a tokenizer implementing the NLTK tokenizer interface
          (only its ``tokenize(str)`` method is called)
        - sentence: a string to be tokenized
        - to_lower: lowercasing or not (applied before the replacements,
          so the placeholders themselves are never altered)

    Returns:
        The processed sentence as one space-joined string.
    """
    tokens = tknzr.tokenize(sentence.strip())
    sentence = ' '.join(format_token(x) for x in tokens)
    if to_lower:
        sentence = sentence.lower()
    # Raw strings: '\.', '\s' and '\@' are invalid escapes in a plain string
    # literal and trigger a warning on recent Python versions.
    sentence = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '<url>', sentence)  # replace urls by <url>
    sentence = re.sub(r'(\@[^\s]+)', '<user>', sentence)  # replace @user268 by <user>
    # The original also called filter(...) here and discarded the result;
    # that call had no effect and has been removed.
    return sentence

# Bracket placeholder tokens and the literal bracket each one stands for.
_BRACKET_PLACEHOLDERS = {
    '-LRB-': '(',
    '-RRB-': ')',
    '-RSB-': ']',
    '-LSB-': '[',
    '-LCB-': '{',
    '-RCB-': '}',
}


def format_token(token):
    """Turn a bracket placeholder token back into its bracket character.

    ``-LRB-``/``-RRB-`` become ``(``/``)``, ``-LSB-``/``-RSB-`` become
    ``[``/``]`` and ``-LCB-``/``-RCB-`` become ``{``/``}``. Any other token
    is returned unchanged. The match is exact and case-sensitive.

    NOTE(review): these look like the Penn Treebank-style bracket escapes
    emitted by some tokenizers (e.g. the Stanford tokenizer imported at the
    top of the notebook) - confirm.
    """
    return _BRACKET_PLACEHOLDERS.get(token, token)

## Nearest neighbor

In [17]:
def NN(model,line,K):
    """Return the K nearest neighbours of ``line`` as reported by ``model``.

    Thin wrapper: forwards to ``model.get_nearest_neighbors(line, k=K)`` and
    returns its result unchanged.

    Arguments:
        - model: object exposing ``get_nearest_neighbors`` (here the loaded
          fastText model)
        - line: the query string
        - K: number of neighbours to request
    """
    neighbours = model.get_nearest_neighbors(line, k=K)
    return neighbours


## Main

In [18]:
# Load the trained fastText model from the binary file in the working directory.
MODEL_PATH = 'fp_bigrams.bin'
model = fasttext.load_model(MODEL_PATH)



In [19]:
# Load seed set
# The with-block closes the file even if read() raises.
# NOTE(review): the file is decoded with the platform default encoding - confirm
# that matches how seed_set.txt was written.
with open("seed_set.txt", "r") as text_file:
    no_str = text_file.read()
# make a list: one entry per line of the file
seed_set = no_str.split("\n")
# A file ending in a newline yields a trailing '' entry, which would later be
# tokenized and queried as if it were a comment; drop that artefact only
# (blank lines inside the file are kept so positions still match the file).
if seed_set[-1] == "":
    seed_set.pop()


In [20]:
# Load corpus
# The with-block closes the file even if read() raises.
# NOTE(review): the file is decoded with the platform default encoding - confirm
# that matches how corpus.txt was written.
with open("corpus.txt", "r") as text_file2:
    no_str2 = text_file2.read()
# make a list: one entry per line of the file
corpus = no_str2.split("\n")
# A file ending in a newline yields a trailing '' entry, which would later be
# tokenized and queried as if it were a comment; drop that artefact only
# (blank lines inside the file are kept so positions still match the file).
if corpus[-1] == "":
    corpus.pop()

In [21]:
# nearest neighbors for seed set, setting k = 20
# Build the tokenizer once; it was previously re-created for every comment.
seed_tokenizer = TweetTokenizer()
NN_seed_set = []  # one neighbour list per entry of seed_set, in the same order
for comment in seed_set:
    tok_comment = tokenize(seed_tokenizer, comment)
    NN_seed_set.append(NN(model, tok_comment, 20))

In [22]:
# nearest neighbors for unlabeled corpus, setting k = 20
# Build the tokenizer once; it was previously re-created for every comment.
corpus_tokenizer = TweetTokenizer()
NN_corpus = []  # one neighbour list per entry of corpus, in the same order
for comment in corpus:
    tok_comment = tokenize(corpus_tokenizer, comment)
    NN_corpus.append(NN(model, tok_comment, 20))

In [23]:
# Print the full result: one inner list per seed comment, each holding
# (score, word) tuples as returned by NN(). This dumps every entry at once.
print(NN_seed_set)


[[(0.9046760201454163, 'protest'), (0.8575693964958191, 'farmer'), (0.8199014663696289, 'bill'), (0.8011854290962219, 'Modi'), (0.7877769470214844, 'farmers'), (0.7787935137748718, 'Support'), (0.7725008726119995, 'protests'), (0.7656182050704956, 'protesting'), (0.7496612668037415, 'never'), (0.7412481307983398, 'mandi'), (0.7395045161247253, 'support'), (0.7342778444290161, 'indian'), (0.7317741513252258, 'supporting'), (0.7236142754554749, 'anti'), (0.7225158214569092, 'problem'), (0.7213343977928162, 'Farmers'), (0.704829752445221, 'rich'), (0.7038158774375916, 'leaders'), (0.6987590789794922, 'shameless'), (0.6975761651992798, 'money')], [(0.9263677000999451, 'please'), (0.8951196670532227, 'india'), (0.863299548625946, 'People'), (0.818062961101532, 'stop'), (0.8104921579360962, 's'), (0.8056684732437134, 'r'), (0.7964104413986206, 'country'), (0.7696121335029602, 'terrorists'), (0.7683792114257812, 'u'), (0.7500729560852051, 'democracy'), (0.7497097253799438, 'people'), (0.74374