In [14]:
import fasttext
import re
import numpy as np
import pandas as pd
import os
from nltk import TweetTokenizer
from nltk.tokenize.stanford import StanfordTokenizer
from scipy.spatial import distance

## Tokenization

In [15]:
def tokenize(tknzr, sentence, to_lower=True):
    """Tokenize a sentence and normalise URLs and user mentions.

    The sentence is stripped, split into tokens with ``tknzr``, every token
    is passed through ``format_token`` and the tokens are re-joined with
    single spaces. URLs are then replaced by ``<url>`` and ``@mentions`` by
    ``<user>``.

    Arguments:
        - tknzr: a tokenizer implementing the NLTK tokenizer interface
          (only its ``tokenize(str)`` method is used)
        - sentence: a string to be tokenized
        - to_lower: lowercasing or not; lowercasing happens before the
          URL/mention replacement, so the inserted placeholders are not
          affected by it

    Returns:
        The processed sentence as one space-separated string.
    """
    tokens = [format_token(tok) for tok in tknzr.tokenize(sentence.strip())]
    sentence = ' '.join(tokens)
    if to_lower:
        sentence = sentence.lower()
    # Raw strings keep `\.` and `\S` as regex escapes instead of invalid
    # Python string escapes. The former third alternative `http?://` was a
    # typo: redundant with `https?://` except for also matching "htt://".
    # NOTE: matching is case-sensitive, so with to_lower=False an upper-case
    # "HTTP://..." or "WWW...." is left untouched.
    sentence = re.sub(r'(www\.\S+)|(https?://\S+)', '<url>', sentence)  # replace urls by <url>
    sentence = re.sub(r'@\S+', '<user>', sentence)  # replace @user268 by <user>
    # A discarded `filter(...)` call used to sit here; it had no effect on
    # the returned string and has been removed.
    return sentence

def format_token(token):
    """Replace a bracket placeholder token with the bracket it stands for.

    The six placeholders ``-LRB-``, ``-RRB-``, ``-LSB-``, ``-RSB-``,
    ``-LCB-`` and ``-RCB-`` are mapped to ``(``, ``)``, ``[``, ``]``, ``{``
    and ``}`` respectively. Matching is exact and case-sensitive; any other
    token is returned unchanged.
    """
    bracket_for_placeholder = {
        '-LRB-': '(',
        '-RRB-': ')',
        '-LSB-': '[',
        '-RSB-': ']',
        '-LCB-': '{',
        '-RCB-': '}',
    }
    return bracket_for_placeholder.get(token, token)

## Nearest neighbor

In [16]:
def NN(model,line,K):
    """Return the K nearest neighbours of ``line`` according to ``model``.

    Thin wrapper: delegates to ``model.get_nearest_neighbors(line, k=K)``
    and hands back whatever that call returns, unchanged.
    """
    neighbours = model.get_nearest_neighbors(line, k=K)
    return neighbours


## Cosine Distance

In [17]:
def cosine(v1,v2):
    """Return the cosine distance between vectors ``v1`` and ``v2``.

    Delegates to ``scipy.spatial.distance.cosine``, so the value is a
    distance (1 minus the cosine similarity), not a similarity: vectors
    pointing the same way give 0 and orthogonal vectors give 1.
    """
    dist = distance.cosine(v1, v2)
    return dist

## Main

### Load data

In [18]:
# Load the fastText model from its binary file; the path is relative to the
# notebook's working directory.
MODEL_PATH = 'fp_bigrams_unsupervised.bin'
model = fasttext.load_model(MODEL_PATH)



In [19]:
# Load seed set: the file is read whole and split into one entry per line.
# `with` closes the handle even if the read fails, and the explicit encoding
# makes the result independent of the platform's default.
with open("seed_set.txt", "r", encoding="utf-8") as text_file:
    no_str = text_file.read()
# Make a list, skipping blank lines: a plain split("\n") leaves an empty
# final entry when the file ends with a newline, and that entry would then be
# tokenized and queried like a real comment.
seed_set = [line for line in no_str.split("\n") if line.strip()]


In [20]:
# Load corpus: the file is read whole and split into one entry per line.
# `with` closes the handle even if the read fails, and the explicit encoding
# makes the result independent of the platform's default.
with open("corpus.txt", "r", encoding="utf-8") as text_file2:
    no_str2 = text_file2.read()
# Make a list, skipping blank lines: a plain split("\n") leaves an empty
# final entry when the file ends with a newline, and that entry would then be
# tokenized and queried like a real comment.
corpus = [line for line in no_str2.split("\n") if line.strip()]

In [21]:
# Labels for the 10 seed-set entries, in order: the first five are labelled 1
# and the last five are labelled 0.
Y = [1] * 5 + [0] * 5

### Nearest Neighbor sets

In [22]:
# Nearest neighbors for the seed set, setting k = 20.
# The tokenizer is built once and reused; constructing a new TweetTokenizer
# for every comment is loop-invariant work.
seed_tknzr = TweetTokenizer()
NN_seed_set = []
for comment in seed_set:
    tok_comment = tokenize(seed_tknzr, comment)
    NN_seed_set.append(NN(model, tok_comment, 20))

In [23]:
# Nearest neighbors for the unlabeled corpus, setting k = 20.
# The tokenizer is built once and reused; constructing a new TweetTokenizer
# for every comment is loop-invariant work.
corpus_tknzr = TweetTokenizer()
NN_corpus = []
for comment in corpus:
    tok_comment = tokenize(corpus_tknzr, comment)
    NN_corpus.append(NN(model, tok_comment, 20))

### NN Sampling by finding distance

In [24]:
# Inspect the neighbours stored for the first seed-set entry; per the output
# below, each one is a (score, token) pair.
print(NN_seed_set[0])

[(0.7868461012840271, 'farmer'), (0.7597593665122986, 'protest'), (0.7351622581481934, 'protests'), (0.6815500855445862, 'should'), (0.648932933807373, 'anti'), (0.634634256362915, 'mandi'), (0.6068609356880188, 'bill'), (0.6053192019462585, 'ram'), (0.6050131916999817, 'without'), (0.5900097489356995, 'farmers'), (0.5886179208755493, 'live'), (0.5833055973052979, 'by'), (0.5544314384460449, 'these'), (0.5535263419151306, 'if'), (0.5504714250564575, 'law'), (0.5485056638717651, 'doing'), (0.5484163761138916, 'rich'), (0.5476019382476807, 'there'), (0.5462289452552795, 'let'), (0.528472900390625, 'jai')]


In [25]:
# Inspect the neighbours stored for the first corpus entry; per the output
# below, each one is a (score, token) pair.
print(NN_corpus[0])

[(0.8319745063781738, 'without'), (0.6469683647155762, 'middle'), (0.6072287559509277, 'mandi'), (0.6034027338027954, 'should'), (0.5961973071098328, 'such'), (0.5930907726287842, 'anti'), (0.5862780809402466, 'let'), (0.5781086087226868, 'ravish'), (0.5633074641227722, 'out'), (0.5371558666229248, 'there'), (0.5352479219436646, 'way'), (0.5222377181053162, 'help'), (0.5206180214881897, 'farmer'), (0.5202722549438477, 'need'), (0.5191149115562439, 'these'), (0.51839280128479, 'shameless'), (0.5153034925460815, 'bill'), (0.5109532475471497, 'all'), (0.508699357509613, 'many'), (0.5083184838294983, 'for')]
