In [101]:
import string
import fasttext
import re
import numpy as np
import pandas as pd
import os
from nltk import TweetTokenizer
from nltk.tokenize.stanford import StanfordTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK "stopwords" corpus is available locally before it is read.
nltk.download('stopwords')
# English stop-word list, loaded once here and reused by tokenize().
cachedStopWords = stopwords.words("english")


[nltk_data] Downloading package stopwords to C:\Users\AJAY
[nltk_data]     BISWAS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Tokenization

In [102]:
def remove_punctuation(text, punct_list):
    """Blank out every occurrence of each entry of ``punct_list`` in ``text``.

    Arguments:
        - text: the string to clean
        - punct_list: iterable of substrings; each occurrence becomes one space

    Returns the cleaned string with leading and trailing whitespace removed.
    """
    cleaned = text
    for mark in punct_list:
        # str.replace leaves the string untouched when ``mark`` is absent,
        # so no separate membership test is required.
        cleaned = cleaned.replace(mark, ' ')
    return cleaned.strip()


def remove_html_tags(text):
    """Delete every ``<...>`` span from ``text`` and return the result.

    The match is non-greedy, so each ``<`` is paired with the nearest
    following ``>`` on the same line.
    """
    return re.sub('<.*?>', '', text)

def tokenize(sentence, to_lower=True, tknzr=TweetTokenizer()):
    """Normalise a raw comment into a space-separated string of content words.

    Arguments:
        - sentence: a string to be tokenized
        - to_lower: lowercasing or not
        - tknzr: a tokenizer implementing the NLTK tokenizer interface
          (the default instance is created once, when this function is defined)

    Returns:
        The cleaned sentence as a single string. Processing order:
        tokenize -> optional lowercase -> drop URLs and @mentions ->
        drop one-character words -> drop HTML tags -> replace punctuation
        with spaces -> drop English stop words.
    """
    sentence = sentence.strip()
    sentence = ' '.join(format_token(tok) for tok in tknzr.tokenize(sentence))
    if to_lower:
        sentence = sentence.lower()

    # Delete URLs outright (they are removed, not replaced by a placeholder).
    # Raw strings keep the regex escapes from being read as string escapes.
    sentence = re.sub(
        r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '', sentence)
    # Delete @mentions outright as well.
    sentence = re.sub(r'(\@[^\s]+)', '', sentence)

    # Remove single-character words.
    # NOTE(review): the punctuation stripping further down can split a word
    # and re-introduce one-character fragments; those are only dropped if they
    # happen to be stop words. Confirm whether that is intended.
    sentence = ' '.join(w for w in sentence.split() if len(w) > 1)

    sentence = remove_html_tags(sentence)
    sentence = remove_punctuation(sentence, list(string.punctuation))

    # Build the set once per call so each membership test is a hash lookup
    # rather than a scan of the stop-word list.
    stop_words = set(cachedStopWords)
    sentence = ' '.join(w for w in sentence.split() if w not in stop_words)
    return sentence

def format_token(token):
    """Translate a bracket placeholder token into the bracket it stands for.

    Recognised placeholders are '-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-'
    and '-RCB-'; any other token is returned unchanged.
    """
    bracket_for = {
        '-LRB-': '(',
        '-RRB-': ')',
        '-RSB-': ']',
        '-LSB-': '[',
        '-LCB-': '{',
        '-RCB-': '}',
    }
    return bracket_for.get(token, token)

## Nearest neighbor

In [103]:
def NN(model,line,K):
    """Ask ``model`` for the ``K`` nearest neighbours of ``line``.

    Arguments:
        - model: object exposing ``get_nearest_neighbors(query, k=...)``
          (in this notebook, the model returned by ``fasttext.load_model``)
        - line: the query string handed to the model as-is
        - K: how many neighbours to request

    Returns whatever ``model.get_nearest_neighbors`` returns.
    """
    neighbours = model.get_nearest_neighbors(line, k=K)
    return neighbours


### Cosine Similarity

In [104]:
def get_cosine_similarity(feature_vec_1, feature_vec_2):
    """Return the cosine similarity of two feature vectors as a single value.

    Each input is reshaped to one row (``reshape(1, -1)``) so that
    ``cosine_similarity`` compares exactly one sample against one sample;
    the only cell of the resulting matrix is returned.
    """
    row_1 = feature_vec_1.reshape(1, -1)
    row_2 = feature_vec_2.reshape(1, -1)
    similarity_matrix = cosine_similarity(row_1, row_2)
    return similarity_matrix[0][0]


### Cosine Similarity (reference: https://paulminogue.com/index.php/2019/09/29/introduction-to-cosine-similarity/)

## Main

### Load data

In [105]:
# Load the fastText model binary; the path is relative to the working directory.
MODEL_PATH = 'fp_bigrams_unsupervised.bin'
model = fasttext.load_model(MODEL_PATH)



In [106]:
# Load seed set
# The context manager closes the file even if reading raises.
with open("seed_set.txt", "r") as text_file:
    no_str = text_file.read()
# make a list: one entry per line of the file
seed_set = no_str.split("\n")
# A file that ends with a newline would otherwise contribute a trailing ''
# entry, i.e. an empty "comment" with no matching label. Only trailing blank
# entries are dropped, so the positions of all other lines are unchanged.
while seed_set and not seed_set[-1].strip():
    seed_set.pop()


In [107]:
# Load corpus
# The context manager closes the file even if reading raises.
with open("corpus.txt", "r") as text_file2:
    no_str2 = text_file2.read()
# make a list: one entry per line of the file
corpus = no_str2.split("\n")
# A file that ends with a newline would otherwise contribute a trailing ''
# entry, i.e. an empty "comment". Only trailing blank entries are dropped,
# so the positions of all other lines are unchanged.
while corpus and not corpus[-1].strip():
    corpus.pop()

In [108]:
# labels for the 10 seed-set comments: five 1s followed by five 0s
Y = [1] * 5 + [0] * 5

### Nearest Neighbor sets

In [109]:
# nearest neighbors for every seed-set comment, setting k = 20
NN_seed_set = [NN(model, tokenize(comment), 20) for comment in seed_set]

In [110]:
# nearest neighbors for every unlabeled corpus comment, setting k = 20
NN_corpus = [NN(model, tokenize(comment), 20) for comment in corpus]

### NN Sampling by finding distance

In [111]:
# Inspect the neighbour list obtained for the first seed-set comment.
print(NN_seed_set[0])

[(0.9190638065338135, 'protest'), (0.9121805429458618, 'protests'), (0.8388497829437256, 'farmers'), (0.7637169361114502, 'stop'), (0.7555079460144043, 'propaganda'), (0.7416914701461792, 'protesting'), (0.7286685705184937, 'back'), (0.7275574207305908, 'bill'), (0.7145411968231201, 'farmer'), (0.6960648894309998, 'still'), (0.6925775408744812, 'farm'), (0.6840285062789917, 'law'), (0.680992603302002, 'live'), (0.6618435382843018, '</s>'), (0.6589888334274292, 'jai'), (0.6566498875617981, 'rich'), (0.652950644493103, 'modi'), (0.6461869478225708, 'ram'), (0.6447934508323669, 'name'), (0.6335991621017456, 'terrorists')]


In [112]:
# Inspect the neighbour list obtained for the first corpus comment.
print(NN_corpus[0])

[(0.9034309387207031, 'without'), (0.8403633236885071, 'months'), (0.8322609663009644, 'way'), (0.8184826374053955, 'mandi'), (0.8161060810089111, 'need'), (0.8135474324226379, 'another'), (0.8135462403297424, 'farmers'), (0.7662057876586914, 'middle'), (0.7607018947601318, 'win'), (0.7604532241821289, 'still'), (0.7437332272529602, 'would'), (0.7283453345298767, 'bill'), (0.7243574261665344, 'rights'), (0.7215253710746765, 'fighting'), (0.7155255675315857, 'government'), (0.708303689956665, 'support'), (0.7049741744995117, 'lol'), (0.7008495330810547, 'hope'), (0.6988953948020935, 'due'), (0.6880658864974976, 'man')]
