In [75]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin
import string
import fasttext
import math
import copy
import re
import pandas as pd
import os
from nltk import TweetTokenizer
from nltk.tokenize.stanford import StanfordTokenizer
from scipy import spatial
from scipy.spatial import distance
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stopword corpus is available locally before it is read.
nltk.download('stopwords')
# English stopword list, loaded once here and reused by tokenize().
cachedStopWords = stopwords.words("english")
# Precompiled pattern for HTML-like tags: '<', the shortest possible run of
# characters, then '>'. Used by remove_html_tags().
CLEANR = re.compile('<.*?>')

[nltk_data] Downloading package stopwords to C:\Users\AJAY
[nltk_data]     BISWAS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [76]:
def remove_punctuation(text, punct_list):
    """Replace every occurrence of each entry of ``punct_list`` with a space.

    Arguments:
        - text: the string to clean
        - punct_list: iterable of substrings to blank out, applied in order

    Returns the resulting string with leading/trailing whitespace stripped.
    Runs of spaces left behind inside the string are not collapsed.
    """
    blanked = text
    for symbol in punct_list:
        # str.replace already returns the string unchanged when there is no match
        blanked = blanked.replace(symbol, ' ')
    return blanked.strip()


def remove_html_tags(raw_html):
    """Return ``raw_html`` with every match of the module-level ``CLEANR`` pattern deleted."""
    return CLEANR.sub('', raw_html)

def tokenize(sentence, to_lower=True, tknzr=TweetTokenizer()):
    """Normalise one raw comment into a space-separated string of words.

    Arguments:
        - sentence: a string to be tokenized
        - to_lower: lowercasing or not
        - tknzr: a tokenizer implementing the NLTK tokenizer interface
          (the default instance is created once, when the function is
          defined, and shared by every call)

    Processing order: tokenize and re-join with single spaces, optionally
    lowercase, delete URLs and @mentions, drop one-character words, delete
    HTML-like tags, replace punctuation with spaces, drop English stopwords.

    Returns the cleaned string. It can be empty when nothing survives the
    filters. Stopwords are compared against the words exactly as they are
    at that point, so with ``to_lower=False`` capitalised words are not
    matched by a lowercase stopword entry.
    """
    sentence = sentence.strip()
    sentence = ' '.join(format_token(tok) for tok in tknzr.tokenize(sentence))
    if to_lower:
        sentence = sentence.lower()

    # Delete URLs. Raw string literals keep '\.' and '\s' as regex escapes
    # instead of (invalid) Python string escapes.
    sentence = re.sub(
        r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '', sentence)
    # Delete @user mentions.
    sentence = re.sub(r'(\@[^\s]+)', '', sentence)

    # Remove single letter words. This runs before punctuation is stripped,
    # so one-character fragments produced by that later step are not
    # removed here.
    sentence = ' '.join([w for w in sentence.split() if len(w) > 1])

    sentence = remove_html_tags(sentence)
    sentence = remove_punctuation(sentence, list(string.punctuation))
    sentence = ' '.join([word for word in sentence.split()
                        if word not in cachedStopWords])
    return sentence

def format_token(token):
    """Turn a bracket placeholder token into the bracket character it stands for.

    The six placeholders '-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-' and
    '-RCB-' map to '(', ')', '[', ']', '{' and '}' respectively. Any other
    token is returned unchanged. Matching is exact and case-sensitive.
    """
    bracket_for = {
        '-LRB-': '(',
        '-RRB-': ')',
        '-RSB-': ']',
        '-LSB-': '[',
        '-LCB-': '{',
        '-RCB-': '}',
    }
    return bracket_for.get(token, token)

In [77]:
def cleaned(filename):
    """Read a text file holding one comment per line and tokenize every line.

    Arguments:
        - filename: path of the text file to read

    Returns a list of strings, one per line of the file, each passed
    through ``tokenize``. Blank lines inside the file are kept (they become
    empty strings) so that positions still line up with a parallel labels
    list. The single empty entry produced by a newline at the very end of
    the file is dropped, so a file with and without a final newline gives
    the same result.
    """
    # 'with' closes the handle even if read() raises.
    with open(filename, "r") as text_file:
        raw_text = text_file.read()

    raw_lines = raw_text.split("\n")
    # "a\nb\n".split("\n") ends with '' - that is the file terminator, not a comment.
    if raw_lines[-1] == "":
        raw_lines.pop()

    return [tokenize(comment) for comment in raw_lines]

def load_labels(filename):
    """Read integer labels from a text file with one label per line.

    Arguments:
        - filename: path of the labels file

    Returns a list of ints in file order. A newline at the very end of the
    file is tolerated; an empty file yields an empty list.

    Raises ValueError if any other line (including a blank line in the
    middle of the file) is not a valid integer.
    """
    # 'with' closes the handle even if read() raises.
    with open(filename, "r") as text_file:
        raw_text = text_file.read()

    entries = raw_text.split("\n")
    # "0\n1\n".split("\n") ends with '' - drop it instead of failing on int('').
    if entries[-1] == "":
        entries.pop()

    return [int(entry) for entry in entries]

        

In [78]:
class FastTextTransformer(BaseEstimator, TransformerMixin):
    """ Convert texts into their mean fastText vectors.

    ``model`` is looked up as ``model[word]`` for every whitespace-separated
    word of a text; the per-word vectors are averaged into one vector per
    text.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Embed each text of ``X`` as the mean of its word vectors.

        Arguments:
            - X: iterable of strings

        Returns a 2-D array with one row per text. A text with no words
        (empty or whitespace-only) is mapped to an all-zero row; averaging
        an empty list would otherwise give a scalar NaN that cannot be
        stacked with the real vectors.

        Raises ValueError (from ``np.stack``) when ``X`` is empty.
        """
        means = []
        for text in X:
            words = text.split()
            # None marks a text that has no words to average.
            means.append(
                np.mean([self.model[w] for w in words], 0) if words else None
            )

        if any(vec is None for vec in means):
            blank = self._blank_vector(means)
            means = [blank if vec is None else vec for vec in means]
        return np.stack(means)

    def _blank_vector(self, means):
        """Zero vector shaped like the embeddings, used for texts without words."""
        for vec in means:
            if vec is not None:
                return np.zeros_like(vec)
        # No text produced a vector to copy the shape from, so ask the model.
        # fastText models expose get_dimension(); other model types would
        # need an equivalent - TODO confirm if a different model is plugged in.
        return np.zeros(self.model.get_dimension(), dtype=np.float32)

In [79]:
def classify(small_model, predictor, lines, Y):
    """Fit a two-step pipeline: mean fastText embedding followed by ``predictor``.

    Arguments:
        - small_model: embedding model handed to ``FastTextTransformer``
        - predictor: estimator used as the final pipeline step
        - lines: training texts
        - Y: training labels, passed to ``fit`` together with ``lines``

    Returns the fitted pipeline.
    """
    embedder = FastTextTransformer(model=small_model)
    pipeline = make_pipeline(embedder, predictor)
    return pipeline.fit(lines, Y)


In [80]:
# Load the two saved fastText models (paths are relative to the working
# directory). The N_2 / N_3 suffix pairs each model with the expanded seed
# set of the same suffix when the classifiers are fitted further down.
model_N_2 = fasttext.load_model('fp_bigrams_unsupervised_N_2.bin')
model_N_3 = fasttext.load_model('fp_bigrams_unsupervised_N_3.bin')



In [81]:
# Load the seed set: one tokenized text per line of the file.
seed_set = cleaned("seed_set.txt")
# Load the seed labels: one integer per line, used as the targets for seed_set.
Y = load_labels("seed_set_labels.txt")

# Load the expanded seed set for the N_2 model, with its own labels file.
seed_set_expanded_N_2 = cleaned("seed_set_expanded_N_2.txt")
Y_N_2 = load_labels("seed_set_expanded_labels_N_2.txt")

# Load the expanded seed set for the N_3 model, with its own labels file.
seed_set_expanded_N_3 = cleaned("seed_set_expanded_N_3.txt")
Y_N_3 = load_labels("seed_set_expanded_labels_N_3.txt")

In [82]:
# Load the testing set, cleaned with the same tokenize() steps as the
# training texts. No labels are loaded for it in this notebook.
testing = cleaned("testing.txt")

In [83]:
# Classification: fit one logistic-regression pipeline per
# (embedding model, training set) combination.
classifier_N_2_seed_set = classify(
    model_N_2, LogisticRegression(), seed_set, Y)
classifier_N_3_seed_set = classify(
    model_N_3, LogisticRegression(), seed_set, Y)
classifier_N_2_expanded_set = classify(
    model_N_2, LogisticRegression(), seed_set_expanded_N_2, Y_N_2)
classifier_N_3_expanded_set = classify(
    model_N_3, LogisticRegression(), seed_set_expanded_N_3, Y_N_3)

In [84]:
# Predicted labels for the testing set: N_2 model, trained on the original seed set.
classifier_N_2_seed_set.predict(testing)

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2,
       2, 0, 1, 1, 1, 2, 2, 2])

In [85]:
# Predicted labels for the testing set: N_3 model, trained on the original seed set.
classifier_N_3_seed_set.predict(testing)

array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2,
       2, 1, 1, 1, 1, 2, 2, 2])

In [86]:
# Predicted labels for the testing set: N_2 model, trained on the N_2 expanded seed set.
classifier_N_2_expanded_set.predict(testing)

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 2,
       2, 0, 1, 1, 1, 2, 2, 2])

In [87]:
# Predicted labels for the testing set: N_3 model, trained on the N_3 expanded seed set.
classifier_N_3_expanded_set.predict(testing)

array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 1, 1, 1, 1, 0, 2, 2])