In [389]:
import warnings
warnings.filterwarnings('ignore')
import multiprocessing
from gensim.models import Word2Vec
import numpy as np
from os import listdir
from os.path import isfile, join

## Gensim GloVe word vectors

In [390]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file="glove.twitter.27B/glove.twitter.27B.25d.txt",
               word2vec_output_file="gensim_glove_vectors50.txt")

(1193514, 25)

In [391]:
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("gensim_glove_vectors50.txt", binary=False)

In [450]:
glove_model.wv.similar_by_word('hashtag')

[('twitter', 0.8885936141014099),
 ('instagram', 0.8861387968063354),
 ('tweet', 0.8799343109130859),
 ('tweets', 0.8788758516311646),
 ('youtube', 0.8644436597824097),
 ('facebook', 0.8608332276344299),
 ('snapchat', 0.8595442175865173),
 ('chat', 0.8593137860298157),
 ('insta', 0.8528122305870056),
 ('spam', 0.8513364791870117)]

In [393]:
vectors = glove_model.wv

## Model class

In [394]:
import os
import sys
import itertools
from random import random
import csv
from collections import defaultdict
from sklearn.svm import SVC
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [496]:
class SupervisedExpRunner(object):
    def __init__(self):
        self.model = None

    def _create_classifier(self):
        self.model = AdaBoostClassifier(n_estimators=50)

    def _fit(self, X, y):
        if self.model is None:
            self._create_classifier()

        self.model.fit(X, y)
        
    def _separate_data(self, X, y):
        X_win = X[y == 2]
        X_top10 = X[y == 1]
        X_rest = X[(y != 1) & (y != 2)]
        return X_win, X_top10, X_rest
        
    def _evaluate(self, X, y):
        np.set_printoptions(threshold=sys.maxsize)
        y_pred = self.model.predict(X)
#         print(y_pred)
#         print(y)
        acc = accuracy_score(y, y_pred)
        self.results = {'accuracy': acc}
        
    def _create_pairwise_data(self, Xs, ys):
        X_pairs = []
        y_pairs = []
        for X, y in zip(Xs, ys):
            X_win, X_top10, X_rest = self._separate_data(X, [y])

            for tweet_pair in itertools.product(X_win, X_top10):
                if random() > 0.5:
                    tweet_data = np.hstack((tweet_pair[0], tweet_pair[1]))
                    tweet_label = 1
                else:
                    tweet_data = np.hstack((tweet_pair[1], tweet_pair[0]))
                    tweet_label = 0

                X_pairs.append(tweet_data)
                y_pairs.append(tweet_label)

            for tweet_pair in itertools.product(X_top10, X_rest):
                if random() > 0.5:
                    tweet_data = np.hstack((tweet_pair[0], tweet_pair[1]))
                    tweet_label = 1
                else:
                    tweet_data = np.hstack((tweet_pair[1], tweet_pair[0]))
                    tweet_label = 0

                X_pairs.append(tweet_data)
                y_pairs.append(tweet_label)

            for tweet_pair in itertools.product(X_win, X_rest):
                if random() > 0.5:
                    tweet_data = np.hstack((tweet_pair[0], tweet_pair[1]))
                    tweet_label = 1
                else:
                    tweet_data = np.hstack((tweet_pair[1], tweet_pair[0]))
                    tweet_label = 0

                X_pairs.append(tweet_data)
                y_pairs.append(tweet_label)
                
        X = np.vstack(X_pairs)
        y = np.array(y_pairs)

        return X, y

    def get_results(self):
        return self.results
    
    def run_loo_exp(self, Xs, ys, ht_list, ow_name = 'results'):
        out_file = open(ow_name+'.csv', 'w')
        ow = csv.writer( out_file )
        micro_sum = 0
        total_pairs = 0
        num_hts = len(ys)
        for i in range(num_hts):
            print(str(100*i/num_hts)+'% done')
            
#             X_test = Xs[i]
#             y_test = ys[i]
#             X_train = [*itertools.chain.from_iterable(Xs[:i] + Xs[i + 1:])]
#             y_train = [*itertools.chain.from_iterable(ys[:i] + ys[i + 1:])]

            Xs_test = [Xs[i]]
            ys_test = [ys[i]]
            Xs_train = Xs[:i] + Xs[i+1:]
            ys_train = ys[:i] + ys[i+1:]
            
            X_train, y_train = self._create_pairwise_data(Xs_train, ys_train)
            X_test, y_test = self._create_pairwise_data(Xs_test, ys_test)

            self._fit(X_train, y_train)
            self._evaluate(X_test, y_test)
            ht_result = self.get_results()
            ow.writerow([ht_list[i], str(ht_result['accuracy'])])
            print(ht_result)
            micro_sum += ht_result['accuracy'] * y_test.shape[0]
            total_pairs += y_test.shape[0]
            self.model = None
            
        out_file.close()
        print('100% done')
        return micro_sum / total_pairs

## Functions

In [396]:
def create_data(data_location):
    ht_files = sorted(os.listdir(data_location))

    Xs = []
    ys = []
    ht_list = []
    for htf in ht_files:
        ht_dict = load_hashtag(data_location,htf)

        ht_list.append(htf)
        ys.append(ht_dict['Y'])
        Xs.append(ht_dict['X_bow'])

    return Xs, ys, ht_list

In [397]:
def load_hashtag(data_location, htf):
    tweets = []
    labels = []
    for line in open(os.path.join(data_location,htf)).readlines():
        line_split = line.strip().split('\t')
        tweets.append(line_split[1])
        labels.append(int(line_split[2]))

    Y = np.array(labels)
    X_bow = [create_glove_sent(tweet) for tweet in tweets]

    return {'X_bow':X_bow,'Y':Y}

In [398]:
def create_bow_rep( in_tweet ):
    bow_map = defaultdict(int)
    tokens = in_tweet.split()
    for tok in tokens:
        bow_map[tok.lower()] += 1
    return bow_map

In [399]:
def create_glove_sent( in_tweet ):
    tokens = in_tweet.split()
    
    arr = np.zeros(25)
    
    for tok in tokens:
        try:
            arr += vectors[tok]
        except:
            pass
    return arr

## Main

In [530]:
Xs, ys, ht_list = create_data('trial_data')

exp_runner = SupervisedExpRunner()
outwriter_name = 'results_sem'
results = exp_runner.run_loo_exp(Xs, ys, ht_list, outwriter_name)
print('micro-avergae accuracy:',results)

0.0% done
{'accuracy': 0.5376}
20.0% done
{'accuracy': 0.6218666666666667}
40.0% done
{'accuracy': 0.5781333333333334}
60.0% done
{'accuracy': 0.6058666666666667}
80.0% done
{'accuracy': 0.4928}
100% done
micro-avergae accuracy: 0.5672533333333334
