In [477]:
import warnings
warnings.filterwarnings('ignore')
import multiprocessing
from gensim.models import Word2Vec
import numpy as np
import os

## Gensim GloVe word vectors

In [2]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file="glove.twitter.27B/glove.twitter.27B.25d.txt",
               word2vec_output_file="gensim_glove_vectors25.txt")

(1193514, 25)

In [26]:
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format("gensim_glove_vectors25.txt", binary=False)
vectors = glove_model.wv

In [482]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file="char-embeddings.txt",
               word2vec_output_file="gensim_char-embeddings.txt")

from gensim.models.keyedvectors import KeyedVectors
char_embeddings = KeyedVectors.load_word2vec_format("gensim_char-embeddings.txt", binary=False)
char_vectors = char_embeddings.wv

In [27]:
glove_model.wv.similar_by_word('nigga')

[('lil', 0.9541467428207397),
 ('bitch', 0.9520374536514282),
 ('mf', 0.9506527781486511),
 ('bruh', 0.9500958919525146),
 ('dawg', 0.9483038187026978),
 ('fuckin', 0.9452493786811829),
 ('shawty', 0.9435693025588989),
 ('ass', 0.9413027763366699),
 ('cuz', 0.9359691143035889),
 ('shit', 0.9340022802352905)]

In [28]:
vectors.similarity('nigga', 'alen')

0.02522685168689944

## Models

In [29]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
import itertools
from sklearn.ensemble import AdaBoostClassifier

In [30]:
def separate_data(X, y):
        X_win = X[y == 2]
        X_top10 = X[y == 1]
        X_rest = X[(y != 1) & (y != 2)]
        return X_win, X_top10, X_rest
    
def create_pairwise_data(Xs, ys):
        X_pairs = []
        y_pairs = []
        for X, y in zip(Xs, ys):
            X_win, X_top10, X_rest = separate_data(X, [y])

            for tweet_pair in itertools.product(X_win, X_top10):
                if random() > 0.5:
                    tweet_data = np.hstack((tweet_pair[0], tweet_pair[1]))
                    tweet_label = 1
                else:
                    tweet_data = np.hstack((tweet_pair[1], tweet_pair[0]))
                    tweet_label = 0

                X_pairs.append(tweet_data)
                y_pairs.append(tweet_label)

            for tweet_pair in itertools.product(X_top10, X_rest):
                if random() > 0.5:
                    tweet_data = np.hstack((tweet_pair[0], tweet_pair[1]))
                    tweet_label = 1
                else:
                    tweet_data = np.hstack((tweet_pair[1], tweet_pair[0]))
                    tweet_label = 0

                X_pairs.append(tweet_data)
                y_pairs.append(tweet_label)

            for tweet_pair in itertools.product(X_win, X_rest):
                if random() > 0.5:
                    tweet_data = np.hstack((tweet_pair[0], tweet_pair[1]))
                    tweet_label = 1
                else:
                    tweet_data = np.hstack((tweet_pair[1], tweet_pair[0]))
                    tweet_label = 0

                X_pairs.append(tweet_data)
                y_pairs.append(tweet_label)

        X = np.vstack(X_pairs)
        y = np.array(y_pairs)

        return X, y

In [31]:
class SelectionModel(object):
    def __init__(self):
        self.model = None

    def _create_classifier(self):
        self.model = Pipeline([
                      ('feature_selection', SelectFromModel(LinearSVC(penalty="l1"))),
                      ('classification', RandomForestClassifier())
                    ])
        self.model = LogisticRegression()

    def _fit(self, X, y):
        if self.model is None:
            self._create_classifier()
        self.model.fit(X, y)
        
    def _evaluate(self, X, y):
#         np.set_printoptions(threshold=sys.maxsize)
        y_pred = self.model.predict(X)
#         print(y)
#         print(y_pred)
        acc = accuracy_score(y, y_pred)
        self.results = {'accuracy': acc}
        
    def get_results(self):
        return self.results
    
    def run(self, Xs, ys, ht_list, ow_name = 'results'):
        num_hts = len(ys)
        for i in range(num_hts):
            print(str(100*i/num_hts)+'% done')
            
            X_test = Xs[i]
            y_test = ys[i]
            X_train = [*itertools.chain.from_iterable(Xs[:i] + Xs[i + 1:])]
            y_train = [*itertools.chain.from_iterable(ys[:i] + ys[i + 1:])]

            self._fit(X_train, y_train)
            self._evaluate(X_test, y_test)
            ht_result = self.get_results()
            print(ht_result)
            self.model = None
            
        print('100% done')

## XGBoost

In [523]:
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
from random import random

class XGBoost(object):
    def __init__(self):
        self.model = None
        self.acc = 0
        self.counter = 0

    def _create_classifier(self):
        self.model = XGBClassifier()
        
    def _fit(self, X, y):
        if self.model is None:
            self._create_classifier()
        self.model.fit(X, y)
        
#         plot_importance(self.model)
#         pyplot.show()
#         print(self.model.get_booster().get_score(importance_type = 'total_gain'))
        
    def _evaluate_pairwise(self, X, y):
        X_pairs = []
        y_pairs = []
        X_win, X_top10, X_rest = separate_data(X, y)
        
        count_corect = 0
        count_total = 0
        
        for tweet_pair in itertools.product(X_top10, X_rest):
            count_total += 1
            
            win_zeroclass_prob = self.model.predict_proba([tweet_pair[0]])[:,0]
            rest_zeroclass_prob = self.model.predict_proba([tweet_pair[1]])[:,0]
            
            if win_zeroclass_prob < rest_zeroclass_prob:
                count_corect += 1
                
        for tweet_pair in itertools.product(X_win, X_rest):
            count_total += 1
            
            win_zeroclass_prob = self.model.predict_proba([tweet_pair[0]])[:,0]
            rest_zeroclass_prob = self.model.predict_proba([tweet_pair[1]])[:,0]
            
            if win_zeroclass_prob < rest_zeroclass_prob:
                count_corect += 1
                
        for tweet_pair in itertools.product(X_win, X_top10):
            count_total += 1
            
            win_zeroclass_prob = self.model.predict_proba([tweet_pair[0]])[:,0]
            rest_zeroclass_prob = self.model.predict_proba([tweet_pair[1]])[:,0]
            
            if win_zeroclass_prob < rest_zeroclass_prob:
                count_corect += 1
                
        accuracy = count_corect / float(count_total)
                
        self.counter += 1
        self.acc += accuracy
                
        print('pairwise_accuracy', accuracy)
        
    def _evaluate(self, X, y):
#         np.set_printoptions(threshold=sys.maxsize)
        self._evaluate_pairwise(X, y)
#         y_pred = self.model.predict(X)
#         acc = accuracy_score(y, y_pred)
#         self.results = {'accuracy': acc}
        
    def get_results(self):
        return self.results
    
    def run(self, Xs, ys, ht_list):
        num_hts = len(ys)
        for i in range(num_hts):
            print(str(i / float(num_hts))+'% done')
            print(ht_list[i])
            
            X_test = np.array(Xs[i])
            y_test = np.array(ys[i])
            X_train = np.array([*itertools.chain.from_iterable(Xs[:i] + Xs[i + 1:])])
            y_train = np.array([*itertools.chain.from_iterable(ys[:i] + ys[i + 1:])])
            
#             y_test[y_test == 2] = 1
#             y_train[y_train == 2] = 1

            self._fit(X_train, y_train)
            self._evaluate(X_test, y_test)
#             ht_result = self.get_results()
#             print(ht_result)
            self.model = None
            
        print('100% done')
        print('Total acc: {}'.format(self.acc / float(self.counter)))

## Feature extraction

In [57]:
def averageSimilarity(tokens):
    counter = 0; suma = 0
    
    for w1, w2 in itertools.combinations(tokens, 2):
        w1 = w1.lower(); w2 = w2.lower()
        if w1 not in vectors.vocab or w2 not in vectors.vocab:
            continue
        suma += vectors.similarity(w1, w2)
        counter += 1
        
    if counter != 0:
        return suma /  counter
    else:
        return 0

In [275]:
def extractFeatures(tokens):
    
    words_num = len(tokens)
    chars_num = np.sum(np.array([len(token) for token in tokens]))
    unk_num = np.sum(np.array([1 for token in tokens if token.lower() not in vectors.vocab]))
    mean_words_len = chars_num / float(words_num)
    caps_num = np.sum(np.array([1 for token in tokens if token.isupper()]))
    average_similarity = averageSimilarity(tokens)
    
    return[words_num, chars_num, unk_num, caps_num, average_similarity, mean_words_len]

## Preprocessing

In [204]:
from nltk.tokenize import TweetTokenizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [35]:
def removeHashtag(arr, htf):
    regex = ('#' + htf.replace('_', '')).split('.tsv')[0]
    if regex in arr:
        arr.remove(regex)
    elif regex.lower() in arr:
        arr.remove(regex.lower())

In [36]:
import re

def convertHashtag(arr, htf):
    regex = ('#' + htf.replace('_', '')).split('.tsv')[0]
    text = re.findall('[A-Z][^A-Z]*', (htf.replace('_', '')).split('.tsv')[0])
    
    if regex in arr:
        arr.remove(regex)
        arr.extend(text)

In [551]:
def preprocessTweet(tweet, htf):
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    
    if '@midnight' in tokens: 
        tokens.remove('@midnight')
        
    removeHashtag(tokens, htf)
    
    return np.append(glove_char_embeddings(tokens), extractFeatures(tokens))
#     return np.array(extractFeatures(tokens))
#     return np.array(glove_char_embeddings(tokens))

In [223]:
def glove_vects( tokens ):
    
    arr = np.zeros(25)
    
    for tok in tokens:
        try:
            arr += vectors[tok]
        except:
            pass
        
    return np.array(arr)

In [441]:
def char_embeddings(tokens):
    
    tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
    tk.fit_on_texts(tokens)

    # -----------------------Skip part start--------------------------
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
    char_dict = {}
    for i, char in enumerate(alphabet):
        char_dict[char] = i + 1

    tk.word_index = char_dict.copy()
    tk.word_index[tk.oov_token] = max(char_dict.values()) + 1
    # -----------------------Skip part end----------------------------

    # Convert string to index
    sequences = tk.texts_to_sequences(tokens)

    # Padding
    input_size = 30
    data = pad_sequences(sequences, maxlen=input_size, padding='post')
#     return np.array(data, dtype='float32')

    # Convert to numpy array
    sentence = np.zeros(30)
    for word in data:
        sentence += np.array(word)
        
    return sentence

In [498]:
def glove_char_embeddings(tokens):
    arr = np.zeros(300)
    
    for word in tokens:
        for char in word:
            try:
                arr += char_vectors[char]
            except:
                pass
        
    return np.array(arr)

## Main

def load_document_chars(data_location, htf):
    tweets = []
    labels = []
    
    for line in open(os.path.join(data_location, htf)).readlines():
        line_split = line.strip().split('\t')
        tweets.append(line_split[1])
        labels.append(int(line_split[2]))

    Y = np.array(labels)
    X_vects = char_embeddings(tweets)

    return {'X_vects': X_vects, 'Y': Y}

In [315]:
def load_document(data_location, htf):
    tweets = []
    labels = []
    
    for line in open(os.path.join(data_location, htf)).readlines():
        line_split = line.strip().split('\t')
        tweets.append(line_split[1])
        labels.append(int(line_split[2]))

    Y = np.array(labels)
    X_vects = [preprocessTweet(tweet, htf) for tweet in tweets]

    return {'X_vects': X_vects, 'Y': Y}

In [440]:
def create_data(data_location):
    ht_files = sorted(os.listdir(data_location))

    Xs = []
    ys = []
    ht_list = []
    for htf in ht_files:
        ht_dict = load_document(data_location, htf)

        ht_list.append(htf)
        ys.append(ht_dict['Y'])
        Xs.append(ht_dict['X_vects'])

    return Xs, ys, ht_list

Xs, ys, ht_list = create_data('train_data')

pairs_num = 0; bigger_first = 0

for X, y in zip(Xs, ys):
    X_win, X_top10, X_rest = separate_data(np.array(X)[:, 29], y)
    
    for tweet_pair in itertools.product(X_top10, X_rest):
        
        if tweet_pair[0] == 0:
            continue
            
        pairs_num += 1
        if tweet_pair[0] < tweet_pair[1]:
            bigger_first += 1

print(bigger_first / pairs_num)

In [552]:
Xs, ys, ht_list = create_data('semi_data')
clf = XGBoost().run(Xs, ys, ht_list)

0.0% done
Canadian_Superheroes.tsv
pairwise_accuracy 0.621683093252464
0.06666666666666667% done
Comic_Book_Songs.tsv
pairwise_accuracy 0.5597964376590331
0.13333333333333333% done
Elderly_Movies.tsv
pairwise_accuracy 0.7243642329778507
0.2% done
Florida_A_Movie.tsv
pairwise_accuracy 0.6301040832666133
0.26666666666666666% done
Got_Fired_Because.tsv
pairwise_accuracy 0.6345609065155807
0.3333333333333333% done
Gritty_Seuss.tsv
pairwise_accuracy 0.5789473684210527
0.4% done
Hangover_Songs.tsv
pairwise_accuracy 0.5738858483189992
0.4666666666666667% done
Hip_Hop_Star_Wars.tsv
pairwise_accuracy 0.5987488829311886
0.5333333333333333% done
Holiday_Celebs.tsv
pairwise_accuracy 0.4844995571302037
0.6% done
Make_Baseball_Exciting.tsv
pairwise_accuracy 0.6318147871545929
0.6666666666666666% done
Nerd_Broadway.tsv
pairwise_accuracy 0.5295646523716699
0.7333333333333333% done
One_Letter_Off_Foods.tsv
pairwise_accuracy 0.5374310480693459
0.8% done
Political_Superheroes.tsv
pairwise_accuracy 0.8859