In [None]:
# !pip install ekphrasis
# !pip install xgboost
# !pip install textblob
# !pip install keras
# !pip install tensorflow
# !pip install gensim==3.8.3
# !pip install scikit-multilearn
# !pip install emoji

In [None]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd
import numpy as np
import emoji
import xgboost, textblob, string, ekphrasis, nltk, re

#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')

from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from xgboost.sklearn import XGBClassifier

from sklearn.svm import NuSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

from ekphrasis.classes.spellcorrect import SpellCorrector
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

import gensim
from gensim.models import Word2Vec

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from gensim.models.doc2vec import TaggedDocument

# ekphrasis spell corrector built with corpus="english".
# NOTE(review): `sp` is not referenced again in the visible part of the notebook — confirm it is still needed.
sp = SpellCorrector(corpus="english") 

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, ret_preds = False, 
                ret_probas = False, ret_all = False, is_neural_net=False, probs = True, cross_val = False, valid = True):
    """Fit ``classifier`` and predict on ``feature_vector_valid``.

    Scoring reads the notebook globals ``valid_y`` and ``col`` (the label
    column currently being modelled), so ``col`` must be set before calling
    with ``valid=True`` or when a confusion matrix is returned.

    Args:
        classifier: estimator exposing ``fit``/``predict`` (and
            ``predict_proba`` when ``probs`` is true).
        feature_vector_train: training features.
        label: training labels.
        feature_vector_valid: features to predict on.
        ret_preds: return only the predictions (checked first).
        ret_probas: return only the positive-class probabilities (checked second).
        ret_all: return ``(f1, confusion_matrix, predictions, probas)``.
        is_neural_net: predictions are per-class scores; reduce them with ``argmax``.
        probs: call ``predict_proba`` and keep the second column.
        cross_val: score with ``cross_val_scores`` instead of the hold-out F1.
        valid: compute a score at all; pass ``False`` when predicting on
            data that has no labels.

    Returns:
        Depending on the flags: predictions, probabilities, or a tuple that
        starts with ``(f1, confusion_matrix)``. ``f1`` is ``None`` when
        ``valid`` is false and ``probas`` is ``None`` when ``probs`` is false.

    Raises:
        ValueError: ``ret_probas`` was requested while ``probs`` is false.
    """
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    # Collapse per-class scores to class ids *before* scoring; the metrics
    # below expect one label per sample.
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    f1 = None
    if valid:
        if cross_val:
            f1 = cross_val_scores(classifier, feature_vector_train)
        else:
            f1 = metrics.f1_score(valid_y[col], predictions)

    probas = None
    if probs:
        probas = [i[1] for i in classifier.predict_proba(feature_vector_valid)]

    if ret_preds:
        return predictions
    if ret_probas:
        if probas is None:
            raise ValueError("ret_probas=True requires probs=True")
        return probas

    confusion = metrics.confusion_matrix(valid_y[col], predictions)
    if ret_all:
        return (f1, confusion, predictions, probas)
    return (f1, confusion)

def get_class(c, cutoff):
    """Binarise scores: 1 where a score is strictly above ``cutoff``, otherwise 0."""
    labels = []
    for score in c:
        labels.append(1 if score > cutoff else 0)
    return labels

In [None]:
def cross_val_scores(model, x, verbose=False):
    """Return the mean 10-fold cross-validated test F1 of ``model`` on ``x``.

    Labels are taken from the notebook globals ``train_y[col]``. With
    ``verbose`` the mean train/test F1 and accuracy are printed as well.
    """
    cv_results = cross_validate(model, x, train_y[col], cv=10, scoring=('f1', 'accuracy'), return_train_score=True)
    mean_test_f1 = np.mean(cv_results['test_f1'])

    if verbose:
        report_lines = (
            '------------------------------------',
            type(model).__name__,
            'Test f1: {}'.format(mean_test_f1),
            'Train f1: {}'.format(np.mean(cv_results['train_f1'])),
            'Test accuracy: {}'.format(np.mean(cv_results['test_accuracy'])),
            'Train accuracy: {}'.format(np.mean(cv_results['train_accuracy'])),
        )
        for line in report_lines:
            print(line)

    return mean_test_f1

In [None]:
# Tweet data for the annotated tweets; the first CSV column becomes the index.
tweets = pd.read_csv('Data/annotated_tweets_data.csv', index_col=0)
# NOTE(review): despite the name `tweets_to_train`, this is 'tweets_to_predict.csv' —
# later cells predict categories for these rows rather than train on them.
tweets_to_train = pd.read_csv('Data/tweets_to_predict.csv')
# Semicolon-delimited file that is merged into `tweets` on 'status_id' below
# (presumably the human annotations — confirm).
answers = pd.read_csv('Data/annotated_tweets.csv', delimiter=';', index_col=0)

## Part 1: Text preprocessing

* Select features
* Spell correction
* Normalize text
* Extract domains from url

In [None]:
# Keep only the tweet fields used downstream, then attach the rows of
# `answers` that share a status_id (inner join drops unmatched tweets).
to_keep = ['status_id', 'created_at', 'text', 'favorite_count', 'retweet_count']
tweets = tweets.loc[:, to_keep].merge(answers, how='inner', on='status_id')
tweets.info()

In [None]:
keywords = pd.read_excel("Data/keywords.xlsx")

# One keyword list per category; empty spreadsheet cells (NaN) are dropped.
politics = list(keywords['Politics'].dropna())
travel = list(keywords['Travel'].dropna())
personal = list(keywords['Personal impact'].dropna())
health = list(keywords['Health'].dropna())
economic = list(keywords['Economic impact'].dropna())

In [None]:
# ekphrasis pipeline applied to every raw tweet before feature extraction.
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 
        'time', 'date', 'number'],
    
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

# pre_process_doc returns a token list; the tokens are re-joined with single
# spaces so `corrected_text` stays a plain string column.
tweets['corrected_text'] = [" ".join(text_processor.pre_process_doc(s)) for s in tweets.text]

In [None]:
import nltk

# NLTK resources used by the cleaning cell below: the POS tagger (pos_tag),
# WordNet (lemmatizer), punkt (word/sentence tokenizers) and the stopword
# list. 'stopwords' was missing here, so nltk.corpus.stopwords.words()
# failed with a LookupError on a machine that had never downloaded it.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# English stopwords removed from tweets during cleaning.
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem #stemmer
lem = nltk.stem.WordNetLemmatizer().lemmatize #lemmatizer
# Characters stripped from tweets (used inside a regex character class).
# The last character is the bullet U+2022; it was previously stored as the
# mis-decoded sequence "â€¢", which stripped 'â', '€' and '¢' instead.
my_punctuation = '!"$%&\'()*+,-./:;=?[\\]^_`{|}~\u2022'


def get_word_and_tag(tokens):
    """POS-tag ``tokens`` and map each tag to a WordNet part-of-speech code.

    Tags starting with "NN" become 'n', tags starting with 'VB' become 'v',
    and every other tag becomes 'a'. Returns a list of ``(word, pos)`` pairs.
    """
    def _wordnet_pos(tag):
        if tag.startswith("NN"):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    return [(word, _wordnet_pos(tag)) for word, tag in pos_tag(tokens)]

def clean_tweet_word2vec(tweet, bigrams=False):
    """Normalise a tweet into a space-separated string of lemmatised tokens.

    Steps: lower-case, replace emoji with their text names, replace the
    characters in ``my_punctuation`` with spaces, collapse whitespace, drop
    digits, remove stopwords and empty tokens, then lemmatise each token
    using the part of speech from ``get_word_and_tag``.

    Args:
        tweet: text to clean.
        bigrams: when true, append ``first_second`` tokens for every pair of
            adjacent lemmas after the unigrams.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    tweet = tweet.lower() # lower case
    tweet = emoji.demojize(tweet)
    tweet = re.sub('[' + my_punctuation + ']+', ' ', tweet) # strip punctuation
    # Raw strings: '\s' in a plain literal is an invalid escape sequence.
    tweet = re.sub(r'\s+', ' ', tweet) # remove double spacing
    tweet = re.sub(r'[0-9]+', '', tweet) # remove numbers

    # Removing digits can leave empty tokens behind, so drop those together
    # with the stopwords.
    tokens = [word for word in tweet.split(' ') if word and word not in my_stopwords]

    tokens = [lem(word, tag) for word, tag in get_word_and_tag(tokens)] # lemmatise
    if bigrams:
        tokens = tokens + [first + '_' + second for first, second in zip(tokens, tokens[1:])]
    return ' '.join(tokens)

# Cleaned, lemmatised text (unigrams plus adjacent-word bigrams) used for the
# count vectors and the Word2Vec / Doc2Vec features.
tweets['corrected_text_w2v'] = tweets['corrected_text'].apply(clean_tweet_word2vec, bigrams=True)

In [None]:
# NLP features
tweets['char_count'] = tweets['corrected_text'].apply(len)
tweets['word_count'] = tweets['corrected_text'].apply(lambda x: len(x.split()))
tweets['word_density'] = tweets['char_count'] / (tweets['word_count']+1)
tweets['punctuation_count'] = tweets['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
tweets['title_word_count'] = tweets['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
tweets['upper_case_word_count'] = tweets['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [None]:
# Keyword features for the annotated tweets: per category, the number of
# keyword occurrences in the lower-cased raw text (substring matches) and that
# count divided by the tweet's word count.
kw_count_cols = ['Politics_kw', 'Health_kw', 'Personal_kw', 'Economic_kw', 'Travel_kw']
kw_perc_cols = ['Politics_perc', 'Health_perc', 'Personal_perc', 'Economic_perc', 'Travel_perc']
kw_cols = kw_count_cols + kw_perc_cols

lowered_text = tweets['text'].str.lower()
# Tweets whose normalised text has no words would divide by zero; mask them
# here and report a ratio of 0 for them instead.
nonzero_word_count = tweets['word_count'].where(tweets['word_count'] > 0)

for kw_list, dim, perc in zip([politics, health, personal, economic, travel], kw_count_cols, kw_perc_cols):
    lowered_kws = [kw.lower() for kw in kw_list]
    # Whole-column computation instead of writing cell by cell with iterrows/.loc.
    hits = lowered_text.apply(lambda doc: sum(doc.count(kw) for kw in lowered_kws)).astype(float)
    tweets[dim] = hits
    tweets[perc] = (hits / nonzero_word_count).fillna(0.0)

## Read unannotated tweets

In [None]:
# Same preprocessing as for the annotated tweets, so both sets share one
# feature space.
tweets_to_train['corrected_text'] = [" ".join(text_processor.pre_process_doc(s)) for s in tweets_to_train.text]
# bigrams=True matches the cleaning applied to the annotated tweets; without
# it the models were trained on unigram+bigram text but predicted on
# unigram-only text.
tweets_to_train['corrected_text_w2v'] = tweets_to_train['corrected_text'].apply(clean_tweet_word2vec, bigrams=True)

In [None]:
# NLP features
tweets_to_train['char_count'] = tweets_to_train['corrected_text'].apply(len)
tweets_to_train['word_count'] = tweets_to_train['corrected_text'].apply(lambda x: len(x.split()))
tweets_to_train['word_density'] = tweets_to_train['char_count'] / (tweets_to_train['word_count']+1)
tweets_to_train['punctuation_count'] = tweets_to_train['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
tweets_to_train['title_word_count'] = tweets_to_train['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
tweets_to_train['upper_case_word_count'] = tweets_to_train['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [None]:
# Keyword features for the unannotated tweets: per category, the number of
# keyword occurrences in the lower-cased raw text (substring matches) and that
# count divided by the tweet's word count.
lowered_text_new = tweets_to_train['text'].str.lower()
# Tweets whose normalised text has no words would divide by zero; mask them
# here and report a ratio of 0 for them instead.
nonzero_word_count_new = tweets_to_train['word_count'].where(tweets_to_train['word_count'] > 0)

for kw_list, dim, perc in zip([politics, health, personal, economic, travel],
                              ['Politics_kw', 'Health_kw', 'Personal_kw', 'Economic_kw', 'Travel_kw'],
                              ['Politics_perc', 'Health_perc', 'Personal_perc', 'Economic_perc', 'Travel_perc']):
    lowered_kws = [kw.lower() for kw in kw_list]
    # Whole-column computation instead of writing cell by cell with iterrows/.loc.
    hits = lowered_text_new.apply(lambda doc: sum(doc.count(kw) for kw in lowered_kws)).astype(float)
    tweets_to_train[dim] = hits
    tweets_to_train[perc] = (hits / nonzero_word_count_new).fillna(0.0)

In [None]:
import random
random.seed(1)

# Label columns, one binary target per category.
categories = ['Politics', 'Health', 'Economic', 'Personal', 'Travel'] 

# Numeric features first, the three text columns last: later code selects the
# numeric part with cols[:-3], so the order matters.
nlp_feature_cols = ['Politics_kw', 'Health_kw', 'Personal_kw', 'Economic_kw', 'Travel_kw',
                    'char_count', 'word_count', 'word_density',
                    'punctuation_count', 'title_word_count', 'upper_case_word_count',
                    'Politics_perc', 'Health_perc', 'Personal_perc', 'Economic_perc', 'Travel_perc']
text_cols = ['text', 'corrected_text', 'corrected_text_w2v']
cols = nlp_feature_cols + text_cols

# 80/20 hold-out split of the annotated tweets, reproducible via random_state.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    tweets[cols],
    tweets[categories],
    test_size = 0.2,
    random_state=2,
)

## Feature engineering

* Count Vectors as features
* TF-IDF Vectors as features
    * Word level
    * N-Gram level
    * Character level (useless)
* Word Embeddings as features
* Text / NLP based features
* Dimensionality reduction topic models
* Word embeddings
    * Word2Vec
    * Doc2Vec

In [None]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
count_vect.fit(tweets_to_train.corrected_text_w2v)

x_count = count_vect.transform(tweets_to_train.corrected_text_w2v)
xtrain_count =  count_vect.transform(train_x.corrected_text_w2v)
xvalid_count =  count_vect.transform(valid_x.corrected_text_w2v)

In [None]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(tweets_to_train.corrected_text)

x_tfidf = tfidf_vect.transform(tweets_to_train.corrected_text)
xtrain_tfidf =  tfidf_vect.transform(train_x.corrected_text)
xvalid_tfidf =  tfidf_vect.transform(valid_x.corrected_text)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(tweets_to_train.corrected_text)

x_tfidf_ngram = tfidf_vect_ngram.transform(tweets_to_train.corrected_text)
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x.corrected_text)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x.corrected_text)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(tweets_to_train.corrected_text)

x_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(tweets_to_train.corrected_text)
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x.corrected_text) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x.corrected_text)

### Word2Vec

In [None]:
# Token lists per tweet; the Word2Vec corpus is every tweet we have —
# unannotated first, then the train and validation splits.
train_tokenized = list(map(word_tokenize, train_x.corrected_text_w2v))
valid_tokenized = list(map(word_tokenize, valid_x.corrected_text_w2v))

all_text = list(map(word_tokenize, tweets_to_train.corrected_text_w2v)) + train_tokenized + valid_tokenized

In [None]:
# Word2Vec model configured without a corpus; the vocabulary is built and the
# model trained explicitly below.
# NOTE(review): no `seed` argument is passed, so results are not pinned to a seed here.
w2v_model = Word2Vec(min_count=10,window=5, sample=6e-5, alpha=0.03,  min_alpha=0.0007, negative=20)

# Vocabulary comes from every tokenised tweet collected in `all_text`.
w2v_model.build_vocab(all_text)

# 30 passes over the same corpus that built the vocabulary.
w2v_model.train(all_text, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

In [None]:
def word_averaging(wv, words):
    """Average the embeddings of ``words`` into one unit-length vector.

    Args:
        wv: trained Word2Vec model; its keyed vectors are read from ``wv.wv``.
        words: iterable of tokens. Items that are already ``np.ndarray``
            vectors are used as-is; tokens missing from the vocabulary are
            skipped.

    Returns:
        A float32 unit vector, or an all-zero vector of length
        ``wv.wv.vector_size`` when no item contributes a vector.
    """
    keyed_vectors = wv.wv
    vectors = []

    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in keyed_vectors:
            # Membership test and item lookup on the keyed vectors replace the
            # deprecated `vocab[...].index` / `syn0` access.
            vectors.append(keyed_vectors[word])

    if not vectors:
        # FIXME: remove these examples in pre-processing
        return np.zeros(keyed_vectors.vector_size,)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def  word_averaging_list(wv, text_list):
    """Stack the averaged embedding of every token list in ``text_list`` row by row."""
    averaged_rows = []
    for post in text_list:
        averaged_rows.append(word_averaging(wv, post))
    return np.vstack(averaged_rows)

def w2v_tokenize_text(text):
    """Split ``text`` into sentences, then words, keeping tokens of length >= 2."""
    return [
        word
        for sent in nltk.sent_tokenize(text, language='english')
        for word in nltk.word_tokenize(sent, language='english')
        if len(word) >= 2
    ]

In [None]:
def _tokenized_rows(frame):
    """Tokenise the cleaned text of every row of ``frame`` into an array of token lists."""
    return frame.apply(lambda r: w2v_tokenize_text(r['corrected_text_w2v']), axis=1).values

test_tokenized = _tokenized_rows(valid_x)
train_tokenized = _tokenized_rows(train_x)
new_tokenized = _tokenized_rows(tweets_to_train)

# One averaged Word2Vec vector per tweet for each of the three sets.
X_train_word_average, X_valid_word_average, new_word_average = (
    word_averaging_list(w2v_model, tokenized)
    for tokenized in (train_tokenized, test_tokenized, new_tokenized)
)

### Doc2Vec

In [None]:
def label_sentences(corpus, label_type):
    """Wrap each text of ``corpus`` in a TaggedDocument tagged ``<label_type>_<position>``.

    The text is split on whitespace; positions start at 0 and follow the
    iteration order of ``corpus``.
    """
    return [
        TaggedDocument(text.split(), [label_type + '_' + str(position)])
        for position, text in enumerate(corpus)
    ]

# Tagged documents per split; the tag prefixes ('Train', 'Test', 'New') are the
# same ones get_vectors() uses to look the document vectors up again.
train_sentences = label_sentences(train_x.corrected_text_w2v, 'Train')
valid_sentences = label_sentences(valid_x.corrected_text_w2v, 'Test')
new_sentences = label_sentences(tweets_to_train.corrected_text_w2v, 'New')
# Doc2Vec is trained on all three splits together.
all_data = new_sentences + train_sentences + valid_sentences

In [None]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """Collect the document vectors tagged ``<vectors_type>_0 .. <vectors_type>_{corpus_size-1}``.

    Returns a ``(corpus_size, vectors_size)`` array whose i-th row is
    ``model.docvecs['<vectors_type>_i']``.
    """
    tags = [vectors_type + '_' + str(position) for position in range(corpus_size)]
    vectors = np.zeros((corpus_size, vectors_size))
    for row, tag in enumerate(tags):
        vectors[row, :] = model.docvecs[tag]
    return vectors

def tune_doc2vec():
    """Grid-search Doc2Vec (DBOW) vector size and negative-sample count.

    Each configuration is trained on ``all_data`` for 30 single-epoch passes
    with a manually decayed learning rate, then scored with the F1 that
    ``train_model`` reports for a logistic regression fitted on the train
    vectors (labels come from the global ``train_y[col]``).

    Returns:
        ``(train_vectors, test_vectors)`` of the best-scoring configuration.
    """
    # Start below any achievable F1 so the first configuration is always
    # kept; with 0 and a strict '>' an all-zero grid left the result
    # variables unbound.
    best_f1 = float('-inf')
    for vector_size in [100,200,300,400,500,600]:
        for negative in [5,10]:
            model_dbow = Doc2Vec(dm=0, vector_size=vector_size, negative=negative, min_count=1, alpha=0.065, min_alpha=0.065)
            model_dbow.build_vocab([x for x in tqdm(all_data)])

            for epoch in range(30):
                # One pass over a reshuffled corpus, then lower the learning rate by hand.
                model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
                model_dbow.alpha -= 0.002
                model_dbow.min_alpha = model_dbow.alpha

            train_temp = get_vectors(model_dbow, len(train_sentences), vector_size, 'Train')
            test_temp = get_vectors(model_dbow, len(valid_sentences), vector_size, 'Test')

            # train_model returns (f1, confusion_matrix) here; only the F1 is used.
            f1 = train_model(linear_model.LogisticRegression(), train_temp, train_y[col], test_temp, probs=False)[0]
            if f1 > best_f1:
                best_f1 = f1
                train_vectors_dbow = train_temp
                test_vectors_dbow = test_temp

    return train_vectors_dbow, test_vectors_dbow

def get_doc2vec(vector_size, negative):
    """Train one Doc2Vec (DBOW) model on ``all_data`` and return its document vectors.

    Training runs 30 single-epoch passes over a reshuffled corpus, lowering
    the learning rate by 0.002 after each pass.

    Returns:
        ``(train_vectors, test_vectors, new_vectors)`` for the 'Train',
        'Test' and 'New' tagged documents.
    """
    dbow = Doc2Vec(dm=0, vector_size=vector_size, negative=negative, min_count=1, alpha=0.065, min_alpha=0.065)
    dbow.build_vocab([doc for doc in tqdm(all_data)])

    for _ in range(30):
        shuffled = utils.shuffle([doc for doc in tqdm(all_data)])
        dbow.train(shuffled, total_examples=len(all_data), epochs=1)
        dbow.alpha = dbow.alpha - 0.002
        dbow.min_alpha = dbow.alpha

    splits = (('Train', train_sentences), ('Test', valid_sentences), ('New', new_sentences))
    return tuple(
        get_vectors(dbow, len(sentences), vector_size, prefix)
        for prefix, sentences in splits
    )

# The grid search is disabled; a fixed configuration (vector_size=300, negative=5) is used instead.
#train_vectors_dbow, test_vectors_dbow = tune_doc2vec()
train_vectors_dbow, test_vectors_dbow, new_vectors_dbow = get_doc2vec(300,5)

### Dimensionality reduction

In [None]:
# train a LDA Model
def get_lda_topics(tweets, n, features = 'count', purpose = 'test'):
    """Fit an LDA model with ``n`` topics and project the three data sets onto it.

    Args:
        tweets: unused.
        n: number of topics.
        features: which document-term matrices to use: 'count', 'tfidf',
            'ngram' or 'char'; any other value falls back to the count
            matrices.
        purpose: 'test' returns the F1 that ``train_model`` reports for an
            SGD classifier trained on the topic features; anything else
            returns the topic matrices.

    Returns:
        The F1 score, or ``(train_topics, valid_topics, new_topics)``.
    """
    matrices_by_feature = {
        'count': (xtrain_count, xvalid_count, x_count),
        'tfidf': (xtrain_tfidf, xvalid_tfidf, x_tfidf),
        'ngram': (xtrain_tfidf_ngram, xvalid_tfidf_ngram, x_tfidf_ngram),
        'char': (xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars, x_tfidf_ngram_chars),
    }
    x_train_count, x_valid_count, count = matrices_by_feature.get(features, matrices_by_feature['count'])

    # The topic model is fitted on the unannotated tweets' matrix only.
    lda_model = decomposition.LatentDirichletAllocation(n_components=n, learning_method='online', max_iter=20)
    lda_model.fit(count)

    train_topics = lda_model.transform(x_train_count)
    valid_topics = lda_model.transform(x_valid_count)
    new_topics = lda_model.transform(count)

    if purpose != 'test':
        return train_topics, valid_topics, new_topics

    scored = train_model(linear_model.SGDClassifier(alpha=0.00001), train_topics, train_y[col], valid_topics, probs=False)
    return scored[0]

def tune_lda():
    """Pick the LDA topic count and input features with the best F1.

    Tries every combination of feature type ('count', 'tfidf', 'ngram',
    'char') and topic count (15, 30, 45, 60) through ``get_lda_topics`` in
    'test' mode; the first combination reaching the best score wins.

    Returns:
        ``(best_n, best_features)``.
    """
    best_features = ''
    best_n = 0
    # Start below any achievable F1: with 0 and a strict '>' a grid of
    # all-zero scores returned (0, ''), and fitting LDA with 0 topics fails.
    best_f1 = float('-inf')
    for features in ['count', 'tfidf', 'ngram', 'char']:
        for n in [15,30,45,60]:
            f1 = get_lda_topics(tweets, n, features = features, purpose = 'test')
            if f1 > best_f1:
                best_n = n
                best_features = features
                best_f1 = f1
    return best_n, best_features

## Model building

In [None]:
def tune_lr(train, test):
    """Grid-search ``C`` and the class-0 weight of a logistic regression.

    Each candidate is fitted on ``train`` with the global ``train_y[col]`` as
    labels and scored by ``train_model`` on ``test``.

    Returns:
        ``(model, best_f1)`` where ``model`` is an unfitted LogisticRegression
        configured exactly like the best candidate.
    """
    def build(c, weight):
        # One factory for candidates and the returned model. Previously the
        # returned model dropped solver/max_iter=1000 and fell back to the
        # default iteration limit, so it was not the model that was tuned.
        return linear_model.LogisticRegression(C=c, random_state=0, solver = 'lbfgs', max_iter=1000, class_weight={0:weight,1:1})

    best_f1 = 0
    best_c = 1
    best_weight = 1
    for w in [0.3,0.5,0.7,0.9]:
        for c in np.linspace(0.01,250,100):
            f1 = train_model(build(c, w), train, train_y[col], test, probs=False)[0]
            if f1>best_f1:
                best_f1 = f1
                best_c = c
                best_weight = w
    print('Logistic Regression: {}, {}'.format(best_c, best_f1))
    return build(best_c, best_weight), best_f1

def tune_rc(train, test):
    """Grid-search ``alpha`` and the class-0 weight of a ridge classifier.

    Candidates are fitted on ``train`` with the global ``train_y[col]`` and
    scored by ``train_model`` on ``test``. Returns ``(unfitted best model,
    best_f1)``.
    """
    top_f1, top_alpha, top_weight = 0, 1, 1
    for weight in [0.3,0.5,0.7,0.9]:
        for alpha in np.linspace(0.001,50,100):
            candidate = linear_model.RidgeClassifier(alpha = alpha, random_state=0, class_weight={0:weight,1:1})
            score = train_model(candidate, train, train_y[col], test, probs=False)[0]
            if score > top_f1:
                top_f1, top_alpha, top_weight = score, alpha, weight

    print('Ridge Classifier: {}, {}'.format(top_alpha, top_f1))
    winner = linear_model.RidgeClassifier(alpha=top_alpha, random_state=0, class_weight={0:top_weight,1:1})
    return winner, top_f1

def tune_sgd(train, test):
    """Grid-search ``alpha`` and the class-0 weight of an SGD classifier.

    Candidates are fitted on ``train`` with the global ``train_y[col]`` and
    scored by ``train_model`` on ``test``.

    Returns:
        ``(model, best_f1)`` where ``model`` is an unfitted SGDClassifier
        configured like the best candidate.
    """
    best_f1 = 0
    # Fallback used only if no candidate scores above 0. The original assigned
    # 0.0001 and then immediately overwrote it with 1 (a dead store); the
    # small value is kept because it lies within the searched range.
    best_alpha = 0.0001
    best_weight = 1
    for w in [0.3,0.5,0.7,0.9]:
        for alpha in np.linspace(0.0000001,0.01,100):
            candidate = linear_model.SGDClassifier(alpha = alpha, random_state=0, class_weight={0:w,1:1})
            f1 = train_model(candidate, train, train_y[col], test, probs=False)[0]
            if f1>best_f1:
                best_f1 = f1
                best_alpha = alpha
                best_weight = w
    print('SGD Classifier: {}, {}'.format(best_alpha, best_f1))
    return linear_model.SGDClassifier(alpha = best_alpha, random_state=0, class_weight={0:best_weight,1:1}), best_f1

def tune_svc(train, test):
    """Grid-search ``C`` and the class-0 weight of a linear SVM.

    Candidates are fitted on ``train`` with the global ``train_y[col]`` and
    scored by ``train_model`` on ``test``. Returns ``(unfitted best model,
    best_f1)``.
    """
    top_f1, top_c, top_weight = 0, 1, 1
    for weight in [0.3,0.5,0.7,0.9]:
        for c in np.linspace(0.5,30,40):
            candidate = svm.LinearSVC(C=c, random_state=0, class_weight={0:weight,1:1})
            score = train_model(candidate, train, train_y[col], test, probs=False)[0]
            if score > top_f1:
                top_f1, top_c, top_weight = score, c, weight

    print('SVC: {}, {}'.format(top_c, top_f1))
    winner = svm.LinearSVC(C = top_c, random_state=0, class_weight={0:top_weight,1:1})
    return winner, top_f1

#tune_svc(X_train_word_average, X_valid_word_average)

In [None]:
def model_selector(train, test, include_svc=False):
    """Tune each candidate model family and return the best-scoring one.

    This function used to be defined twice in the same cell; the second
    definition (without the SVM) silently replaced the first. Both variants
    are now one function: the default matches the definition that was in
    effect, and ``include_svc=True`` restores the four-model variant.

    Args:
        train: training features.
        test: validation features.
        include_svc: also tune a linear SVM via ``tune_svc``.

    Returns:
        The unfitted estimator with the highest F1. Ties go to the earlier
        family in the order logistic regression, ridge, SGD, SVM.
    """
    tuners = [tune_lr, tune_rc, tune_sgd]
    if include_svc:
        tuners.append(tune_svc)

    # Each tuner returns (model, f1); max() keeps the first of equal scores,
    # which preserves the original if/elif priority.
    candidates = [tuner(train, test) for tuner in tuners]
    best_model, _ = max(candidates, key=lambda candidate: candidate[1])
    print(best_model)
    return best_model

In [None]:
def set_builder():
    """Train one base model per feature set for the current label ``col``.

    For every feature set the best model family is chosen with
    ``model_selector``; that model is then fitted on the training features
    and used to predict both the unannotated tweets and the validation set.
    Logistic regression contributes positive-class probabilities, every
    other model contributes hard predictions.

    Reads the notebook globals holding the feature matrices, ``train_y`` and
    ``col``.

    Returns:
        ``(new, valid)``: DataFrames with one column per feature set, indexed
        like ``tweets_to_train`` and ``valid_x`` respectively.
    """
    # (train features, validation features, column label, unannotated features)
    # The unannotated matrices of 'word_tfidf' and 'word_count' used to be
    # swapped, so each model predicted on the other vectorizer's output.
    sets = [(X_train_word_average, X_valid_word_average, 'word2vec', new_word_average),
           (train_topics, valid_topics, 'svd_components', new_topics),
           (xtrain_tfidf, xvalid_tfidf, 'word_tfidf', x_tfidf),
           (xtrain_count, xvalid_count, 'word_count', x_count),
           (train_x[cols[:-3]], valid_x[cols[:-3]], 'nlp_features', tweets_to_train[cols[:-3]]),
           (train_vectors_dbow, test_vectors_dbow, 'doc2vec', new_vectors_dbow)]

    valid = pd.DataFrame(index = valid_x.index.values, columns = [i[2] for i in sets])
    new = pd.DataFrame(index = tweets_to_train.index.values, columns = [i[2] for i in sets])

    for train_feats, valid_feats, name, new_feats in sets:
        classifier = model_selector(train_feats, valid_feats)
        if type(classifier).__name__ == 'LogisticRegression':
            # valid=False: the unannotated tweets have no labels to score against
            new[name] =  train_model(classifier, train_feats, train_y[col], new_feats, probs=True, ret_probas=True, valid = False)
            valid[name] =  train_model(classifier, train_feats, train_y[col], valid_feats, probs=True, ret_probas=True)
        else:
            new[name] =  train_model(classifier, train_feats, train_y[col], new_feats, probs=False, ret_preds=True, valid = False)
            valid[name] =  train_model(classifier, train_feats, train_y[col], valid_feats, probs=False, ret_preds=True)

    return new, valid

#%time train_preds, valid_preds = set_builder('Politics')

## Train models and make predictions for unseen tweets

In [None]:
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides any warnings raised by the model-tuning cells below.
warnings.filterwarnings('ignore')

In [None]:
# Politics: `col` is read as a global by train_model, cross_val_scores and the tune_* helpers.
col = 'Politics'
# Pick the LDA setting for this label, then rebuild the topic features with it
# (any `purpose` other than 'test' makes get_lda_topics return the matrices).
n, features = tune_lda()
train_topics, valid_topics, new_topics = get_lda_topics(tweets, n, features=features, purpose = 'gimme')
print(n,features)
# Base-model predictions for the unannotated tweets and the validation set.
%time new_preds_pol, valid_preds_pol= set_builder()

In [None]:
# Health: `col` is read as a global by train_model, cross_val_scores and the tune_* helpers.
col = 'Health'
# Pick the LDA setting for this label, then rebuild the topic features with it
# (any `purpose` other than 'test' makes get_lda_topics return the matrices).
n, features = tune_lda()
train_topics, valid_topics, new_topics = get_lda_topics(tweets, n, features=features, purpose = 'gimme')
print(n,features)
# Base-model predictions for the unannotated tweets and the validation set.
%time new_preds_hl, valid_preds_hl = set_builder()

In [None]:
# Economic: `col` is read as a global by train_model, cross_val_scores and the tune_* helpers.
col = 'Economic'
# Pick the LDA setting for this label, then rebuild the topic features with it
# (any `purpose` other than 'test' makes get_lda_topics return the matrices).
n, features = tune_lda()
train_topics, valid_topics, new_topics = get_lda_topics(tweets, n, features=features, purpose = 'gimme')
print(n,features)
# Base-model predictions for the unannotated tweets and the validation set.
%time new_preds_ec, valid_preds_ec = set_builder()

In [None]:
# Travel: `col` is read as a global by train_model, cross_val_scores and the tune_* helpers.
col = 'Travel'
# Pick the LDA setting for this label, then rebuild the topic features with it
# (any `purpose` other than 'test' makes get_lda_topics return the matrices).
n, features = tune_lda()
train_topics, valid_topics, new_topics = get_lda_topics(tweets, n, features=features, purpose = 'gimme')
print(n,features)
# Base-model predictions for the unannotated tweets and the validation set.
%time new_preds_tr, valid_preds_tr = set_builder()

In [None]:
# Personal: `col` is read as a global by train_model, cross_val_scores and the tune_* helpers.
col = 'Personal'
# Pick the LDA setting for this label, then rebuild the topic features with it
# (any `purpose` other than 'test' makes get_lda_topics return the matrices).
n, features = tune_lda()
train_topics, valid_topics, new_topics = get_lda_topics(tweets, n, features=features, purpose = 'gimme')
print(n,features)
# Base-model predictions for the unannotated tweets and the validation set.
%time new_preds_per, valid_preds_per = set_builder()

In [None]:
# Attach the true validation label of each category to its base-model
# predictions; list() drops the index so values are assigned by position.
for preds_frame, category in (
    (valid_preds_ec, 'Economic'),
    (valid_preds_per, 'Personal'),
    (valid_preds_pol, 'Politics'),
    (valid_preds_tr, 'Travel'),
    (valid_preds_hl, 'Health'),
):
    preds_frame['y'] = list(valid_y[category])

## Aggregate predictions using a meta-classifier

In [None]:
# Stacking step: per category, a bagging meta-classifier is fitted on the
# base-model outputs for the validation set and then labels the unannotated
# tweets from their base-model outputs.
meta = ensemble.BaggingClassifier()

# (category column, validation predictions incl. 'y', unannotated predictions)
# Health uses the `_hl` frames; the former `_he` names were never defined and
# raised a NameError.
meta_inputs = [
    ('Politics', valid_preds_pol, new_preds_pol),
    ('Travel', valid_preds_tr, new_preds_tr),
    ('Economic', valid_preds_ec, new_preds_ec),
    ('Health', valid_preds_hl, new_preds_hl),
    ('Personal', valid_preds_per, new_preds_per),
]

for category, valid_preds, new_preds in meta_inputs:
    # Select the same base-model columns on both sides so extra columns added
    # to either frame cannot leak into the meta features.
    base_cols = valid_preds.columns.drop('y')
    meta.fit(valid_preds[base_cols], valid_preds['y'])
    tweets_to_train[category] = meta.predict(new_preds[base_cols])

In [None]:
# Persist the unannotated tweets together with the predicted category columns.
tweets_to_train.to_csv('Data/tweets_with_categories.csv')

## Active Learning

* Select a subset of 100 tweets from each category to be given for human annotation
* The process was repeated 4 times
* The file answers2.csv contains all the annotated tweets including the ones given for the initial annotation and the ones given during the active learning process

In [None]:
# Column names of the base-model outputs; they match the labels given to the feature sets in set_builder().
predictors = ['word2vec', 'svd_components', 'word_tfidf', 'word_count', 'nlp_features', 'doc2vec']

In [None]:
# Turn every base-model output into a 0/1 vote (probabilities are cut at 0.5;
# hard predictions are left unchanged by the threshold).
for frame in (new_preds_ec, new_preds_hl, new_preds_tr, new_preds_pol, new_preds_per):
    for c in predictors:
        frame[c] = [int(score >= 0.5) for score in frame[c]]

In [None]:
# Number of base models voting for the category, per unannotated tweet.
for frame in (new_preds_ec, new_preds_pol, new_preds_hl, new_preds_per, new_preds_tr):
    frame['pred'] = frame[predictors].sum(axis=1)

In [None]:
# Replace the category columns of the unannotated tweets with the vote counts.
for category, frame in (
    ('Politics', new_preds_pol),
    ('Health', new_preds_hl),
    ('Personal', new_preds_per),
    ('Travel', new_preds_tr),
    ('Economic', new_preds_ec),
):
    tweets_to_train[category] = frame['pred']

In [None]:
# Summary statistics of the unannotated tweets, including the per-category vote counts.
# (Disabled debug export: tweets_to_train.dropna().to_csv('wtf.csv'))
tweets_to_train.describe()

In [None]:
# Active-learning candidates: tweets on which exactly 3 base models vote for
# the category (the vote count was stored in the category column above).
al_list = []

for i in categories:
    print(i)
    candidates = tweets_to_train[tweets_to_train[i] == 3]
    # Take at most 100 per category; sampling 100 from a smaller pool raises
    # a ValueError.
    al_list += list(candidates.sample(min(100, len(candidates)))['status_id'])

In [None]:
# Write the selected status ids, one per row, without index or header.
pd.DataFrame(al_list).to_csv('Data/tweets_for_active_learning.csv', index=False, header= False)