In [None]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd
import numpy as np
import xgboost, textblob, string, ekphrasis, nltk, re

# Fetch the NLTK resources requested by this notebook (stop-word list,
# punkt tokenizer data and WordNet) before any of them is used.
for _nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_nltk_resource)

from numpy import dot
from numpy.linalg import norm

from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from xgboost.sklearn import XGBClassifier

from sklearn.svm import NuSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity

from ekphrasis.classes.spellcorrect import SpellCorrector
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

from nltk.stem import WordNetLemmatizer

import gensim
from gensim.models import Word2Vec

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from gensim.models.doc2vec import TaggedDocument

from datetime import datetime, timedelta

# Stand-alone ekphrasis spell corrector configured with the "english" corpus.
# NOTE(review): `sp` is not referenced by any later cell visible here -- confirm it is still needed.
sp = SpellCorrector(corpus="english") 

In [None]:
# Load the raw tweet sample, index it chronologically by parsed creation time
# and drop the columns that are not needed afterwards.
tweets = (
    pd.read_csv('Data/raw_tweets_sample.csv', index_col=0)
    .assign(datetime=lambda frame: pd.to_datetime(frame['created_at']))
    .set_index('datetime')
    .sort_index()
    .drop(columns=['created_at', 'status_id'])
)

### Pre-processing

In [None]:
text_processor = TextPreProcessor(
    # Tokenizer: ekphrasis' SocialTokenizer with lower-casing enabled.
    # Any callable that maps a string to a list of tokens can be passed here.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # Kinds of terms that get normalized.
    normalize=['url', 'email', 'percent', 'money', 'phone',
               'time', 'date', 'number'],

    # Repair HTML tokens.
    fix_html=True,

    # Corpus whose word statistics drive word segmentation ...
    segmenter="twitter",
    # ... and spell correction.
    corrector="twitter",

    unpack_hashtags=True,       # run word segmentation on hashtags
    unpack_contractions=True,   # expand contractions (can't -> can not)
    spell_correct_elong=True,   # spell-correct elongated words

    # Dictionaries used to replace extracted tokens with other expressions;
    # several dictionaries may be supplied.
    dicts=[emoticons],
)

In [None]:
# Run every raw tweet through the ekphrasis pipeline and glue the resulting
# tokens back together into a single space-separated string.
tweets['corrected_text'] = [
    " ".join(text_processor.pre_process_doc(raw_text))
    for raw_text in tweets['text']
]

In [None]:
# English stop words that clean_tweet filters out.
my_stopwords = nltk.corpus.stopwords.words('english')

# Token normaliser used by clean_tweet: Porter stemming.
_porter_stemmer = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False)
word_rooter = _porter_stemmer.stem #stemmer
#word_rooter = nltk.stem.WordNetLemmatizer().lemmatize #lemmatizer

# Characters treated as punctuation by clean_tweet ('#', '@', '<' and '>' are
# deliberately absent from this string).
my_punctuation = '!"$%&\'()*+,-./:;=?[\\]^_`{|}~•'

def clean_tweet(tweet, bigrams=False):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps, in order: lower-case the text, replace every run of characters
    from ``my_punctuation`` with a space, delete digits, split on whitespace,
    drop tokens found in ``my_stopwords`` and pass the remaining tokens
    through ``word_rooter`` (tokens containing ``#`` are kept untouched).

    Parameters
    ----------
    tweet : str
        Text to clean.
    bigrams : bool, default False
        When true, append ``left_right`` bigrams built from each pair of
        adjacent cleaned tokens after the unigrams.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces (empty string when nothing
        survives the filtering).
    """
    tweet = tweet.lower()

    # re.escape makes every punctuation character literal inside the class.
    # Without it the "\\]" pair in my_punctuation is read as an escaped "]"
    # and the backslash itself is never stripped.
    tweet = re.sub('[' + re.escape(my_punctuation) + ']+', ' ', tweet)

    # Delete digits before tokenising so a purely numeric word disappears
    # instead of leaving an empty token behind.
    tweet = re.sub(r'[0-9]+', '', tweet)

    # str.split() with no argument collapses any whitespace run and never
    # yields empty strings, so no separate "remove double spacing" pass is needed.
    tokens = [word for word in tweet.split() if word not in my_stopwords]
    tokens = [word if '#' in word else word_rooter(word) for word in tokens]

    if bigrams:
        tokens = tokens + [left + '_' + right
                           for left, right in zip(tokens, tokens[1:])]
    return ' '.join(tokens)

# Replace the ekphrasis output with its cleaned / stemmed form, tweet by tweet.
tweets['corrected_text'] = tweets['corrected_text'].map(clean_tweet)

In [None]:
def label_sentences(corpus, label_type):
    """
    Wrap every document of `corpus` in a gensim TaggedDocument.

    Doc2Vec needs a tag on each document; the tag used here is
    "<label_type>_<i>", where "i" is the position of the document in `corpus`.
    The words of each TaggedDocument are the whitespace-split document text.
    Returns the TaggedDocuments as a list, in corpus order.
    """
    return [
        TaggedDocument(document.split(), [label_type + '_' + str(position)])
        for position, document in enumerate(corpus)
    ]

# Tag every cleaned tweet ("Full_0", "Full_1", ...) for Doc2Vec ...
all_data = label_sentences(tweets['corrected_text'], 'Full')

# ... and keep each tweet's tag next to it so vectors can be looked up later.
tweets['tags'] = [document.tags[0] for document in all_data]

In [None]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """Collect document vectors from `model` into one 2-D array.

    Row ``i`` of the result is ``model.docvecs["<vectors_type>_<i>"]`` for
    ``i`` in ``0 .. corpus_size - 1``; the array has shape
    ``(corpus_size, vectors_size)``.
    """
    matrix = np.zeros((corpus_size, vectors_size))
    tags = (vectors_type + '_' + str(position) for position in range(corpus_size))
    for row, tag in enumerate(tags):
        matrix[row] = model.docvecs[tag]
    return matrix

# Distributed bag-of-words Doc2Vec model (dm=0) with 300-dimensional vectors.
model_dbow = Doc2Vec(dm=0, vector_size=300, window=5, negative=5, min_count=10, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab(list(tqdm(all_data)))

# 30 single-epoch passes over a freshly shuffled corpus; the learning rate is
# held constant within a pass and lowered by 0.002 after each one.
for epoch in range(30):
    shuffled_docs = utils.shuffle(list(tqdm(all_data)))
    model_dbow.train(shuffled_docs, total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

# One row per tweet, in the same order as `all_data`.
tweets_vectors = get_vectors(model_dbow, len(all_data), 300, 'Full')

## Calculate novelty

In [None]:
def similarity(a, b):
    """Cosine similarity between the `model_dbow` document vectors tagged `a` and `b`."""
    vec_a, vec_b = model_dbow.docvecs[a], model_dbow.docvecs[b]
    magnitude = norm(vec_a) * norm(vec_b)
    return dot(vec_a, vec_b) / magnitude

In [None]:
def get_period(t, days):
    """Return the look-back window that ends at the tweet tagged `t`.

    Parameters
    ----------
    t : str
        Value of the ``tags`` column identifying the tweet in ``tweets``.
    days : int or float
        Length of the window in days.

    Returns
    -------
    tuple of (str, str)
        ``(to, f)``: the tweet's own index timestamp and that timestamp minus
        `days` days, both converted with ``str`` so they can be used as slice
        bounds on the ``tweets`` index.

    Raises
    ------
    IndexError
        If no row of ``tweets`` carries the tag `t`.
    """
    # Look the tweet up once: the boolean filter scans the whole frame, and
    # this function is called several times per tweet.
    end = tweets[tweets.tags == t].index[0]
    start = end - timedelta(days=days)
    return str(end), str(start)

In [None]:
def get_similarities(t, days):
    """Mean cosine similarity between tweet `t` and the tweets preceding it.

    The comparison set is every row of ``tweets`` whose index falls inside
    the window returned by ``get_period(t, days)``, excluding the rows tagged
    `t` itself.

    Parameters
    ----------
    t : str
        Tag of the tweet being scored.
    days : int or float
        Length of the look-back window in days.

    Returns
    -------
    float
        Mean of ``similarity(t, other)`` over the comparison set, or NaN when
        the window contains no other tweet.
    """
    to, f = get_period(t, days)
    window = tweets[f:to]
    other_tags = window.tags[window.tags != t]

    # np.mean([]) would also give NaN but emits a RuntimeWarning for every
    # tweet without neighbours; return the NaN explicitly instead.
    if other_tags.empty:
        return np.nan

    return np.mean([similarity(t, other) for other in other_tags])

In [None]:
import random
import threading


def run_all(sample, nov, nov_3, nov_7):
    """Append the 1-, 3- and 7-day similarity scores of every tweet in `sample`.

    Scores come from ``get_similarities``, which resolves tags against the
    module-level ``tweets`` frame and the module-level ``model_dbow``.
    `sample` must therefore keep the ``tags`` values it has in ``tweets``.
    Re-labelling a chunk from "Full_0" would make every chunk score the first
    rows of ``tweets``, and a Doc2Vec model trained inside this function would
    never be consulted, so neither is done here.

    Parameters
    ----------
    sample : pandas.DataFrame
        Rows to score (typically a chunk of ``tweets``); needs a ``tags`` column.
    nov, nov_3, nov_7 : list
        Output lists, extended in place with one value per row of `sample`
        for the 1-, 3- and 7-day windows respectively.

    Raises
    ------
    KeyError
        If `sample` has no ``tags`` column.
    """
    if 'tags' not in sample.columns:
        raise KeyError("sample must carry the 'tags' column assigned to tweets")

    for k, tag in enumerate(sample['tags']):
        # Coarse progress report for long runs.
        if k % 50000 == 0:
            print(datetime.now(), k)
        nov.append(get_similarities(tag, 1))
        nov_3.append(get_similarities(tag, 3))
        nov_7.append(get_similarities(tag, 7))
    

# Score the tweets in `threads` contiguous chunks, one worker thread per chunk.
# NOTE: the scoring is mostly Python/pandas work, so the GIL limits how much
# wall-clock time the threads can actually save.
threads = 8
jobs = []
chunk_results = []  # one (nov, nov_3, nov_7) triple per chunk, in chunk order
for df in np.array_split(tweets, threads):
    partial = ([], [], [])
    chunk_results.append(partial)
    # Pass the callable and its arguments separately: writing
    # target=run_all(...) would run the work right here and hand the thread
    # a None target.
    jobs.append(threading.Thread(target=run_all, args=(df,) + partial))

for j in jobs:
    j.start()

for j in jobs:
    j.join()

# Stitch the per-chunk lists back together in chunk order so each list has
# exactly one entry per row of `tweets`.
nov = [score for partial in chunk_results for score in partial[0]]
nov_3 = [score for partial in chunk_results for score in partial[1]]
nov_7 = [score for partial in chunk_results for score in partial[2]]

print("List processing complete.")

In [None]:
# Attach the three similarity series to the tweets and persist the result.
for column_name, scores in (
    ('1day_similarity', nov),
    ('3day_similarity', nov_3),
    ('7day_similarity', nov_7),
):
    tweets[column_name] = scores

tweets.to_csv('tweets_with_novelty.csv')