# Quantifying Novelty

The uniqueness of the information contained in a tweet is calculated by comparing the tweet with the tweets posted in the preceding days. The task starts with text pre-processing and cleaning. The tweets are then summarized into vectors using the Doc2Vec algorithm. Finally, the average cosine similarity between a tweet's vector and the vectors of the earlier tweets is used to quantify novelty: the lower the similarity, the more novel the tweet.

In [None]:
import pandas as pd
import numpy as np
import xgboost, textblob, string, ekphrasis, nltk, re

# Fetch the NLTK data packages. 'stopwords' backs the English stop-word list
# used in the cleaning step.
# NOTE(review): 'punkt' and 'wordnet' are downloaded as well, but no cell shown
# here appears to use them - confirm before removing.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from numpy import dot
from numpy.linalg import norm

from ekphrasis.classes.spellcorrect import SpellCorrector
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

from nltk.stem import WordNetLemmatizer

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from gensim.models.doc2vec import TaggedDocument

from datetime import datetime, timedelta

# Stand-alone ekphrasis spell corrector built on the "english" corpus.
# NOTE(review): `sp` is not referenced by any cell shown here - confirm it is
# still needed.
sp = SpellCorrector(corpus="english")

In [None]:
# Load the tweets and index them by their creation time.
tweets = pd.read_csv('tweets.csv', index_col=0)
tweets['datetime'] = pd.to_datetime(tweets['created_at'])
# The novelty step slices this frame by date range (tweets[start:end]). Label
# slicing with bounds that are not themselves in the index needs a monotonic
# increasing DatetimeIndex, so sort chronologically here. A stable sort keeps
# the file order of tweets that share a timestamp.
tweets = (
    tweets.set_index('datetime')
    .sort_index(kind='mergesort')
    .drop(columns=['created_at', 'status_id'])
)

### Pre-processing
1. Text pre-processing (spelling correction, hashtag unpacking, tokenization)
2. Stop-word and punctuation removal
3. Word stemming

In [None]:
# ekphrasis pipeline applied to every raw tweet before cleaning.
text_processor = TextPreProcessor(
    # Terms that will be normalized.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'time', 'date', 'number'],
    # Fix HTML tokens.
    fix_html=True,
    # Corpus whose word statistics are used for word segmentation.
    segmenter="twitter",
    # Corpus whose word statistics are used for spell correction.
    corrector="twitter",
    # Perform word segmentation on hashtags.
    unpack_hashtags=True,
    # Unpack contractions (e.g. "can't" -> "can not").
    unpack_contractions=True,
    # Spell-correct elongated words.
    spell_correct_elong=True,
    # Tokenizer callable: takes a string, returns a list of tokens (lower-cased here).
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # Dictionaries used to replace extracted tokens with other expressions;
    # here the emoticon dictionary.
    dicts=[emoticons]
)

In [None]:
# Shared resources for clean_tweet().

# Characters stripped from tweets. '#', '@', '<' and '>' are absent from this
# set, so they survive the punctuation pass.
punctuation = '!"$%&\'()*+,-./:;=?[\\]^_`{|}~•'

# NLTK's English stop-word list.
stopwords = nltk.corpus.stopwords.words('english')

# `rooter` is the bound stem() method: call it with one token to stem it.
_stemmer = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False)
rooter = _stemmer.stem

def clean_tweet(tweet, bigrams=False):
    """
    Lower-case a tweet, strip punctuation, drop stop-words and stem the rest.

    tweet: Tweet text to clean
    bigrams: If True, also append a "first_second" token for every pair of
             adjacent tokens

    Returns the cleaned tokens joined by single spaces.
    """
    tweet = tweet.lower()
    # Each run of punctuation characters is replaced by one space.
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)
    # split() with no argument splits on any whitespace and never yields the
    # empty tokens that consecutive spaces would otherwise produce.
    tokens = [word for word in tweet.split() if word not in stopwords]
    # Tokens containing '#' (hashtags) are kept verbatim; all others are stemmed.
    tokens = [word if '#' in word else rooter(word) for word in tokens]
    if bigrams:
        tokens = tokens + [
            first + '_' + second for first, second in zip(tokens, tokens[1:])
        ]
    return ' '.join(tokens)

In [None]:
# Two-stage normalisation of every tweet: first the ekphrasis pipeline
# (tokens re-joined with single spaces), then clean_tweet() on that string.
tweets['corrected_text'] = [
    clean_tweet(" ".join(text_processor.pre_process_doc(raw_text)))
    for raw_text in tweets['text']
]

## Doc2Vec
1. Assign unique labels to tweets
2. Initialize and train model
3. Get tweet vectors

In [None]:
def get_vectors(model, corpus_size, vectors_size, label):
    """
    Collect the document vectors of a labeled corpus into one matrix.

    model: Doc2Vec trained model object
    corpus_size: Size of corpus (number of rows in the result)
    vectors_size: Size of vectors (number of columns in the result)
    label: Label prefix used in labeling the corpus

    Returns a (corpus_size, vectors_size) array whose row i holds the vector
    stored under the tag "<label>_<i>".
    """
    matrix = np.zeros((corpus_size, vectors_size))
    tags = ('_'.join((label, str(row))) for row in range(corpus_size))
    for row, tag in enumerate(tags):
        matrix[row] = model.docvecs[tag]
    return matrix

In [None]:
def label_tweets(tweets, label):
    """
    Wrap every tweet in a TaggedDocument carrying a unique tag.

    tweets: Tweets corpus (iterable of strings)
    label: Label prefix to be used

    Returns a list of TaggedDocument objects; the i-th one holds the
    whitespace-split words of the i-th tweet and the single tag "<label>_<i>".
    """
    return [
        TaggedDocument(text.split(), ['_'.join((label, str(position)))])
        for position, text in enumerate(tweets)
    ]

# Tag every cleaned tweet as "Full_<position>". The helper defined above is
# label_tweets; the previous call to label_sentences raised a NameError.
labeled = label_tweets(tweets.corrected_text, 'Full')

# Store each tweet's tag next to it so vectors can be looked up by row later.
tweets['tags'] = [doc.tags[0] for doc in labeled]

In [None]:
# Train a PV-DBOW (dm=0) Doc2Vec model on the tagged tweets held in `labeled`.
VECTOR_SIZE = 300   # dimensionality of each tweet vector
EPOCHS = 30         # number of passes over the corpus
ALPHA_STEP = 0.002  # learning-rate reduction applied after every pass

d2v = Doc2Vec(dm=0, vector_size=VECTOR_SIZE, window=5, negative=5, min_count=10, alpha=0.065, min_alpha=0.065)
d2v.build_vocab(labeled)

# One pass per iteration over a freshly shuffled corpus, with the learning rate
# held constant inside a pass and lowered by ALPHA_STEP between passes.
# NOTE(review): gensim's documentation advises a single train() call with
# epochs=N and letting the library decay alpha - consider switching.
for _ in tqdm(range(EPOCHS), desc="doc2vec-epochs"):
    d2v.train(utils.shuffle(labeled), total_examples=len(labeled), epochs=1)
    d2v.alpha -= ALPHA_STEP
    d2v.min_alpha = d2v.alpha

# Row i of tweets_vectors is the vector of the tweet tagged "Full_<i>".
tweets_vectors = get_vectors(d2v, len(labeled), VECTOR_SIZE, 'Full')

## Calculate novelty

In [None]:
#Cosine similarity
def similarity(a, b, model=None):
    """
    a: First tweet's unique tag
    b: Second tweet's unique tag
    model: Trained Doc2Vec model to read the vectors from; when omitted, the
           notebook-level d2v model is used

    Returns the cosine similarity of the two tweets' document vectors.
    """
    if model is None:
        model = d2v  # model trained in this notebook
    t1 = model.docvecs[a]  # Vector of first tweet
    t2 = model.docvecs[b]  # Vector of second tweet
    return dot(t1, t2) / (norm(t1) * norm(t2))

In [None]:
#Get period range for tweets to compare
def get_period(t, days):
    """
    Work out the comparison window for one tweet.

    t: Tweet's unique tag
    days: Days before the tweet to compare

    Returns a pair of strings (to, f): the tweet's own timestamp and the
    timestamp `days` days before it. Raises IndexError if no row carries tag t.
    """
    posted_at = tweets[tweets.tags == t].index[0]  # date of tweet
    window_start = posted_at - timedelta(days=days)
    return str(posted_at), str(window_start)

In [None]:
def get_similarities(t, days, sample, random_state=None):
    """
    Average similarity between one tweet and the tweets of the preceding days.

    t: Tweet's unique tag
    days: Days before the tweet to compare
    sample: Maximum number of tweets to compare against
    random_state: Optional seed passed to DataFrame.sample so the random
                  sub-sample can be reproduced

    Returns the mean cosine similarity between tweet t and the compared tweets,
    or NaN when the window contains no other tweet.
    """
    to, f = get_period(t, days)
    window = tweets[f:to]  # Subset tweets by datetime index
    # Remove the examined tweet before sampling, so that up to `sample` *other*
    # tweets are compared and the tweet never uses up a sample slot itself.
    others = window[window.tags != t]
    if others.shape[0] > sample:  # Take random sample
        others = others.sample(sample, random_state=random_state)
    if others.empty:
        # Nothing to compare with; np.mean([]) would also give NaN, but only
        # after emitting a RuntimeWarning.
        return np.nan

    return np.mean([similarity(t, tag) for tag in others.tags])

In [None]:
# Average similarity of every tweet against the tweets of the previous
# 1, 2 and 3 days (at most 1000 compared tweets per window).
lookback_columns = {
    '1day_similarity': 1,
    '2day_similarity': 2,
    '3day_similarity': 3,
}
for column, lookback_days in lookback_columns.items():
    tweets[column] = [get_similarities(tag, lookback_days, 1000) for tag in tweets.tags]