# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from spacy.lang.en.examples import sentences 
import spacy as sp
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline
# import zipfile
from textblob import TextBlob
import string
import re 

In [2]:
tweets = pd.read_csv('data/TrumpTweetsCleaned.csv')
#09-11-2017 to 02-16-2018
before = pd.to_datetime('2017-09-11')
after = pd.to_datetime('2018-02-16')
tweets['created_at'] = pd.to_datetime(tweets['created_at'])

# Preprocessing (James)

In [3]:
#preprocessing functions for the dataframe
def delete_punct(text):
    '''
    removes special characters from the document
    '''
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct 
def remove_stopwords(text):
    '''
    removes all stopwords according to the pre-built english dictionary of known stopwords
    '''
    words = [w for w in text if w not in set(stopwords.words('english'))]
    return words
def lemmatize(text):
    '''
    returns the lemmas of each word in the document
    '''
    lemmatized = [lemmatizer.lemmatize(i) for i in text]
    return lemmatized
def joiner(text):
    '''
    joins the comma-separated list into one string
    '''
    joined = " ".join([i for i in text])
    return joined
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()

In [4]:
def preproc_pipeline(tweets):
    '''
    The preprocessing pipline applied to each document in the dataframe
    '''
    tweets['preproc'] = tweets['text'].apply(lambda x: delete_punct(x))
    tweets['preproc'] = tweets['preproc'].apply(lambda x: tokenizer.tokenize(x.lower()))
    tweets['preproc'] = tweets['preproc'].apply(lambda x: remove_stopwords(x))
    tweets['preproc'] = tweets['preproc'].apply(lambda x: lemmatize(x))
    tweets['preproc'] = tweets['preproc'].apply(lambda x: joiner(x))
    return tweets

In [5]:
tweets = preproc_pipeline(tweets)

# TFIDF Term Extraction

In [6]:
# fitting sk-learn's TF-IDF Vectorizer to our dataframe and returning list of most important keywords
docs = tweets['preproc'].tolist()
tfidf = TfidfVectorizer(max_features = 2500, min_df = 10, max_df = .9)
tfidf2 = TfidfVectorizer(max_features = 150, min_df = 160, max_df = .9)
X = tfidf.fit_transform(docs).toarray()
feature_names = tfidf.get_feature_names()
X2 = tfidf2.fit_transform(docs).toarray()
features_names2 = tfidf2.get_feature_names()

In [7]:
def intersection(l1,l2):
    '''
    finding intersection of elements between two lists
    '''
    return list(set(l1) & set(l2))
def term_extract(df):
    '''
    Will return a column of important keywords in each document determined by TF-IDF 
    Sentiment Analysis via Textblob will only be run on these keywords 
    '''
    df['keywords'] = df['preproc'].apply(lambda x: intersection(x.split(' '), feature_names))
    df['keywords'] = df['keywords'].apply(lambda x: joiner(x))
    return df
tweets = term_extract(tweets)

In [8]:
tweets['polarity'] = [TextBlob(tweets['keywords'].values[i]).polarity for i in range(len(tweets))]
tweets['subjectivity'] = [TextBlob(tweets['keywords'].values[i]).subjectivity for i in range(len(tweets))]

In [9]:
# Delete columns 
# tweets  = tweets.drop(columns = ['text', 'preproc'])
tweets = tweets.replace('', np.nan)
tweets = tweets.dropna()

# Word Similarity

In [11]:
# tweets_stock_viable = pd.read_csv('data/tweets_stock_viable_5')
# tweets_stock_viable['keywords'] = tweets_stock_viable['keywords'].astype(str)

In [11]:
from gensim.models import word2vec

# tokenize sentences in corpus
wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in tweets['keywords']]

# Set values for various parameters
feature_size = 100    # Word vector dimensionality  
window_context = 30          # Context window size                                                                                    
min_word_count = 1   # Minimum word count                        
sample = 1e-3   # Downsample setting for frequent words

w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size, 
                          window=window_context, min_count=min_word_count,
                          sample=sample, iter=50)

In [12]:
def word_similarity(df):
    columns = []
    for keyword in features_names2:
        icols = []
        for i in df:
            temp = []
            list_ = i.split()
            for j in list_:
                if j in w2v_model.wv.vocab:
                    temp.append(w2v_model.wv.similarity(j, 'china'))
            icols.append(max(temp))
        columns.append(icols)
    return columns
columns = word_similarity(tweets['keywords'])

In [13]:
word_similarity = pd.DataFrame(columns).T
word_similarity.columns = features_names2
word_similarity.index = tweets.index

In [26]:
preprocessed = pd.concat([tweets, word_similarity], axis = 1).reset_index(drop = True)

## Stock Data 

In [28]:
stock = pd.read_csv('data/SP500_intraday.csv')

In [29]:
stock['Time_pd'] = pd.to_timedelta(stock['Time']+':00')
stock['Date_time'] = pd.to_datetime(pd.to_datetime(stock['Date'])+ pd.to_timedelta(stock['Time_pd']))

In [30]:
latest_time = stock['Date_time'].iloc[-1]
earliest_time = stock['Date_time'][0] 
temp = preprocessed[(preprocessed['created_at'] > earliest_time) & (preprocessed['created_at'] < latest_time)]
temp = temp.reset_index(drop = True)

In [31]:
temp

Unnamed: 0,text,created_at,retweet_count,favorite_count,is_retweet,preproc,keywords,polarity,subjectivity,administration,...,witch,work,working,world,would,year,äôs,äôt,äù,üá
0,The debates especially the second and third pl...,2016-11-13 18:46:00,23410.0,113207.0,False,debate especially second third plus speech int...,especially win supporter gave speech third lar...,0.402857,0.395714,0.135235,...,0.135235,0.135235,0.135235,0.135235,0.135235,0.135235,0.135235,0.135235,0.135235,0.135235
1,If the election were based on total popular vo...,2016-11-15 13:34:00,49623.0,179219.0,False,election based total popular vote would campai...,total even election based campaigned popular c...,0.258333,0.745833,0.119161,...,0.119161,0.119161,0.119161,0.119161,0.119161,0.119161,0.119161,0.119161,0.119161,0.119161
2,The Electoral College is actually genius in th...,2016-11-15 13:40:00,39125.0,128083.0,False,electoral college actually genius brings state...,different state one college actually including...,0.066667,0.300000,0.143061,...,0.143061,0.143061,0.143061,0.143061,0.143061,0.143061,0.143061,0.143061,0.143061,0.143061
3,Very organized process taking place as I decid...,2016-11-16 02:55:00,27544.0,119611.0,False,organized process taking place decide cabinet ...,one taking place process many cabinet position...,0.500000,0.500000,0.252220,...,0.252220,0.252220,0.252220,0.252220,0.252220,0.252220,0.252220,0.252220,0.252220,0.252220
4,"I am not trying to get ""top level security cle...",2016-11-16 11:28:00,31102.0,99921.0,False,trying get top level security clearance child ...,trying false top news level get child security...,0.050000,0.550000,0.265493,...,0.265493,0.265493,0.265493,0.265493,0.265493,0.265493,0.265493,0.265493,0.265493,0.265493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9369,The Amazon Washington Post and three lowlife r...,2019-11-07 15:27:00,24007.0,83333.0,False,amazon washington post three lowlife reporter ...,barr bill source matt another amazon post stor...,-0.500000,1.000000,0.203453,...,0.203453,0.203453,0.203453,0.203453,0.203453,0.203453,0.203453,0.203453,0.203453,0.203453
9370,The Radical Left Dems and LameStream Media are...,2019-11-07 15:41:00,18328.0,72619.0,False,radical left dems lamestream medium trying mak...,hoax impeachment trying dems republican lamest...,0.161174,0.349053,0.196683,...,0.196683,0.196683,0.196683,0.196683,0.196683,0.196683,0.196683,0.196683,0.196683,0.196683
9371,Stock Market up big today. A New Record. Enjoy!,2019-11-07 15:43:00,20211.0,114187.0,False,stock market big today new record enjoy,market stock record today big new enjoy,0.178788,0.351515,0.099571,...,0.099571,0.099571,0.099571,0.099571,0.099571,0.099571,0.099571,0.099571,0.099571,0.099571
9372,STATEMENT FROM PRESIDENT DONALD J. TRUMP https...,2019-11-08 00:08:00,31818.0,110993.0,False,statement president donald j trump httpstcoekt...,statement donald president trump,0.000000,0.000000,0.075596,...,0.075596,0.075596,0.075596,0.075596,0.075596,0.075596,0.075596,0.075596,0.075596,0.075596


In [32]:
## Edit the minute after
time_min = 5
time_after = temp['created_at'] + pd.to_timedelta(time_min, unit = 'm')
time_of = temp['created_at']
good_time_after = sorted(list(set(stock['Date_time']) & set(list(time_after))))
x = np.arange(0,len(good_time_after))
tweets_stock_viable = temp.iloc[np.concatenate([np.where(time_after == good_time_after[x])[0] for x in x])]

time_after2 = tweets_stock_viable['created_at'] + pd.to_timedelta(time_min, unit = 'm')
time_of2 = tweets_stock_viable['created_at']
x = time_of2

bad_var = [stock[stock['Date_time'] == time_of2[i]]['Open'].values
             for i in tweets_stock_viable.index]
bad = np.where(pd.DataFrame(bad_var, index = tweets_stock_viable.index).isna())[0]
tweets_stock_viable = tweets_stock_viable.drop(tweets_stock_viable.index[bad])
stock_df = [stock[stock['Date_time'] == time_after2[i]]['Close'].values - stock[stock['Date_time'] == time_of2[i]]['Open'].values
             for i in x.index]

In [33]:
tweets_stock_viable['stock_dif'] = pd.DataFrame(np.concatenate(stock_df), index= tweets_stock_viable.index)
length = len(tweets_stock_viable)
tweets_stock_viable['stock_up'] = [1 if tweets_stock_viable['stock_dif'][i] > 0 else 0 for i in tweets_stock_viable.index]

In [36]:
tweets_stock_viable = tweets_stock_viable.dropna()

In [37]:
tweets_stock_viable.to_csv('data/preprocessed')

In [None]:
# https://stackoverflow.com/questions/43776572/visualise-word2vec-generated-from-gensim
vocab = list(word_similarity) + ['xi', 'farm', 'farmer', 'deficit', 'intellectual', 'property']
X = w2v_model[vocab]

tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)

w2v_df = pd.DataFrame(X_tsne, index=vocab, columns=['x', 'y'])


fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(1, 1, 1)

ax.scatter(w2v_df['x'], w2v_df['y'])
ax.set_title('Dimension Reduced 2D Visualization of Word Similarities from Gensim Word2Vec Model')

for word, pos in w2v_df.iterrows():
    ax.annotate(word, pos)