# Tweet Classification Modelling

## Model Plan

- Prepare the dataframe with columns: tweet_text, price_previous day, price_next day, price_diff
- Preprocess the tweet text into features (countVectorier, tfidf)
- Classification: LogisticRegression

### Query the dataframe with columns: tweet text, price previous day, price next day, price_diff

In [1]:
# Setting up libraries:

import pandas as pd
from sqlalchemy import create_engine, inspect 

from config import user, password, hostname

In [2]:
# Create engine
engine = create_engine(f'postgresql+psycopg2://{user}:{password}@{hostname}/twitter_vs_stocks')

In [3]:
#Retrieving dataset from database

tweets_price = pd.read_sql_query(
    """
        SELECT 
            tweets.text AS tweet_text,
            tweets.tokenized_text AS tweet_tokens,
            COALESCE(stock_prev.close, stock_prev_prev.close, stock_prev_prev_prev.close) AS prev_day_close,
            COALESCE(stock_next.close, stock_next_next.close, stock_next_next_next.close) AS next_day_close
        FROM tweets_text tweets
        LEFT JOIN stock stock_prev
            ON (tweets.date - INTERVAL '1 day') = stock_prev.date
        LEFT JOIN stock stock_prev_prev
            ON (tweets.date - INTERVAL '2 day') = stock_prev_prev.date
        LEFT JOIN stock stock_prev_prev_prev
            ON (tweets.date - INTERVAL '3 day') = stock_prev_prev_prev.date
        LEFT JOIN stock stock_next
            ON (tweets.date + INTERVAL '1 day') = stock_next.date
        LEFT JOIN stock stock_next_next
            ON (tweets.date + INTERVAL '2 day') = stock_next_next.date
        LEFT JOIN stock stock_next_next_next
            ON (tweets.date + INTERVAL '3 day') = stock_next_next_next.date
        WHERE tweets.date > '2011-01-01' AND tweets.tokenized_text != '{}'
        ORDER BY tweets.date
    """,
    con=engine
)

tweets_price.dropna(inplace=True)

#Computing difference between the stock price before the date of tweet and after the post. 
tweets_price['close_price_diff'] = tweets_price['next_day_close'] - tweets_price['prev_day_close']

In [4]:
tweets_price = tweets_price[tweets_price.tweet_tokens.str.count(',') > 1] # More than two words in tweet
tweets_price.head()

Unnamed: 0,tweet_text,tweet_tokens,prev_day_close,next_day_close,close_price_diff
0,{I made the volume on the Model S http://t.co...,"{made,volume,model,go,need,work,miniature,ston...",6.548,6.66,0.112
1,"{That was a total non sequitur btw, Great Volt...","{total,non,sequitur,great,voltaire,quote,argua...",6.66,6.884,0.224
2,{Am reading a great biography of Ben Franklin ...,"{reading,great,biography,ben,franklin,isaacson...",6.66,6.884,0.224
3,{Yum! Even better than deep fried butter: htt...,"{yum,even,better,deep,fried,butter,yeah,really...",5.58,5.554,-0.026
4,{Model S options are out! Performance in red a...,"{model,options,performance,red,black,deliver,c...",5.514,5.58,0.066


## Classification: Which tweets increase stock price vs decrease

In [5]:
# Setting up libraries for model

#CountVectorizer = this takes in a list and counts how many times it appears
#TfidfTransformer = frequency of word (less is better for results)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

text_clf = Pipeline([
    ('vect', CountVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x)),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(C=0.01, random_state=1)),
])

In [6]:
# Setting up variables
X = tweets_price.tweet_tokens.tolist()
X_text = tweets_price.tweet_text.tolist()
y = (tweets_price['close_price_diff'] > 0).astype(int).values

In [7]:
# Separating data from training vs testing data
X_train, X_test, y_train, y_test, X_text_train, X_text_test = train_test_split(X, y, X_text, random_state=1)

In [8]:
# Classify text data
text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(preprocessor=<function <lambda> at 0x7f9a537db430>,
                                 tokenizer=<function <lambda> at 0x7f9a55030c10>)),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(C=0.01, random_state=1))])

In [9]:
text_clf.score(X_test, y_test)

0.5736434108527132

In [10]:
# Testing predicted probability
predicted_proba_test = text_clf.predict_proba(X_test)[:, 1]

In [11]:
# Adding results into DataFrame
results_test = pd.DataFrame({
    'proba_positive_tweet': predicted_proba_test,
    'tweet_text': X_text_test,
}).sort_values('proba_positive_tweet', ascending=False)
pd.set_option('display.max_colwidth', None)
results_test.head(10)

Unnamed: 0,proba_positive_tweet,tweet_text
262,0.553118,"{Star Light, Star Bright https://t.co/6CeTAZSXCO}"
56,0.552274,"{Worth reading Life 3.0 by @Tegmark. AI will be the best or worst thing ever for humanity, so let’s get it right. https://t.co/lT0uMH3ujZ}"
103,0.55167,"{RT @Tesla: Configurator now live in Hungary 🇭🇺 &amp; Romania 🇷🇴, RT @SpaceX: SpaceX’s fifth high-altitude flight test of Starship from Starbase in Texas https://t.co/FnrXuHpsVj}"
86,0.551625,"{About 5 mins from flight attempt, Starhopper flight currently tracking to 5pm Texas time for 150m / ~500ft hover test}"
106,0.551546,{Over 1000 km should be possible in a 100D with the right tires https://t.co/8czN3dVZE4}
219,0.5515,{US govt testing by @NHTSAgov finds Model X to be the safest SUV in history by significant margin https://t.co/zAdb5FQPEI}
133,0.551442,{Regarding the meeting at the White House: https://t.co/8b1XH4oW6h}
199,0.551121,{Fourth rocket arrives in the hangar. Aiming for first reflight in Sept/Oct. https://t.co/TqW8d6Cc3U}
198,0.551069,{First test flight hop of our Grasshopper VTVL rocket! http://t.co/oomI5vSB}
276,0.550941,"{Great progress by Starship Cape team. Started several months behind, but catching up fast. This will be a super fun race to orbit, moon &amp; Mars!}"
