# Machine Learning Model - First Segment Project Deliverable

## Model Plan

1. Prepare the dataframe with columns: tweet text, price previous day, price next day, price_diff
2. Preprocess the tweet text into features (CountVectorizer, TF-IDF)
    1. Classification: predict if it goes up or down (Multinomial Naive Bayes)
    2. Regress the actual price difference (RandomForests, if time allows XGBoost or lightgbm)
3. Evaluate algorithms and discuss results

### 1. Prepare the dataframe with columns: tweet text, price previous day, price next day, price_diff

In [1]:
# Set-up Imports:

import requests
import pandas as pd

In [2]:
# Import data
# Load the tweets and keep only the posting date and the tweet text.
tweets = pd.read_csv('https://raw.githubusercontent.com/angkohtenko/twitter_vs_stocks/kimberly_branch/Data/elon_tweets.csv')
# .copy() makes this an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
tweets = tweets[['date', 'text']].copy()

# import API key for financialmodelingprep.com (kept outside the notebook, in config.py)
from config import API_key_stocks

url = 'https://financialmodelingprep.com/api/v3/historical-price-full/TSLA?serietype=line&apikey='+ API_key_stocks

# Get data from API for Tesla stocks and reformat it.
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors here rather than as an opaque failure below.
response = requests.get(url, timeout=30)
response.raise_for_status()
tesla = response.json()
if 'historical' not in tesla:
    # Same exception type the lookup below would raise, but with a readable message
    # (the URL is deliberately not included because it contains the API key).
    raise KeyError("'historical' missing from the financialmodelingprep response; keys received: " + str(list(tesla)))

tesla_df = pd.DataFrame.from_dict(data=tesla['historical'])
tesla_df['date'] = pd.to_datetime(tesla_df.date)
# Upsample to one row per calendar day; days that have no row of their own in the
# API data are forward-filled with the most recent earlier values.
tesla_df = tesla_df.set_index('date').resample('D').ffill().reset_index()

In [3]:
# Reformat date and time types: parse the tweet dates once, then derive
# the calendar day before and the calendar day after each tweet.
from datetime import timedelta

tweet_dates = pd.to_datetime(tweets.date)
one_day = timedelta(days=1)

tweets['prev_date'] = tweet_dates - one_day
tweets['next_date'] = tweet_dates + one_day

In [4]:
# Check for NaN: show the shape that remains once every row containing a missing
# value is dropped. Only a temporary copy is inspected; `tweets` itself is unchanged.
tweets_without_nan = tweets.dropna()
tweets_without_nan.shape

(849, 4)

In [5]:
# Merge Dataframes
# Attach the Tesla row for the day before and for the day after each tweet.
# Left joins keep every tweet even when no price row matches its date.
# The left-hand suffix is '' in both merges, so columns already present keep their
# names and clashing columns from tesla_df receive '_prev' / '_next'.
tweets_price = tweets.merge(
    tesla_df,
    how='left',
    left_on='prev_date',
    right_on='date',
    suffixes=('', '_prev'),
)
tweets_price = tweets_price.merge(
    tesla_df,
    how='left',
    left_on='next_date',
    right_on='date',
    suffixes=('', '_next'),
)

In [6]:
# Rename Columns
# After the two merges the unsuffixed 'close' came from the previous-day join and
# 'close_next' from the next-day join. Keep only the columns the models need,
# add the next-minus-previous price difference and drop rows with missing values.
# Written as one non-mutating chain so no column is assigned onto a slice
# (which can raise SettingWithCopyWarning) and nothing is modified in place.
# NOTE(review): the price table was forward-filled per calendar day upstream, so when
# next_date has no quote of its own, close_next repeats an earlier close and the
# difference can come out as exactly 0 - confirm this is the intended behaviour.
tweets_price = (
    tweets_price
    .rename(columns={'close': 'close_prev'})
    .loc[:, ['date', 'text', 'close_prev', 'close_next']]
    .assign(close_price_diff=lambda frame: frame['close_next'] - frame['close_prev'])
    .dropna()
)
tweets_price.shape

(845, 5)

## A - Classification: Which tweets increase stock price vs decrease
    -Preprocess the tweet text into features (CountVectorizer, TF-IDF)
    -Classification: predict if it goes up or down (Multinomial Naive Bayes)

In [7]:
# Setting up libraries for model

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Text classifier: raw tweet -> token counts -> TF-IDF weights -> multinomial Naive Bayes.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),    # count the tokens of each tweet
        ('tfidf', TfidfTransformer()),  # re-weight the counts with TF-IDF
        ('clf', MultinomialNB()),       # classify from the weighted counts
    ]
)

In [8]:
# Setting up variables
# X: the raw tweet texts. y: 1 when the price difference is strictly positive, otherwise 0
# (so a difference of exactly zero is labelled 0).
X = tweets_price['text'].tolist()
y = tweets_price['close_price_diff'].gt(0).astype(int).to_numpy()

In [9]:
# Separating data from training vs testing data
# A fixed random_state makes the shuffle, and every score computed from this split,
# reproducible when the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
# Classify text data: fit the whole pipeline (vectorizer, TF-IDF, classifier) on the training tweets
text_clf.fit(X=X_train, y=y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [11]:
# Testing predicted probability
# predict_proba returns one column per class; column 1 corresponds to
# text_clf.classes_[1], which is label 1 (price went up) for this 0/1 target.
class_proba_test = text_clf.predict_proba(X_test)
predicted_proba_test = class_proba_test[:, 1]

In [12]:
# Adding results into DataFrame
# One row per test tweet: predicted probability of label 1, the tweet text and the true label.
results_test = pd.DataFrame({
    'proba_positive_tweet': predicted_proba_test,
    'tweet': X_test,
    'label': y_test
})
# Highest predicted probability first.
results_test = results_test.sort_values(by='proba_positive_tweet', ascending=False)

# Display full tweet texts instead of truncating them (this is a global pandas option).
pd.set_option('display.max_colwidth', None)
results_test.head()

Unnamed: 0,proba_positive_tweet,tweet,label
143,0.848459,RT @SpaceX: Falcon 9’s first stage has landed on the Of Course I Still Love You droneship https://t.co/szO3thMxqa,0
45,0.823316,RT @SpaceX: Falcon 9’s first stage lands on the Of Course I Still Love You droneship https://t.co/hd0IBPX3T5,0
122,0.821067,RT @SpaceX: Falcon 9’s first stage has landed on the Of Course I Still Love You droneship – the 9th landing of this booster https://t.co/wz…,0
14,0.699693,RT @SpaceX: Falcon 9 lands on Just Read the Instructions after delivering GPS III Space Vehicle 03 to orbit for the @SpaceForceDoD https://…,0
142,0.690645,RT @SpaceX: Falcon 9’s first stage has landed on Landing Zone 1! https://t.co/26M9Ptomg7,0


## B. Regression - Predict the stock price difference

In [13]:
# Setting up libraries for regression model

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
# Text regressor: raw tweet -> token counts -> TF-IDF weights -> random forest.
text_reg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # The forest is randomised; a fixed seed makes the fitted model and its score
    # repeatable when the notebook is re-run.
    ('reg', RandomForestRegressor(random_state=42)),
])

In [14]:
# Setting up variables
# X: the raw tweet texts. y: the next-minus-previous closing price difference itself.
X = tweets_price['text'].tolist()
y = tweets_price.loc[:, 'close_price_diff']

In [15]:
# Separating data from training vs testing data
# A fixed random_state makes the shuffle, and every score computed from this split,
# reproducible when the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [16]:
# Training: fit the whole regression pipeline on the training tweets
text_reg.fit(X=X_train, y=y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('reg', RandomForestRegressor())])

In [17]:
# Sklearn - score on the held-out data.
# For a regressor, score() is R^2 (coefficient of determination), not accuracy;
# a negative value means the model does worse than always predicting the mean.
text_reg.score(X=X_test, y=y_test)

-0.05211284698575924

## Multiclass classification: price-change quartiles, from most negative to most positive

In [18]:
# Setting up libraries for multiclass classification model of text data

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
# Fresh, unfitted classifier for the multiclass target:
# token counts -> TF-IDF weights -> multinomial Naive Bayes.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [19]:
#Setting up variables for classification
X = tweets_price.text.tolist()
# Bin the price difference into 4 equal-frequency classes (quartiles) once, so the
# printed intervals and the integer labels are guaranteed to come from the same binning.
price_diff_quartiles = pd.qcut(tweets_price['close_price_diff'], 4)
print(price_diff_quartiles.dtype.categories.tolist())
# Integer code of each row's interval (0 = lowest quartile). These are the same values
# qcut(..., labels=False) returns as long as the column has no missing values
# (rows with NaN were dropped when tweets_price was built).
y = price_diff_quartiles.cat.codes.astype('int64')

[Interval(-88.111, -7.95, closed='right'), Interval(-7.95, 0.0, closed='right'), Interval(0.0, 12.3, closed='right'), Interval(12.3, 124.04, closed='right')]


In [20]:
# Separating data from training vs testing data
# A fixed random_state makes the shuffle, and every score computed from this split,
# reproducible when the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [21]:
# Fit on the training tweets, then report the mean accuracy on the held-out tweets.
# fit() returns the fitted pipeline, so the two steps can be chained.
text_clf.fit(X_train, y_train).score(X_test, y_test)

0.33962264150943394

## Setting up Clusters for Analysis - kmeans

In [22]:
# Setting up imports for kmeans clusters


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
# Fresh, unfitted classifier for the cluster labels:
# token counts -> TF-IDF weights -> multinomial Naive Bayes.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [23]:
#Setting up variables for clusters
X = tweets_price.text.tolist()
# KMeans expects a 2-D array, so the price differences become a single-feature column.
# Built once and reused instead of reshaping the same values twice.
price_diff_column = tweets_price['close_price_diff'].values.reshape(-1, 1)
# Centroid initialisation is random: without a fixed seed both the cluster boundaries
# and the numbering of the labels can change from one run to the next.
km = KMeans(n_clusters=3, random_state=42)
# fit_predict fits the model and returns the cluster id of every row in one call.
y = km.fit_predict(price_diff_column)

In [24]:
# Separating data from training vs testing data
# A fixed random_state makes the shuffle, and every score computed from this split,
# reproducible when the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [25]:
# Fit on the training tweets, then report the mean accuracy on the held-out tweets.
# fit() returns the fitted pipeline, so the two steps can be chained.
# NOTE(review): judge this number against the share of the largest cluster in the
# value counts that follow - always predicting that cluster would score about that share.
text_clf.fit(X_train, y_train).score(X_test, y_test)

0.7264150943396226

In [26]:
# Checking value counts for labels.
# KMeans numbers its clusters arbitrarily, so the meaning of each label has to be read
# from the matching row of km.cluster_centers_. In the run whose output is saved with
# this notebook the mapping was 0: neutral 1: positive 2: negative effect on stocks.
label_counts = pd.Series(y).value_counts()
label_counts

0    596
1    163
2     86
dtype: int64

In [27]:
# Viewing Cluster centers: row i is the centre of cluster label i, expressed as a
# close_price_diff value (the model was fitted on that single column)
km.cluster_centers_

array([[ -0.84726102],
       [ 40.05287325],
       [-51.20509231]])