In [None]:
#[Joe] 1. Acquire and process stock data
#[Alejandra]x 2. Acquire and process sentiment analysis data
#[Session] 3. Run baseline
#[Joe]x 4. Run sentiment analysis with LinearDiscreminateAnalysis
#[Edward] 5. Run sentiment analysis with alternate classifier
#[Edward] 6. Combine results
#[Session] 7. Put together PPT preso

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from textblob import TextBlob
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import yfinance as yf
import warnings
from pandas.tseries.offsets import DateOffset
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Ignore all warnings within this code block
warnings.filterwarnings("ignore")

In [None]:
## Acquire and process sentiment data from Kaggle
sentiment_df=pd.read_csv(Path("../Resources/combined_csv.csv"))
sentiment_df.head()

In [None]:
# obtain polarity and subjectivity scores (potentially factor volume of information in the sentiment analysis)

# create a function to calculate the subjectivity
def calculate_subjectivity(headlines):
    return TextBlob(headlines).sentiment.subjectivity

# create a function to calculate the subjectivity
def calculate_polarity(headlines):
    return TextBlob(headlines).sentiment.polarity


In [None]:
# create two new columns "Subjectivity" and "Polarity"
sentiment_df["Subjectivity"] = sentiment_df["Headline"].apply(calculate_subjectivity)
sentiment_df["Polarity"] = sentiment_df["Headline"].apply(calculate_polarity)

In [None]:
# create function to get the sentiment scores 
def get_scores(headlines):
    get_score= SentimentIntensityAnalyzer()
    sentiment=get_score.polarity_scores(headlines)
    return sentiment

In [None]:
# get daily sentiment scores
compound = []
neg = []
pos = []
neu = []
score = 0

for x in range(0, len(sentiment_df["Headline"])):
    score = get_scores(sentiment_df["Headline"][x])
    compound.append(score["compound"])
    neg.append(score["neg"])
    neu.append(score["neu"])
    pos.append(score["pos"])
    

In [None]:
# Add Column with sentiment scores
sentiment_df["compound"]= compound
sentiment_df["neg"]= neg
sentiment_df["pos"]= pos
sentiment_df["neu"]= neu

#Display Dataframe
sentiment_df.tail()

In [None]:
# Calculate daily returns 
sentiment_df["daily returns"]= sentiment_df["TSLA Close"].pct_change()
sentiment_df.drop(columns=["SP500 Close", "Volume"], inplace = True)
sentiment_df= sentiment_df.set_index("Time")
sentiment_df.index= pd.to_datetime(sentiment_df.index)

In [None]:
# create label column add 1 when daily returns is positive and 0 when it is negative
sentiment_df["label"]= 0
sentiment_df.loc[(sentiment_df["daily returns"]> 0), 'label'] = 1

# verify that label is int
sentiment_df["label"].dtype

In [None]:
# calculate SMA short and SMA slow
sentiment_df["sma_short"]= sentiment_df.rolling(7)["TSLA Close"].mean()
sentiment_df["sma_long"]= sentiment_df.rolling(30)["TSLA Close"].mean()
sentiment_df.dropna(inplace=True)

In [None]:
# display dataframe
sentiment_df.columns

In [None]:
# create features variable with columns for X
features=['Volume.1', 'Subjectivity', 'Polarity',
       'compound', 'neg', 'pos', 'neu', 'sma_short',
       'sma_long']

X = sentiment_df[features].shift().dropna().copy()
X.tail()

In [None]:
# assign label to y 
# y = sentiment_df["label"][1:]

In [None]:
y=(sentiment_df['daily returns']>0).astype(int)[1:]

In [None]:
# verify x and y are same length

display(len(X))
display(len(y))

In [None]:
# Select the start of the training period
training_begin = X.index.min()

# Display the training begin date
print(training_begin)

# Select the ending period for the training data with an offset of 3 months
training_end = X.index.min() + DateOffset(months=2)

# Display the training end date
print(training_end)

# Generate the X_train and y_train DataFrames
X_train = X.loc[training_begin:training_end]
y_train = y.loc[training_begin:training_end]

# Review the X_train DataFrame
X_train.head()

# Generate the X_test and y_test DataFrames
X_test = X.loc[training_end+DateOffset(hours=1):]
y_test = y.loc[training_end+DateOffset(hours=1):]

# Review the X_test DataFrame
X_train.head()

In [157]:
## Scale the features DataFrames

# Create a StandardScaler instance
scaler = StandardScaler()

# Apply the scaler model to fit the X-train data
X_scaler = scaler.fit(X_train)

# Transform the X_train and X_test DataFrames using the X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [158]:
## Use a classifier to predict future results

# From SVM, instantiate SVC classifier model instance
svm_model = svm.SVC()
 
# Fit the model to the data using the training data
svm_model = svm_model.fit(X_train_scaled, y_train)
 
# Use the testing data to make the model predictions
svm_pred = svm_model.predict(X_test_scaled)

# Review the model's predicted values
svm_pred[:10]

# Use a classification report to evaluate the model using the predictions and testing data
svm_testing_report = classification_report(y_test, svm_pred)

# Print the classification report
print(svm_testing_report)


              precision    recall  f1-score   support

           0       0.48      1.00      0.65       278
           1       0.00      0.00      0.00       300

    accuracy                           0.48       578
   macro avg       0.24      0.50      0.32       578
weighted avg       0.23      0.48      0.31       578



In [None]:
sentiment_df["TSLA Close"].plot()

In [None]:
y_test.value_counts()

In [None]:
# Create a predictions DataFrame
predictions_sentiment_df = pd.DataFrame(index=X_test.index)

# Add the SVM model predictions to the DataFrame
predictions_sentiment_df['Predicted'] = svm_pred

# Add the actual returns to the DataFrame
predictions_sentiment_df['Actual Returns'] = sentiment_df['daily returns']

# Add the strategy returns to the DataFrame
predictions_sentiment_df['Strategy Returns'] = predictions_sentiment_df['Actual Returns'] * predictions_sentiment_df['Predicted'].shift()

# Review the DataFrame
display(predictions_sentiment_df.head())
display(predictions_sentiment_df.tail())

In [160]:
model = LinearDiscriminantAnalysis()

model.fit(X_train_scaled, y_train)

In [162]:
predictions = model.predict(X_test_scaled)
predictions

array([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [164]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.50      0.72      0.59       278
           1       0.57      0.35      0.43       300

    accuracy                           0.52       578
   macro avg       0.54      0.53      0.51       578
weighted avg       0.54      0.52      0.51       578



In [None]:
## Plot cummulative returns for the baseline model

# Plot the actual returns versus the strategy returns
baseline_plot = (1 + predictions_df[["Actual Returns", "Strategy Returns"]]).cumprod()
baseline_plot.plot()

# Save the baseline plot to a file
