# Homework 7 - Stock and Google Search Correlation Analysis 2
## Group 1
## 20 July 2021

### Introduction

Code imports and performs analysis on daily stock price and interest over time using machine learning
* GameStop (GME) <br>
* Apple (AAPL) <br>
* Coke (KO)<br>
* John Deere (DE) <br>
* AMC (AMC) <br>


### Import

In [501]:
import yfinance as yf
import pandas as pd
import numpy as np
import os
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from datetime import date

from datetime import timedelta
from pytrends.request import TrendReq

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, fbeta_score, classification_report
import random as rnd

from sklearn.cluster import KMeans                      # k-means clustering
from sklearn.model_selection import train_test_split    # For generating test/train
from sklearn.linear_model import LinearRegression   # Logistic regression
from sklearn.neighbors import KNeighborsClassifier

import math
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score, mean_squared_error

from sklearn.linear_model import LogisticRegression
%matplotlib inline

### Global Varables and Initialization

In [502]:
dataDir = r"./Data Files/"  #Directory of all data

today = date.today()  # Todays date

rnd.seed(1024)

### Global Functions

In [503]:
# Function gets stock data and trend data if needed

# Function gets stock data and trend data if needed

def get_data(ticker):
    if os.path.exists(f"{dataDir}{ticker}_{today}_year.csv"):

        #Get stored data
        stored_data = pd.read_csv(f"{dataDir}{ticker}_{today}_year.csv")

        # Get rid of index name
        stored_data.set_index('Unnamed: 0', inplace=True)
        stored_data.index.name = None

        return stored_data
    else:
        #Get new data

        # Connect to Google API
        pytrends = TrendReq(hl='en-US', tz=360)

        # Set Keyword
        kw_list = [ticker]

        # Google API only shows last 90 days so need to intirate
        # Set start of interval
        date90front = date.today()
        # Initiate dataframe
        trend_data = pd.DataFrame()

        for x in range(4):
            # Set start end of interval
            date90back = date90front - timedelta(days=90)
            # Build Payload of 90 days
            pytrends.build_payload(kw_list,
                                   timeframe=f'{date90back} {date90front}',
                                   geo='')
            trend_90 = pytrends.interest_over_time()
            trend_data = pd.concat([trend_90, trend_data])
            date90front = date90back

        # Get Stock Data
        stock_data = yf.download(ticker,
                                 start=date.today() - timedelta(days=360),
                                 end=date.today(), interval="1d")

        # Combine Data
        new_data = stock_data.join(trend_data)

        # Export to data folder
        new_data.to_csv(f"{dataDir}{ticker}_{today}_year.csv")

        return new_data
    

### Data and Analysis

#### Gamestop(GME)
Connor Moore

##### Get Data

In [504]:
GME_DF = get_data("GME")
GME_DF

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,GME,isPartial
2020-07-27,4.020000,4.120000,3.950000,4.010000,4.010000,2472700,23.0,False
2020-07-28,3.960000,4.050000,3.920000,3.940000,3.940000,4555400,16.0,False
2020-07-29,3.940000,4.180000,3.920000,4.060000,4.060000,2879600,22.0,False
2020-07-30,4.000000,4.230000,3.970000,4.100000,4.100000,2398500,17.0,False
2020-07-31,4.060000,4.160000,3.990000,4.010000,4.010000,1879400,16.0,False
...,...,...,...,...,...,...,...,...
2021-07-13,187.679993,188.789993,179.000000,180.059998,180.059998,2397900,25.0,False
2021-07-14,180.490005,182.380005,165.070007,167.619995,167.619995,3913800,27.0,False
2021-07-15,160.000000,171.990005,158.009995,166.820007,166.820007,4298600,31.0,False
2021-07-16,170.149994,179.470001,166.300003,169.039993,169.039993,3278800,29.0,False


##### Prepare Data

In [505]:
# Rename search interest
GME_DF.rename(columns = {"GME": "Search Interest"},inplace = True)

# Add difference
GME_DF["Price Difference"] = GME_DF["Open"]-GME_DF["Close"]

# Add truth value that determines if we want to buy or not that day
GME_DF['Buy'] = np.where(GME_DF['Price Difference'] > 0, 1, 0)

# Delete isPartial

del GME_DF['isPartial']

In [506]:
# Check values - no nulls - int or float

GME_DF.info()

<class 'pandas.core.frame.DataFrame'>
Index: 250 entries, 2020-07-27 to 2021-07-19
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Open              250 non-null    float64
 1   High              250 non-null    float64
 2   Low               250 non-null    float64
 3   Close             250 non-null    float64
 4   Adj Close         250 non-null    float64
 5   Volume            250 non-null    int64  
 6   Search Interest   249 non-null    float64
 7   Price Difference  250 non-null    float64
 8   Buy               250 non-null    int64  
dtypes: float64(7), int64(2)
memory usage: 19.5+ KB


In [507]:
# Set features to target "Buy"

features = list(GME_DF.columns)
features.remove("Buy")
target = "Buy"

print(f"Feature categories: {features}")
print(f"Target feature: {target}")

Feature categories: ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Search Interest', 'Price Difference']
Target feature: Buy


In [508]:
X = GME_DF[features]
X

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Search Interest,Price Difference
2020-07-27,4.020000,4.120000,3.950000,4.010000,4.010000,2472700,23.0,0.010000
2020-07-28,3.960000,4.050000,3.920000,3.940000,3.940000,4555400,16.0,0.020000
2020-07-29,3.940000,4.180000,3.920000,4.060000,4.060000,2879600,22.0,-0.120000
2020-07-30,4.000000,4.230000,3.970000,4.100000,4.100000,2398500,17.0,-0.100000
2020-07-31,4.060000,4.160000,3.990000,4.010000,4.010000,1879400,16.0,0.050000
...,...,...,...,...,...,...,...,...
2021-07-13,187.679993,188.789993,179.000000,180.059998,180.059998,2397900,25.0,7.619995
2021-07-14,180.490005,182.380005,165.070007,167.619995,167.619995,3913800,27.0,12.870010
2021-07-15,160.000000,171.990005,158.009995,166.820007,166.820007,4298600,31.0,-6.820007
2021-07-16,170.149994,179.470001,166.300003,169.039993,169.039993,3278800,29.0,1.110001


In [509]:
y = GME_DF[target]
y


2020-07-27    1
2020-07-28    1
2020-07-29    0
2020-07-30    0
2020-07-31    1
             ..
2021-07-13    1
2021-07-14    1
2021-07-15    0
2021-07-16    1
2021-07-19    0
Name: Buy, Length: 250, dtype: int64

In [510]:
# Set training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

print(f"Length of X_train (feature training set): {len(X_train)}")
print(f"Length of y_train (target training set): {len(y_train)}")
print(f"Length of X_test (feature test set): {len(X_test)}")
print(f"Length of y_test (target test set): {len(y_test)}")

Length of X_train (feature training set): 187
Length of y_train (target training set): 187
Length of X_test (feature test set): 63
Length of y_test (target test set): 63


##### Analysis 2

#### Apple (AAPL)
Ken Cupples

##### Get Data

##### Analysis 1

##### Analysis 2

### Shawn Sonnack
AMC

#### Purpose

We are looking to see if there is any correlation to google search interest with stock price changes.  In this data set we have a pulled a number from google between 0 - 100.  At 0 it means that there was little to no traffic compared to normal operation.  At 100 it means that the search traffic for that day was extremely high.

First I will dig in to see if search interest and the price it opens at can predict the close price of the day.

#### Common methods

In [511]:
def printMetrics(test, predictions):
    print(f"Score: {explained_variance_score(test, predictions):.2f}")
    print(f"MAE: {mean_absolute_error(test, predictions):.2f}")
    print(f"RMSE: {math.sqrt(mean_squared_error(test, predictions)):.2f}")
    print(f"r2: {r2_score(test, predictions):.2f}")

def printClassificationMetrics(test, predictions):
    print("Confusion Matrix:")
    print(confusion_matrix(test, predictions))
    print("------------------")
    print(f"Accuracy: {accuracy_score(test, predictions):.2f}")
    print(f"Recall: {recall_score(test, predictions):.2f}")
    print(f"Prediction: {precision_score(test, predictions):.2f}")
    print(f"f-measure: {fbeta_score(test, predictions, beta=1):.2f}")
    print("------------------")
    print(classification_report(test, predictions))


def prepareDataForPredictions(amcDataFrame):
    numElements = 3
    amcStockPreparedData = []
    for _ in range(numElements):
        dict = {}
        for column in X.columns:
            min = 0  # assume min = 0
            maxValue = round(max(amcDataFrame[column].values))
            dict[column] = rnd.randint(min, maxValue)
        amcStockPreparedData.append(dict)
    return amcStockPreparedData

#### Pull in prepared data for AMC stock: January 1 - June 30

In [512]:
amcMergedDataFrame = pd.read_csv('Data Files/AMCDataClean.zip')
amcMergedDataFrame

Unnamed: 0,Search Interest,Open,Close,Volume,Amount Changed,Days Spread,Price Increase,Search Interest Above Avg
0,2,2.200000,2.010000,29873800,0.190000,0.200000,1,0
1,3,1.990000,1.980000,28148300,0.010000,0.120000,1,0
2,2,2.030000,2.010000,67363300,0.020000,0.260000,1,0
3,2,2.080000,2.050000,26150500,0.030000,0.090000,1,0
4,3,2.090000,2.140000,39553300,-0.050000,0.140000,0,0
...,...,...,...,...,...,...,...,...
118,17,57.040001,58.299999,116291800,-1.259998,4.299999,0,1
119,16,57.980000,56.700001,80351200,1.279999,3.099998,1,1
120,19,55.750000,54.060001,77596900,1.689999,3.320000,1,1
121,16,55.099998,58.110001,99310200,-3.010002,5.029999,0,1


#### Linear Regression Setup

In [513]:
featureColumns=['Search Interest', 'Open']
targetColumn = 'Close'

X=amcMergedDataFrame[featureColumns]
y=amcMergedDataFrame[targetColumn]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

#### Create the regression

In [514]:
lr = LinearRegression()
lr

LinearRegression()

#### Fit Linear Model

In [515]:
lr.fit(X_train, y_train)

LinearRegression()

#### How confident are we in our model?

In [516]:
lr.score(X_train, y_train)

0.9691667374025036

In [517]:
lr.score(X_test, y_test)

0.9709538923793088

#### Print the prediction believed accuracy using the model

In [518]:
predictions = lr.predict(X_test)
printMetrics(y_test, predictions)

Score: 0.97
MAE: 1.91
RMSE: 3.13
r2: 0.97


#### Create new samples, to test our model

In [519]:
amcStockPreparedData = prepareDataForPredictions(amcMergedDataFrame)

#### Prepare the predictions for consumption

In [520]:
amcPreparedData = pd.DataFrame.from_dict(amcStockPreparedData)
amcPreparedData

Unnamed: 0,Search Interest,Open
0,2,30
1,49,20
2,66,6


#### Predict what the close price will be

In [521]:
predictions = lr.predict(amcPreparedData)
predictions

array([29.84559392, 22.07508896,  8.74029546])

#### Make it pretty

In [522]:
amcPredictedPrice = amcPreparedData.copy()
amcPredictedPrice['Price Prediction'] = predictions
amcPredictedPrice

Unnamed: 0,Search Interest,Open,Price Prediction
0,2,30,29.845594
1,49,20,22.075089
2,66,6,8.740295


#### Classical - Logistic Regression

In [523]:
amcMergedDataFrame

Unnamed: 0,Search Interest,Open,Close,Volume,Amount Changed,Days Spread,Price Increase,Search Interest Above Avg
0,2,2.200000,2.010000,29873800,0.190000,0.200000,1,0
1,3,1.990000,1.980000,28148300,0.010000,0.120000,1,0
2,2,2.030000,2.010000,67363300,0.020000,0.260000,1,0
3,2,2.080000,2.050000,26150500,0.030000,0.090000,1,0
4,3,2.090000,2.140000,39553300,-0.050000,0.140000,0,0
...,...,...,...,...,...,...,...,...
118,17,57.040001,58.299999,116291800,-1.259998,4.299999,0,1
119,16,57.980000,56.700001,80351200,1.279999,3.099998,1,1
120,19,55.750000,54.060001,77596900,1.689999,3.320000,1,1
121,16,55.099998,58.110001,99310200,-3.010002,5.029999,0,1


#### Prepare the Data and logistic Columns

In [524]:
amcMergedDataFrame['Price Increase'] = amcMergedDataFrame['Price Increase'].astype(int)
amcMergedDataFrame['Search Interest Above Avg'] = amcMergedDataFrame['Search Interest Above Avg'].astype(int)
logisticFeatureColumns=['Open', 'Close']
logisticTargetColumn = 'Search Interest Above Avg'

X=amcMergedDataFrame[logisticFeatureColumns]
y=amcMergedDataFrame[logisticTargetColumn]


In [525]:
y

0      0
1      0
2      0
3      0
4      0
      ..
118    1
119    1
120    1
121    1
122    1
Name: Search Interest Above Avg, Length: 123, dtype: int64

#### Train the model with my data from above

In [526]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Create the Logistic Regression

In [527]:
lr = LogisticRegression(solver="liblinear")
lr

LogisticRegression(solver='liblinear')

#### Fit the data

In [528]:
lr.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

#### Score the model

In [529]:
lr.score(X_train, y_train)

0.9021739130434783

In [530]:
lr.score(X_test, y_test)

0.9032258064516129

#### Prepare the predictions

In [531]:
predictions = lr.predict(X_test)
printClassificationMetrics(y_test, predictions)

Confusion Matrix:
[[21  0]
 [ 3  7]]
------------------
Accuracy: 0.90
Recall: 0.70
Prediction: 1.00
f-measure: 0.82
------------------
              precision    recall  f1-score   support

           0       0.88      1.00      0.93        21
           1       1.00      0.70      0.82        10

    accuracy                           0.90        31
   macro avg       0.94      0.85      0.88        31
weighted avg       0.92      0.90      0.90        31



#### Create the data set to use for the predictions

In [532]:
amcStockPreparedData = prepareDataForPredictions(amcMergedDataFrame)
amcPreparedData = pd.DataFrame.from_dict(amcStockPreparedData)
amcPreparedData


Unnamed: 0,Open,Close
0,28,46
1,61,47
2,24,12


Use the dummy dataset to test our prediction

In [533]:
predictions = lr.predict(amcPreparedData)
predictions

array([1, 1, 1])

In [534]:
pdPredictedStockTrend = amcPreparedData
pdPredictedStockTrend["Search Interest Above Average"] = predictions.astype(bool)
pdPredictedStockTrend

Unnamed: 0,Open,Close,Search Interest Above Average
0,28,46,True
1,61,47,True
2,24,12,True


#### Conclusion

The linear regression model worked very well.  From the looks of it I was able to predict with extremely high accuracy what the close price would be for a stock based on the opening price, and search interest for the day.  Although in the real world this would be hard to get as search interest and the price changes are happening at the same time.  In my second model using logistical regression, I am not as confident with its prediction.  I tried to look from the other side to see if I could predict the search interest based on open and closed prices of AMC per day.  I felt like this would be interesting if it would conclude that search interest is high only when the stock is doing well.  This did not prove out as being a trend, but the again the accuracy of the model was high.




### Coke
Arielle Swift

#### Get Data

In [535]:
CokeDataSetQtr2 = get_data("KO")

# Name unnamed date column
CokeDataSetQtr2.reset_index(inplace=True)
CokeDataSetQtr2.rename(columns = {"index": "Date"},inplace = True)
CokeDataSetQtr2

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,KO,isPartial
0,2020-07-27,48.180000,48.509998,48.180000,48.480000,46.968781,17346500,82.0,False
1,2020-07-28,48.340000,49.279999,48.099998,48.180000,46.678131,13872700,82.0,False
2,2020-07-29,48.139999,48.500000,47.820000,48.020000,46.523117,13758100,82.0,False
3,2020-07-30,47.669998,48.230000,47.200001,47.689999,46.203403,17276500,88.0,False
4,2020-07-31,47.439999,47.770000,46.730000,47.240002,45.767437,14849200,89.0,False
...,...,...,...,...,...,...,...,...,...
245,2021-07-13,54.950001,55.299999,54.750000,55.020000,55.020000,15170800,88.0,False
246,2021-07-14,55.020000,56.349998,54.959999,56.259998,56.259998,22002700,91.0,False
247,2021-07-15,56.240002,56.470001,55.910000,56.439999,56.439999,15068200,85.0,False
248,2021-07-16,56.459999,56.680000,56.259998,56.400002,56.400002,14857600,91.0,False


In [536]:
CokeDataSetQtr2['Date']= pd.to_datetime(CokeDataSetQtr2['Date'],format='%Y-%m-%d')
CokeDataSetQtr2['Date'] = CokeDataSetQtr2['Date'].dt.strftime('%m-%d-%Y')
CokeDataSetQtr2


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,KO,isPartial
0,07-27-2020,48.180000,48.509998,48.180000,48.480000,46.968781,17346500,82.0,False
1,07-28-2020,48.340000,49.279999,48.099998,48.180000,46.678131,13872700,82.0,False
2,07-29-2020,48.139999,48.500000,47.820000,48.020000,46.523117,13758100,82.0,False
3,07-30-2020,47.669998,48.230000,47.200001,47.689999,46.203403,17276500,88.0,False
4,07-31-2020,47.439999,47.770000,46.730000,47.240002,45.767437,14849200,89.0,False
...,...,...,...,...,...,...,...,...,...
245,07-13-2021,54.950001,55.299999,54.750000,55.020000,55.020000,15170800,88.0,False
246,07-14-2021,55.020000,56.349998,54.959999,56.259998,56.259998,22002700,91.0,False
247,07-15-2021,56.240002,56.470001,55.910000,56.439999,56.439999,15068200,85.0,False
248,07-16-2021,56.459999,56.680000,56.259998,56.400002,56.400002,14857600,91.0,False


In [537]:
CokeDataSetQtr2 = CokeDataSetQtr2.loc[(CokeDataSetQtr2['Date'] >= '04-01-2021')
                     & (CokeDataSetQtr2['Date'] < '07-01-2021')]
CokeDataSetQtr2.reset_index(drop=True, inplace=True)

In [538]:
CokeDataSetQtr2['InvestToday'] = np.where(CokeDataSetQtr2.Close - CokeDataSetQtr2.Open>0, 1, 0)
CokeDataSetQtr2



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,KO,isPartial,InvestToday
0,04-01-2021,52.959999,53.150002,52.459999,52.509998,52.117294,15834700,84.0,False,0
1,04-05-2021,52.349998,53.220001,52.290001,52.810001,52.415054,16368700,87.0,False,1
2,04-06-2021,53.040001,53.650002,52.900002,53.189999,52.79221,15614300,83.0,False,1
3,04-07-2021,53.279999,53.5,53.119999,53.279999,52.881535,10062700,81.0,False,0
4,04-08-2021,53.169998,53.380001,52.970001,53.119999,52.722733,9695600,83.0,False,0
5,04-09-2021,53.169998,53.279999,52.810001,53.18,52.782284,10828200,85.0,False,1
6,04-12-2021,53.330002,53.549999,53.099998,53.349998,52.951012,8565300,82.0,False,1
7,04-13-2021,53.040001,53.290001,52.810001,53.09,52.692959,11071700,85.0,False,1
8,04-14-2021,52.98,53.189999,52.650002,53.080002,52.683033,9787600,78.0,False,1
9,04-15-2021,53.130001,53.66,53.119999,53.330002,52.931164,13078100,81.0,False,1


In [539]:
Continuous_Cols=[ 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
Categorical_Cols=['InvestToday']

In [540]:
Predictor_Cols = Categorical_Cols + Continuous_Cols

Target_Col = 'InvestToday'

X=CokeDataSetQtr2[Continuous_Cols]
y=CokeDataSetQtr2[Target_Col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
print("Population:\n",y.value_counts(normalize=True)*100)
print("Train:\n", y_train.value_counts(normalize=True)*100)
print("Test:\n", y_test.value_counts(normalize=True)*100)

Population:
 1    51.5625
0    48.4375
Name: InvestToday, dtype: float64
Train:
 1    52.272727
0    47.727273
Name: InvestToday, dtype: float64
Test:
 0    50.0
1    50.0
Name: InvestToday, dtype: float64


In [541]:
knn = KNeighborsClassifier(n_neighbors=3)

In [542]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [543]:
knn.score(X_train, y_train)

0.7045454545454546

In [544]:
knn.score(X_test, y_test)

0.5

#### Confusion Matrix

In [545]:
def printMetrics(test, predictions):
    print("Confusion Matrix:")
    print(confusion_matrix(test, predictions))
    print("------------------")
    print(f"Accuracy: {accuracy_score(test, predictions):.2f}")
    print(f"Recall: {recall_score(test, predictions):.2f}")
    print(f"Prediction: {precision_score(test, predictions):.2f}")
    print(f"f-measure: {fbeta_score(test, predictions, beta=1):.2f}")
    print("------------------")
    print(classification_report(test, predictions))

In [546]:
predictions = knn.predict(X_test)
printMetrics(y_test, predictions)

Confusion Matrix:
[[7 3]
 [7 3]]
------------------
Accuracy: 0.50
Recall: 0.30
Prediction: 0.50
f-measure: 0.37
------------------
              precision    recall  f1-score   support

           0       0.50      0.70      0.58        10
           1       0.50      0.30      0.37        10

    accuracy                           0.50        20
   macro avg       0.50      0.50      0.48        20
weighted avg       0.50      0.50      0.48        20



In [547]:
CokeDataSetQtr2

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,KO,isPartial,InvestToday
0,04-01-2021,52.959999,53.150002,52.459999,52.509998,52.117294,15834700,84.0,False,0
1,04-05-2021,52.349998,53.220001,52.290001,52.810001,52.415054,16368700,87.0,False,1
2,04-06-2021,53.040001,53.650002,52.900002,53.189999,52.79221,15614300,83.0,False,1
3,04-07-2021,53.279999,53.5,53.119999,53.279999,52.881535,10062700,81.0,False,0
4,04-08-2021,53.169998,53.380001,52.970001,53.119999,52.722733,9695600,83.0,False,0
5,04-09-2021,53.169998,53.279999,52.810001,53.18,52.782284,10828200,85.0,False,1
6,04-12-2021,53.330002,53.549999,53.099998,53.349998,52.951012,8565300,82.0,False,1
7,04-13-2021,53.040001,53.290001,52.810001,53.09,52.692959,11071700,85.0,False,1
8,04-14-2021,52.98,53.189999,52.650002,53.080002,52.683033,9787600,78.0,False,1
9,04-15-2021,53.130001,53.66,53.119999,53.330002,52.931164,13078100,81.0,False,1


In [548]:
numElements = 10
SampleStock = []
for _ in range(numElements):
    dict = {}
    for column in X.columns:
        min = 0
        maxValue = round(max(CokeDataSetQtr2[column].values))
        dict[column] = rnd.randint(min, maxValue)
    SampleStock.append(dict)
SampleStock

[{'Open': 45,
  'High': 47,
  'Low': 55,
  'Close': 8,
  'Adj Close': 51,
  'Volume': 26140571},
 {'Open': 6,
  'High': 26,
  'Low': 9,
  'Close': 27,
  'Adj Close': 30,
  'Volume': 29944978},
 {'Open': 47,
  'High': 33,
  'Low': 53,
  'Close': 22,
  'Adj Close': 39,
  'Volume': 40794907},
 {'Open': 5,
  'High': 2,
  'Low': 5,
  'Close': 40,
  'Adj Close': 49,
  'Volume': 5508698},
 {'Open': 6,
  'High': 23,
  'Low': 44,
  'Close': 21,
  'Adj Close': 55,
  'Volume': 227961},
 {'Open': 23,
  'High': 19,
  'Low': 11,
  'Close': 8,
  'Adj Close': 10,
  'Volume': 26115674},
 {'Open': 18,
  'High': 10,
  'Low': 55,
  'Close': 5,
  'Adj Close': 14,
  'Volume': 55051307},
 {'Open': 37,
  'High': 7,
  'Low': 29,
  'Close': 43,
  'Adj Close': 35,
  'Volume': 1591227},
 {'Open': 44,
  'High': 26,
  'Low': 51,
  'Close': 1,
  'Adj Close': 52,
  'Volume': 1439518},
 {'Open': 56,
  'High': 15,
  'Low': 48,
  'Close': 53,
  'Adj Close': 9,
  'Volume': 22392310}]

In [549]:
pdSampleStock = pd.DataFrame.from_dict(SampleStock)
predictions = knn.predict(pdSampleStock)
predictions

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [550]:
pdPredictedInvest = pdSampleStock
pdPredictedInvest["InvestToday?"] = predictions.astype(bool)
pdPredictedInvest

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,InvestToday?
0,45,47,55,8,51,26140571,True
1,6,26,9,27,30,29944978,True
2,47,33,53,22,39,40794907,True
3,5,2,5,40,49,5508698,True
4,6,23,44,21,55,227961,True
5,23,19,11,8,10,26115674,True
6,18,10,55,5,14,55051307,True
7,37,7,29,43,35,1591227,True
8,44,26,51,1,52,1439518,True
9,56,15,48,53,9,22392310,True


In [551]:
lr = LogisticRegression(solver="liblinear")
lr

LogisticRegression(solver='liblinear')

In [552]:
lr.fit(X_train, y_train)


LogisticRegression(solver='liblinear')

In [553]:
lr.score(X_train, y_train)


0.5227272727272727

In [554]:
lr.score(X_test, y_test)

0.5

In [555]:
predictions = lr.predict(X_test)

In [556]:
fig = px.scatter(CokeDataSetQtr2, x='Close', y='Open',color='InvestToday')
fig.show()

### Tesla (TSLA)
Andrew T.

In [557]:
pytrends = TrendReq(hl='en-US', tz=360)

#build list of keywords in this case only use Teslas
kw_list = ["Bitcoin"]

# build the payload
pytrends.build_payload(kw_list, timeframe='2021-03-31 2021-06-29', geo='US')

# store interest over time information in df and rename Tesla column to Search Interest
bitcoinTrendsdf = pytrends.interest_over_time()
bitcoinTrendsdf = bitcoinTrendsdf.rename(columns={'Bitcoin': 'Previous_Search_Interest'})
#telsaStockdf.set_index('date': 'Date', inplace=True)
bitcoinTrendsdf.reset_index(inplace=True, drop=True)
bitcoinTrendsdf

Unnamed: 0,Previous_Search_Interest,isPartial
0,29,False
1,41,False
2,29,False
3,22,False
4,22,False
5,23,False
6,24,False
7,23,False
8,24,False
9,22,False


In [558]:
bitcoinPricedf = pd.read_csv(f"{dataDir}BTC-USD.csv")
bitcoinPreviousPricedf = pd.read_csv(f"{dataDir}BTC-USD-Previous.csv")
mergedPrice = pd.concat([bitcoinPricedf, bitcoinPreviousPricedf], axis=1)
bitcoinPreviousPricedf

Unnamed: 0,Previous_Open,Previous_High,Previous_Low,Previous_Close,Previous_Adj_Close,Previous_Volume
0,58930.277344,59930.027344,57726.417969,58918.832031,58918.832031,65520826225
1,58926.5625,59586.070313,58505.277344,59095.808594,59095.808594,61669163792
2,59098.878906,60267.1875,58869.28125,59384.3125,59384.3125,58727860620
3,59397.410156,60110.269531,57603.890625,57603.890625,57603.890625,59641344484
4,57604.839844,58913.746094,57168.675781,58758.554688,58758.554688,50749662970
5,58760.875,59891.296875,57694.824219,59057.878906,59057.878906,60706272115
6,59171.933594,59479.578125,57646.808594,58192.359375,58192.359375,66058027988
7,58186.507813,58731.144531,55604.023438,56048.9375,56048.9375,75645303584
8,56099.914063,58338.738281,55879.085938,58323.953125,58323.953125,53053855641
9,58326.5625,58937.046875,57807.863281,58245.003906,58245.003906,46655208546


In [559]:
mergedStockPrice = pd.concat([mergedPrice, bitcoinTrendsdf], axis=1)
#mergedStockPrice.reset_index(inplace=True, drop=True)
mergedStockPrice.set_index('isPartial', drop=True)
#mergedStockPrice['Previous_Close'] = mergedStockPrice['Close']
#meanSearchInterest = mergedStockPrice['Search_Interest'].mean()
#mergedStockPrice["Interest Points Away From Mean"] = mergedStockPrice["Search Interest"] - meanSearchInterest
mergedStockPrice["Price_Increase"] = mergedStockPrice["Open"] - mergedStockPrice["Close"] > 0.0
mergedStockPrice["Price_Increase"] = mergedStockPrice["Price_Increase"]*1


#pd.set_option('display.max_rows', len(mergedStockPrice))

mergedStockPrice

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Previous_Open,Previous_High,Previous_Low,Previous_Close,Previous_Adj_Close,Previous_Volume,Previous_Search_Interest,isPartial,Price_Increase
0,2021-04-01,58926.5625,59586.070313,58505.277344,59095.808594,59095.808594,61669163792,58930.277344,59930.027344,57726.417969,58918.832031,58918.832031,65520826225,29,False,0
1,2021-04-02,59098.878906,60267.1875,58869.28125,59384.3125,59384.3125,58727860620,58926.5625,59586.070313,58505.277344,59095.808594,59095.808594,61669163792,41,False,0
2,2021-04-03,59397.410156,60110.269531,57603.890625,57603.890625,57603.890625,59641344484,59098.878906,60267.1875,58869.28125,59384.3125,59384.3125,58727860620,29,False,1
3,2021-04-04,57604.839844,58913.746094,57168.675781,58758.554688,58758.554688,50749662970,59397.410156,60110.269531,57603.890625,57603.890625,57603.890625,59641344484,22,False,0
4,2021-04-05,58760.875,59891.296875,57694.824219,59057.878906,59057.878906,60706272115,57604.839844,58913.746094,57168.675781,58758.554688,58758.554688,50749662970,22,False,0
5,2021-04-06,59171.933594,59479.578125,57646.808594,58192.359375,58192.359375,66058027988,58760.875,59891.296875,57694.824219,59057.878906,59057.878906,60706272115,23,False,1
6,2021-04-07,58186.507813,58731.144531,55604.023438,56048.9375,56048.9375,75645303584,59171.933594,59479.578125,57646.808594,58192.359375,58192.359375,66058027988,24,False,1
7,2021-04-08,56099.914063,58338.738281,55879.085938,58323.953125,58323.953125,53053855641,58186.507813,58731.144531,55604.023438,56048.9375,56048.9375,75645303584,23,False,0
8,2021-04-09,58326.5625,58937.046875,57807.863281,58245.003906,58245.003906,46655208546,56099.914063,58338.738281,55879.085938,58323.953125,58323.953125,53053855641,24,False,1
9,2021-04-10,58253.777344,61276.664063,58038.707031,59793.234375,59793.234375,58238470525,58326.5625,58937.046875,57807.863281,58245.003906,58245.003906,46655208546,22,False,0


In [560]:
mergedStockPrice.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to 90
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Date                      91 non-null     object 
 1   Open                      91 non-null     float64
 2   High                      91 non-null     float64
 3   Low                       91 non-null     float64
 4   Close                     91 non-null     float64
 5   Adj Close                 91 non-null     float64
 6   Volume                    91 non-null     int64  
 7   Previous_Open             91 non-null     float64
 8   Previous_High             91 non-null     float64
 9   Previous_Low              91 non-null     float64
 10  Previous_Close            91 non-null     float64
 11  Previous_Adj_Close        91 non-null     float64
 12  Previous_Volume           91 non-null     int64  
 13  Previous_Search_Interest  91 non-null     int64  
 14  isPartial   

In [561]:
#columns = ["Open", "High", "Low", "Close", "Search Interest", "Interest Points Away From Mean", "Price Increase Points", "Price Increase"]
columns = ["Previous_Open", "Previous_High", "Previous_Low", "Previous_Close", "Previous_Volume", "Previous_Search_Interest", "Close"]
mergedStockPrice = mergedStockPrice[columns]
mergedStockPrice

Unnamed: 0,Previous_Open,Previous_High,Previous_Low,Previous_Close,Previous_Volume,Previous_Search_Interest,Close
0,58930.277344,59930.027344,57726.417969,58918.832031,65520826225,29,59095.808594
1,58926.5625,59586.070313,58505.277344,59095.808594,61669163792,41,59384.3125
2,59098.878906,60267.1875,58869.28125,59384.3125,58727860620,29,57603.890625
3,59397.410156,60110.269531,57603.890625,57603.890625,59641344484,22,58758.554688
4,57604.839844,58913.746094,57168.675781,58758.554688,50749662970,22,59057.878906
5,58760.875,59891.296875,57694.824219,59057.878906,60706272115,23,58192.359375
6,59171.933594,59479.578125,57646.808594,58192.359375,66058027988,24,56048.9375
7,58186.507813,58731.144531,55604.023438,56048.9375,75645303584,23,58323.953125
8,56099.914063,58338.738281,55879.085938,58323.953125,53053855641,24,58245.003906
9,58326.5625,58937.046875,57807.863281,58245.003906,46655208546,22,59793.234375


In [562]:
features = list(mergedStockPrice.columns)
features.remove("Close")
target = "Close"

X = mergedStockPrice[features]
y = mergedStockPrice[target]

In [563]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [564]:
lr = LinearRegression()
lr

LinearRegression()

In [565]:
lr.fit(X_train, y_train)

LinearRegression()

In [566]:
lr.score(X_train, y_train)

0.9600860921909471

In [567]:
lr.score(X_test, y_test)

0.9614613962909508

In [568]:

def printMetrics(test, predictions):
    print(f"Score: {explained_variance_score(test, predictions):.2f}")
    print(f"MAE: {mean_absolute_error(test, predictions):.2f}")
    print(f"RMSE: {math.sqrt(mean_squared_error(test, predictions)):.2f}")
    print(f"r2: {r2_score(test, predictions):.2f}")

In [569]:
predictions = lr.predict(X_test)
printMetrics(y_test, predictions)

Score: 0.96
MAE: 1573.91
RMSE: 1988.37
r2: 0.96


In [570]:
numElements = 3
samplePrice = []
for _ in range(numElements):
    dict = {}
    for column in X.columns:
        min = 0  # We'll always allow at lea
        maxValue = round(max(mergedStockPrice[column].values))
        dict[column] = rnd.randint(min, maxValue)
    samplePrice.append(dict)
samplePrice


[{'Previous_Open': 22546,
  'Previous_High': 30235,
  'Previous_Low': 57402,
  'Previous_Close': 40003,
  'Previous_Volume': 73762452307,
  'Previous_Search_Interest': 88},
 {'Previous_Open': 52291,
  'Previous_High': 52291,
  'Previous_Low': 27932,
  'Previous_Close': 3989,
  'Previous_Volume': 12711220807,
  'Previous_Search_Interest': 49},
 {'Previous_Open': 56616,
  'Previous_High': 6447,
  'Previous_Low': 47647,
  'Previous_Close': 39086,
  'Previous_Volume': 23145546886,
  'Previous_Search_Interest': 27}]

In [571]:
pdSamplePrice = pd.DataFrame.from_dict(samplePrice)
pdSamplePrice


Unnamed: 0,Previous_Open,Previous_High,Previous_Low,Previous_Close,Previous_Volume,Previous_Search_Interest
0,22546,30235,57402,40003,73762452307,88
1,52291,52291,27932,3989,12711220807,49
2,56616,6447,47647,39086,23145546886,27


In [572]:
predictions = lr.predict(pdSamplePrice)
predictions



array([ 8182.24471276,  8411.19921464, 42528.14081048])

In [573]:
pdSamplePrice = pdSamplePrice.copy()
pdSamplePrice['Predicted'] = predictions
pdSamplePrice



Unnamed: 0,Previous_Open,Previous_High,Previous_Low,Previous_Close,Previous_Volume,Previous_Search_Interest,Predicted
0,22546,30235,57402,40003,73762452307,88,8182.244713
1,52291,52291,27932,3989,12711220807,49,8411.199215
2,56616,6447,47647,39086,23145546886,27,42528.14081


# Logarithmic Regression

In [574]:
def printClassificationMetrics(test, predictions):
    print("Confusion Matrix:")
    print(confusion_matrix(test, predictions))
    print("------------------")
    print(f"Accuracy: {accuracy_score(test, predictions):.2f}")
    print(f"Recall: {recall_score(test, predictions):.2f}")
    print(f"Prediction: {precision_score(test, predictions):.2f}")
    print(f"f-measure: {fbeta_score(test, predictions, beta=1):.2f}")
    print("------------------")
    print(classification_report(test, predictions))

In [575]:
pd.set_option('display.max_rows', len(mergedStockPrice))

mergedStockPrice["Price_Increase"] = bitcoinPricedf["Open"] - bitcoinPricedf["Close"] > 0.0
mergedStockPrice['Price_Increase'] = mergedStockPrice.Price_Increase.astype(int)
mergedStockPrice




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Previous_Open,Previous_High,Previous_Low,Previous_Close,Previous_Volume,Previous_Search_Interest,Close,Price_Increase
0,58930.277344,59930.027344,57726.417969,58918.832031,65520826225,29,59095.808594,0
1,58926.5625,59586.070313,58505.277344,59095.808594,61669163792,41,59384.3125,0
2,59098.878906,60267.1875,58869.28125,59384.3125,58727860620,29,57603.890625,1
3,59397.410156,60110.269531,57603.890625,57603.890625,59641344484,22,58758.554688,0
4,57604.839844,58913.746094,57168.675781,58758.554688,50749662970,22,59057.878906,0
5,58760.875,59891.296875,57694.824219,59057.878906,60706272115,23,58192.359375,1
6,59171.933594,59479.578125,57646.808594,58192.359375,66058027988,24,56048.9375,1
7,58186.507813,58731.144531,55604.023438,56048.9375,75645303584,23,58323.953125,0
8,56099.914063,58338.738281,55879.085938,58323.953125,53053855641,24,58245.003906,1
9,58326.5625,58937.046875,57807.863281,58245.003906,46655208546,22,59793.234375,0


In [576]:

columns = ["Previous_Open", "Previous_Close", "Previous_Search_Interest", "Price_Increase"]
mergedStockPrice = mergedStockPrice[columns]
mergedStockPrice

features = list(mergedStockPrice.columns)
features.remove("Price_Increase")
target = "Price_Increase"

X = mergedStockPrice[features]
y = mergedStockPrice[target]

In [577]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [578]:


logReg = LogisticRegression(solver="liblinear")
logReg

LogisticRegression(solver='liblinear')

In [579]:
logReg.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

In [580]:
logReg.score(X_train, y_train)

0.5588235294117647

In [581]:
logReg.score(X_test, y_test)

0.6521739130434783

In [582]:
predictions = logReg.predict(X_test)
printMetrics(y_test, predictions)

Score: -0.50
MAE: 0.35
RMSE: 0.59
r2: -0.53


In [583]:
predictions = logReg.predict(X_test)
printClassificationMetrics(y_test, predictions)



Confusion Matrix:
[[ 3  5]
 [ 3 12]]
------------------
Accuracy: 0.65
Recall: 0.80
Prediction: 0.71
f-measure: 0.75
------------------
              precision    recall  f1-score   support

           0       0.50      0.38      0.43         8
           1       0.71      0.80      0.75        15

    accuracy                           0.65        23
   macro avg       0.60      0.59      0.59        23
weighted avg       0.63      0.65      0.64        23



In [584]:
pdSamplePrice = pd.DataFrame.from_dict(pdSamplePrice)

predictions = logReg.predict(pdSamplePrice)

pdPredicted = pdSamplePrice
pdPredicted["Price_Increase"] = predictions.astype(bool)
pdPredicted

ValueError: X has 7 features per sample; expecting 3

### John Deere (DE)
Dan Knobloch

##### Helper methods

In [None]:
def createCategoricalDummies(dataFrame, categoryList):
    return pd.get_dummies(dataFrame[categoryList], prefix_sep = "::", drop_first = True)

In [None]:
def printRegressionMetrics(test, predictions):
    print(f"Score: {explained_variance_score(test, predictions):.2f}")
    print(f"MAE: {mean_absolute_error(test, predictions):.2f}")
    print(f"RMSE: {math.sqrt(mean_squared_error(test, predictions)):.2f}")
    print(f"r2: {r2_score(test, predictions):.2f}")

In [None]:
def printClassificationMetrics(test, predictions):
    print("Confusion Matrix:")
    print(confusion_matrix(test, predictions))
    print("------------------")
    print(f"Accuracy: {accuracy_score(test, predictions):.2f}")
    print(f"Recall: {recall_score(test, predictions):.2f}")
    print(f"Prediction: {precision_score(test, predictions):.2f}")
    print(f"f-measure: {fbeta_score(test, predictions, beta=1):.2f}")
    print("------------------")
    print(classification_report(test, predictions))

#### Part 1: Regression - Linear Regression
##### Prepare the Data
load data, clean up, prepare features and targets, split up training and test data.

In [None]:
# read file, drop null values, convert binary values in trend and price to integers
JDStockTrend = pd.read_csv(f"{dataDir}DeereStockPrice.csv")
JDStockTrend.dropna(inplace=True)
JDStockTrend['trend_daily_increase'] = JDStockTrend.trend_daily_increase.astype(int)
JDStockTrend['price_daily_increase'] = JDStockTrend.price_daily_increase.astype(int)
JDStockTrend

In [None]:
# determine feature vectors and target vectors. in this case targeting a prediction of price based on the previous days closing price, volume traded, and trends data
featureColumns = ["Trends","Previous_Close","Volume","trend_daily_increase"]
target = "Price"

X=JDStockTrend[featureColumns]
y=JDStockTrend[target]

X.info()

In [None]:
y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

## Modeling with Linear Regression

fit a line to our data set, with the minimum distance between the points.

In [None]:
lr = LinearRegression()    #use this algorithm to start developing the line betwee data points
lr

In [None]:
lr.fit(X_train, y_train)
trainScore = lr.score(X_train, y_train)
testScore = lr.score(X_test, y_test)

print(f'the score with the training data set = {trainScore}')
print(f'the score with the test data set = {testScore}')

## LR Metrics Output

In [None]:
predictions = lr.predict(X_test)
printRegressionMetrics(y_test, predictions)

#### Predict some new samples

define a few new samples.

In [None]:
numElements = 3
sampleStockTrend = []
for _ in range(numElements):
    dict = {}
    for column in X.columns:
        min = 0  # assume min = 0
        maxValue = round(max(JDStockTrend[column].values))
        dict[column] = rnd.randint(min, maxValue)
    sampleStockTrend.append(dict)
sampleStockTrend

In [None]:
pdSampleStockTrend = pd.DataFrame.from_dict(sampleStockTrend)
pdSampleStockTrend

In [None]:
predictions = lr.predict(pdSampleStockTrend)
predictions

In [None]:
pdPredictedStockTrend = pdSampleStockTrend.copy()
pdPredictedStockTrend['Predicted'] = predictions
pdPredictedStockTrend

##### Part 2: Classification - Logistic Regression

##### Prepare the Data
load data, clean up, prepare features and targets, split up training and test data.

In [None]:
# Need to prepare the data seperately for logistic regression becuase our feature and target vectors will be different
JDStockTrend

In [None]:
#determine feature vectors and target vectors. in this case targeting a classification on whether or not the price would increase based on the previous days closing price, volume traded, and search trends data
featureColumns = ["Trends","Previous_Close","Volume","trend_daily_increase"]
target = "price_daily_increase"

X=JDStockTrend[featureColumns]
y=JDStockTrend[target]

X

In [None]:
y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

#### Modeling with Logistic Regression (classification)


In [None]:
lr = LogisticRegression(solver="liblinear")
lr

In [None]:
lr.fit(X_train, y_train)

In [None]:
lr.score(X_train, y_train)

In [None]:
lr.score(X_test, y_test)

#### Classification Metrics Output

In [None]:
predictions = lr.predict(X_test)
printClassificationMetrics(y_test, predictions)

##### Predict with the samples that were generated above

In [None]:
sampleStockTrend

In [None]:
pdSampleStockTrend = pd.DataFrame.from_dict(sampleStockTrend)

In [None]:
predictions = lr.predict(pdSampleStockTrend)
predictions

In [None]:
pdPredictedStockTrend = pdSampleStockTrend
pdPredictedStockTrend["price_daily_increse"] = predictions.astype(bool)
pdPredictedStockTrend

## Conclusion

analyzing the current results of the both methods used (linear regression, and logistic regression (Classification). it does seem like the data points to the fact that the linear regrission is a strong canidate for predicting price of John Deere. in each linear regression scenario, the score is around 90%. and the sample data that is generated also seems to develop reasonable price predictions for the closing price. the logistic model (for classification) only has a score of 44%. overall. in some ways, developing a stronger classification model would provide more value for a day trader of stocks becuase it would give them the insight of the confidience to buy shares of a compnay to make money in a short amount of time.
