In [None]:
#import libraries
import pandas as pd
import numpy as np
from textblob import TextBlob
import re
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [None]:
#load the transcript data for S&P 100 for 2020 only
#read csv that includes stock data
transcribed_list= pd.read_csv('transcribed_transcript_list2.csv')
transcribed_list.head(10)

In [None]:
#store the data
#transcript_prep_remarks
transcribed_list_org= transcribed_list

In [4]:
transcribed_list_org.dtypes

Unnamed: 0                  int64
date                       object
ticker                     object
title                      object
transcript                 object
transcript_prep_remarks    object
url                        object
dtype: object

In [5]:
#correct earnings call date to correctly associate in the join
transcribed_list_org['date_rev2']= pd.to_datetime(transcribed_list_org['date'])
transcribed_list_org['call_date']= transcribed_list_org['date_rev2'].dt.strftime('%m/%d/%Y')

In [7]:
transcribed_list_org.head(5)

Unnamed: 0.1,Unnamed: 0,date,ticker,title,transcript,transcript_prep_remarks,url,date_rev2,call_date
0,0,10/29/2020,AAPL,Apple Inc. (AAPL) CEO Tim Cook on Q4 2020 Resu...,Apple Inc. (NASDAQ:AAPL) Q4 2020 Earnings Conf...,"['Operator', 'Good day everyone and welcome to...",/article/4382943-apple-inc-aapl-ceo-tim-cook-o...,2020-10-29,10/29/2020
1,1,7/30/2020,AAPL,Apple Inc. (AAPL) CEO Tim Cook on Q3 2020 Resu...,Apple Inc. (NASDAQ:AAPL) Q3 2020 Results Confe...,"['Operator', ""Good day, everyone. Welcome to t...",/article/4362707-apple-inc-aapl-ceo-tim-cook-o...,2020-07-30,07/30/2020
2,2,4/30/2020,AAPL,Apple Inc. (AAPL) CEO Tim Cook on Q2 2020 Resu...,Apple Inc. (NASDAQ:AAPL) Q2 2020 Results Confe...,"['Operator', 'Good day everyone. Welcome to th...",/article/4341792-apple-inc-aapl-ceo-tim-cook-o...,2020-04-30,04/30/2020
3,3,1/28/2020,AAPL,Apple Inc. (AAPL) CEO Tim Cook on Q1 2020 Resu...,Apple Inc. (NASDAQ:AAPL) Q1 2020 Results Confe...,"['Operator', ""Good day, everyone. Welcome to t...",/article/4319666-apple-inc-aapl-ceo-tim-cook-o...,2020-01-28,01/28/2020
4,8,10/30/2020,ABBV,AbbVie Inc.'s (ABBV) CEO Rick Gonzalez on Q3 2...,AbbVie Inc. (NYSE:ABBV) Q3 2020 Results Earnin...,"['Operator', 'Good morning and thank you for s...",/article/4383381-abbvie-inc-s-abbv-ceo-rick-go...,2020-10-30,10/30/2020


In [8]:
transcribed_list_org= transcribed_list_org.rename(columns={'ticker': 'Symbol'})
transcribed_list_org.head(3)

Unnamed: 0.1,Unnamed: 0,date,Symbol,title,transcript,transcript_prep_remarks,url,date_rev2,call_date
0,0,10/29/2020,AAPL,Apple Inc. (AAPL) CEO Tim Cook on Q4 2020 Resu...,Apple Inc. (NASDAQ:AAPL) Q4 2020 Earnings Conf...,"['Operator', 'Good day everyone and welcome to...",/article/4382943-apple-inc-aapl-ceo-tim-cook-o...,2020-10-29,10/29/2020
1,1,7/30/2020,AAPL,Apple Inc. (AAPL) CEO Tim Cook on Q3 2020 Resu...,Apple Inc. (NASDAQ:AAPL) Q3 2020 Results Confe...,"['Operator', ""Good day, everyone. Welcome to t...",/article/4362707-apple-inc-aapl-ceo-tim-cook-o...,2020-07-30,07/30/2020
2,2,4/30/2020,AAPL,Apple Inc. (AAPL) CEO Tim Cook on Q2 2020 Resu...,Apple Inc. (NASDAQ:AAPL) Q2 2020 Results Confe...,"['Operator', 'Good day everyone. Welcome to th...",/article/4341792-apple-inc-aapl-ceo-tim-cook-o...,2020-04-30,04/30/2020


In [9]:
#create a function to get the subjectivity
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

In [10]:
#create function to get the polarity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity


In [11]:
#create new columns for subjectivity and polarity on MERGED DATA

transcribed_list_org['Subjectivity']= transcribed_list_org['transcript'].apply(getSubjectivity)

transcribed_list_org['Polarity']= transcribed_list_org['transcript'].apply(getPolarity)

In [12]:
transcribed_list_org['Subjectivity'][0]

0.4755678464000953

In [13]:
#create function to get sentiment score using Sentiment Intensity Analyzer

def getSIA(text):
    sia=SentimentIntensityAnalyzer()
    
    sentiment=sia.polarity_scores(text)
    return sentiment

In [15]:
compound=[]
neg= []
pos = []
neu = []
SIA = 0

for i in range(0, len(transcribed_list_org['transcript'])):
    SIA = getSIA(transcribed_list_org['transcript'][i])
    compound.append(SIA['compound'])
    neg.append(SIA['neg'])
    neu.append(SIA['neu'])
    pos.append(SIA['pos'])

In [16]:
pos[:3]
compound[:3]
neg[:3]
neu[:3]
SIA

{'neg': 0.025, 'neu': 0.818, 'pos': 0.156, 'compound': 1.0}

In [17]:
#store sentiment scores in dataframe
transcribed_list_org['Compound']=compound
transcribed_list_org['Negative']=neg
transcribed_list_org['Positive']=pos
transcribed_list_org['Neutral']=neu

In [18]:
transcribed_list_org.head(3)

Unnamed: 0.1,Unnamed: 0,date,Symbol,title,transcript,transcript_prep_remarks,url,date_rev2,call_date,Subjectivity,Polarity,Compound,Negative,Positive,Neutral
0,0,10/29/2020,AAPL,Apple Inc. (AAPL) CEO Tim Cook on Q4 2020 Resu...,Apple Inc. (NASDAQ:AAPL) Q4 2020 Earnings Conf...,"['Operator', 'Good day everyone and welcome to...",/article/4382943-apple-inc-aapl-ceo-tim-cook-o...,2020-10-29,10/29/2020,0.475568,0.166736,1.0,0.032,0.151,0.818
1,1,7/30/2020,AAPL,Apple Inc. (AAPL) CEO Tim Cook on Q3 2020 Resu...,Apple Inc. (NASDAQ:AAPL) Q3 2020 Results Confe...,"['Operator', ""Good day, everyone. Welcome to t...",/article/4362707-apple-inc-aapl-ceo-tim-cook-o...,2020-07-30,07/30/2020,0.45936,0.165622,1.0,0.023,0.17,0.807
2,2,4/30/2020,AAPL,Apple Inc. (AAPL) CEO Tim Cook on Q2 2020 Resu...,Apple Inc. (NASDAQ:AAPL) Q2 2020 Results Confe...,"['Operator', 'Good day everyone. Welcome to th...",/article/4341792-apple-inc-aapl-ceo-tim-cook-o...,2020-04-30,04/30/2020,0.43226,0.171578,1.0,0.024,0.146,0.83


In [19]:
transcribed_list_org.columns

Index(['Unnamed: 0', 'date', 'Symbol', 'title', 'transcript',
       'transcript_prep_remarks', 'url', 'date_rev2', 'call_date',
       'Subjectivity', 'Polarity', 'Compound', 'Negative', 'Positive',
       'Neutral'],
      dtype='object')

In [20]:
#display sentiment values to determine category value which will be categorized as 1 for positive values over 0.10 and 0 is under 0.10 since average is 0.143419
transcribed_list_org[['Subjectivity', 'Polarity','Negative', 'Positive', 'Neutral']].describe()

Unnamed: 0,Subjectivity,Polarity,Negative,Positive,Neutral
count,401.0,401.0,401.0,401.0,401.0
mean,0.425835,0.15309,0.025494,0.150631,0.823888
std,0.028044,0.038081,0.009832,0.020824,0.019935
min,0.332347,0.066595,0.0,0.102,0.775
25%,0.408109,0.132543,0.019,0.137,0.81
50%,0.426704,0.14996,0.024,0.149,0.823
75%,0.446184,0.168558,0.032,0.166,0.836
max,0.502122,0.5,0.058,0.207,0.898


In [66]:

print(transcribed_list_org['Positive'].max())
print(transcribed_list_org['Positive'].min())

0.207
0.102


In [67]:
#made label column that assigns 1 to positive values over 0.10 and 0 is under 0.10
transcribed_list_org['Label']= np.where(transcribed_list_org['Positive']>0.15, 1, 0)

In [68]:
transcribed_list_org['Label'].value_counts()

0    210
1    191
Name: Label, dtype: int64

In [69]:
transcribed_list_org.head(10)

Unnamed: 0.1,Unnamed: 0,date,Symbol,title,transcript,transcript_prep_remarks,url,date_rev2,call_date,Subjectivity,Polarity,Compound,Negative,Positive,Neutral,Label
0,0,10/29/2020,AAPL,Apple Inc. (AAPL) CEO Tim Cook on Q4 2020 Resu...,Apple Inc. (NASDAQ:AAPL) Q4 2020 Earnings Conf...,"['Operator', 'Good day everyone and welcome to...",/article/4382943-apple-inc-aapl-ceo-tim-cook-o...,2020-10-29,10/29/2020,0.475568,0.166736,1.0,0.032,0.151,0.818,1
1,1,7/30/2020,AAPL,Apple Inc. (AAPL) CEO Tim Cook on Q3 2020 Resu...,Apple Inc. (NASDAQ:AAPL) Q3 2020 Results Confe...,"['Operator', ""Good day, everyone. Welcome to t...",/article/4362707-apple-inc-aapl-ceo-tim-cook-o...,2020-07-30,07/30/2020,0.45936,0.165622,1.0,0.023,0.17,0.807,1
2,2,4/30/2020,AAPL,Apple Inc. (AAPL) CEO Tim Cook on Q2 2020 Resu...,Apple Inc. (NASDAQ:AAPL) Q2 2020 Results Confe...,"['Operator', 'Good day everyone. Welcome to th...",/article/4341792-apple-inc-aapl-ceo-tim-cook-o...,2020-04-30,04/30/2020,0.43226,0.171578,1.0,0.024,0.146,0.83,0
3,3,1/28/2020,AAPL,Apple Inc. (AAPL) CEO Tim Cook on Q1 2020 Resu...,Apple Inc. (NASDAQ:AAPL) Q1 2020 Results Confe...,"['Operator', ""Good day, everyone. Welcome to t...",/article/4319666-apple-inc-aapl-ceo-tim-cook-o...,2020-01-28,01/28/2020,0.44764,0.177873,1.0,0.022,0.154,0.825,1
4,8,10/30/2020,ABBV,AbbVie Inc.'s (ABBV) CEO Rick Gonzalez on Q3 2...,AbbVie Inc. (NYSE:ABBV) Q3 2020 Results Earnin...,"['Operator', 'Good morning and thank you for s...",/article/4383381-abbvie-inc-s-abbv-ceo-rick-go...,2020-10-30,10/30/2020,0.450441,0.16044,1.0,0.027,0.148,0.826,0
5,10,7/31/2020,ABBV,AbbVie Inc. (ABBV) CEO Rick Gonzalez on Q2 202...,AbbVie Inc. (NYSE:ABBV) Q2 2020 Earnings Confe...,"['Operator', 'Good morning. And thank you for ...",/article/4363134-abbvie-inc-abbv-ceo-rick-gonz...,2020-07-31,07/31/2020,0.433294,0.145384,1.0,0.026,0.144,0.83,0
6,11,5/1/2020,ABBV,AbbVie Inc. (ABBV) CEO Rick Gonzalez on Q1 202...,AbbVie Inc. (NYSE:ABBV) Q1 2020 Earnings Confe...,"['Operator', 'Good morning and thank you for s...",/article/4342124-abbvie-inc-abbv-ceo-rick-gonz...,2020-05-01,05/01/2020,0.437594,0.142108,1.0,0.032,0.129,0.839,0
7,12,2/7/2020,ABBV,AbbVie Inc. (ABBV) CEO Rick Gonzalez on Q4 201...,AbbVie Inc. (NYSE:ABBV) Q4 2019 Earnings Confe...,"['Operator', 'Good morning and thank you for s...",/article/4322499-abbvie-inc-abbv-ceo-rick-gonz...,2020-02-07,02/07/2020,0.451527,0.141549,1.0,0.028,0.16,0.813,1
8,14,10/21/2020,ABT,Abbott Laboratories (ABT) CEO Robert Ford on Q...,Abbott Laboratories (NYSE:ABT) Q3 2020 Earning...,"['Operator', ""Good morning and thank you for s...",/article/4380361-abbott-laboratories-abt-ceo-r...,2020-10-21,10/21/2020,0.472639,0.184756,1.0,0.011,0.157,0.832,1
9,16,7/16/2020,ABT,"Abbott Laboratories (ABT) CEO, Robert Ford on ...",Abbott Laboratories (NYSE:ABT) Q2 2020 Earning...,"['Operator', 'Good morning and thank you for s...",/article/4358872-abbott-laboratories-abt-ceo-r...,2020-07-16,07/16/2020,0.478001,0.146816,1.0,0.02,0.133,0.847,0


In [70]:
#load the price data for S&P 100 for 2020 only

price_list= pd.read_csv('combined_snp100_data.csv')
price_list.head(10)

Unnamed: 0.1,Unnamed: 0,Symbol,Name,Date,Adj_Close,Close,High,Low,Open,Volume
0,0,AAPL,Apple Inc.,2020-01-02,74.444603,75.087502,75.150002,73.797501,74.059998,135480400
1,1,AAPL,Apple Inc.,2020-01-03,73.72084,74.357498,75.144997,74.125,74.287498,146322800
2,2,AAPL,Apple Inc.,2020-01-06,74.308266,74.949997,74.989998,73.1875,73.447502,118387200
3,3,AAPL,Apple Inc.,2020-01-07,73.958794,74.597504,75.224998,74.370003,74.959999,108872000
4,4,AAPL,Apple Inc.,2020-01-08,75.148521,75.797501,76.110001,74.290001,74.290001,132079200
5,5,AAPL,Apple Inc.,2020-01-09,76.744728,77.407501,77.607498,76.550003,76.809998,170108400
6,6,AAPL,Apple Inc.,2020-01-10,76.918221,77.582497,78.167503,77.0625,77.650002,140644800
7,7,AAPL,Apple Inc.,2020-01-13,78.561531,79.239998,79.267502,77.787498,77.910004,121532000
8,8,AAPL,Apple Inc.,2020-01-14,77.500702,78.169998,79.392502,78.042503,79.175003,161954400
9,9,AAPL,Apple Inc.,2020-01-15,77.168564,77.834999,78.875,77.387497,77.962502,121923600


In [71]:
price_list.dtypes

Unnamed: 0      int64
Symbol         object
Name           object
Date           object
Adj_Close     float64
Close         float64
High          float64
Low           float64
Open          float64
Volume         object
dtype: object

In [72]:
#correct stock price date to correctly associate in the join
price_list['date_rev2']= pd.to_datetime(price_list['Date'])
price_list['stock_price_date']= price_list['date_rev2'].dt.strftime('%m/%d/%Y')

In [73]:
price_list.head(5)

Unnamed: 0.1,Unnamed: 0,Symbol,Name,Date,Adj_Close,Close,High,Low,Open,Volume,date_rev2,stock_price_date
0,0,AAPL,Apple Inc.,2020-01-02,74.444603,75.087502,75.150002,73.797501,74.059998,135480400,2020-01-02,01/02/2020
1,1,AAPL,Apple Inc.,2020-01-03,73.72084,74.357498,75.144997,74.125,74.287498,146322800,2020-01-03,01/03/2020
2,2,AAPL,Apple Inc.,2020-01-06,74.308266,74.949997,74.989998,73.1875,73.447502,118387200,2020-01-06,01/06/2020
3,3,AAPL,Apple Inc.,2020-01-07,73.958794,74.597504,75.224998,74.370003,74.959999,108872000,2020-01-07,01/07/2020
4,4,AAPL,Apple Inc.,2020-01-08,75.148521,75.797501,76.110001,74.290001,74.290001,132079200,2020-01-08,01/08/2020


In [74]:
merge_price_trans3=pd.merge(price_list, transcribed_list_org, how='outer', left_on=['Symbol', 'stock_price_date'],right_on=['Symbol', 'call_date'] )
merge_price_trans3.head(29)
#merge_price_trans.columns



Unnamed: 0,Unnamed: 0_x,Symbol,Name,Date,Adj_Close,Close,High,Low,Open,Volume,...,url,date_rev2_y,call_date,Subjectivity,Polarity,Compound,Negative,Positive,Neutral,Label
0,0,AAPL,Apple Inc.,2020-01-02,74.444603,75.087502,75.150002,73.797501,74.059998,135480400,...,,NaT,,,,,,,,
1,1,AAPL,Apple Inc.,2020-01-03,73.72084,74.357498,75.144997,74.125,74.287498,146322800,...,,NaT,,,,,,,,
2,2,AAPL,Apple Inc.,2020-01-06,74.308266,74.949997,74.989998,73.1875,73.447502,118387200,...,,NaT,,,,,,,,
3,3,AAPL,Apple Inc.,2020-01-07,73.958794,74.597504,75.224998,74.370003,74.959999,108872000,...,,NaT,,,,,,,,
4,4,AAPL,Apple Inc.,2020-01-08,75.148521,75.797501,76.110001,74.290001,74.290001,132079200,...,,NaT,,,,,,,,
5,5,AAPL,Apple Inc.,2020-01-09,76.744728,77.407501,77.607498,76.550003,76.809998,170108400,...,,NaT,,,,,,,,
6,6,AAPL,Apple Inc.,2020-01-10,76.918221,77.582497,78.167503,77.0625,77.650002,140644800,...,,NaT,,,,,,,,
7,7,AAPL,Apple Inc.,2020-01-13,78.561531,79.239998,79.267502,77.787498,77.910004,121532000,...,,NaT,,,,,,,,
8,8,AAPL,Apple Inc.,2020-01-14,77.500702,78.169998,79.392502,78.042503,79.175003,161954400,...,,NaT,,,,,,,,
9,9,AAPL,Apple Inc.,2020-01-15,77.168564,77.834999,78.875,77.387497,77.962502,121923600,...,,NaT,,,,,,,,


In [75]:
merge_price_trans4=merge_price_trans3.fillna('')
merge_price_trans4.head(29)
#merge_price_trans.columns


Unnamed: 0,Unnamed: 0_x,Symbol,Name,Date,Adj_Close,Close,High,Low,Open,Volume,...,url,date_rev2_y,call_date,Subjectivity,Polarity,Compound,Negative,Positive,Neutral,Label
0,0,AAPL,Apple Inc.,2020-01-02,74.444603,75.087502,75.150002,73.797501,74.059998,135480400,...,,,,,,,,,,
1,1,AAPL,Apple Inc.,2020-01-03,73.72084,74.357498,75.144997,74.125,74.287498,146322800,...,,,,,,,,,,
2,2,AAPL,Apple Inc.,2020-01-06,74.308266,74.949997,74.989998,73.1875,73.447502,118387200,...,,,,,,,,,,
3,3,AAPL,Apple Inc.,2020-01-07,73.958794,74.597504,75.224998,74.370003,74.959999,108872000,...,,,,,,,,,,
4,4,AAPL,Apple Inc.,2020-01-08,75.148521,75.797501,76.110001,74.290001,74.290001,132079200,...,,,,,,,,,,
5,5,AAPL,Apple Inc.,2020-01-09,76.744728,77.407501,77.607498,76.550003,76.809998,170108400,...,,,,,,,,,,
6,6,AAPL,Apple Inc.,2020-01-10,76.918221,77.582497,78.167503,77.0625,77.650002,140644800,...,,,,,,,,,,
7,7,AAPL,Apple Inc.,2020-01-13,78.561531,79.239998,79.267502,77.787498,77.910004,121532000,...,,,,,,,,,,
8,8,AAPL,Apple Inc.,2020-01-14,77.500702,78.169998,79.392502,78.042503,79.175003,161954400,...,,,,,,,,,,
9,9,AAPL,Apple Inc.,2020-01-15,77.168564,77.834999,78.875,77.387497,77.962502,121923600,...,,,,,,,,,,


In [76]:
merge_price_trans4.head(3)
merge_price_trans4.columns

Index(['Unnamed: 0_x', 'Symbol', 'Name', 'Date', 'Adj_Close', 'Close', 'High',
       'Low', 'Open', 'Volume', 'date_rev2_x', 'stock_price_date',
       'Unnamed: 0_y', 'date', 'title', 'transcript',
       'transcript_prep_remarks', 'url', 'date_rev2_y', 'call_date',
       'Subjectivity', 'Polarity', 'Compound', 'Negative', 'Positive',
       'Neutral', 'Label'],
      dtype='object')

In [77]:
merge_price_trans5= merge_price_trans4.drop(['Unnamed: 0_x','Date', 'date_rev2_x', 'date', 'Unnamed: 0_y', 'date_rev2_y' ],1)
merge_price_trans5.head(3)
#print(merge_price_trans5.columns)

Unnamed: 0,Symbol,Name,Adj_Close,Close,High,Low,Open,Volume,stock_price_date,title,...,transcript_prep_remarks,url,call_date,Subjectivity,Polarity,Compound,Negative,Positive,Neutral,Label
0,AAPL,Apple Inc.,74.444603,75.087502,75.150002,73.797501,74.059998,135480400,01/02/2020,,...,,,,,,,,,,
1,AAPL,Apple Inc.,73.72084,74.357498,75.144997,74.125,74.287498,146322800,01/03/2020,,...,,,,,,,,,,
2,AAPL,Apple Inc.,74.308266,74.949997,74.989998,73.1875,73.447502,118387200,01/06/2020,,...,,,,,,,,,,


In [78]:
merge_price_trans6=merge_price_trans5[['Symbol','Name', 'stock_price_date','Adj_Close', 'Close', 'High', 'Low', 'Open',
       'Volume','call_date','title', 'transcript', 'transcript_prep_remarks', 'url','Subjectivity', 'Polarity', 'Compound', 'Negative', 'Positive', 'Neutral', 'Label']]
merge_price_trans6.head(3)


Unnamed: 0,Symbol,Name,stock_price_date,Adj_Close,Close,High,Low,Open,Volume,call_date,...,transcript,transcript_prep_remarks,url,Subjectivity,Polarity,Compound,Negative,Positive,Neutral,Label
0,AAPL,Apple Inc.,01/02/2020,74.444603,75.087502,75.150002,73.797501,74.059998,135480400,,...,,,,,,,,,,
1,AAPL,Apple Inc.,01/03/2020,73.72084,74.357498,75.144997,74.125,74.287498,146322800,,...,,,,,,,,,,
2,AAPL,Apple Inc.,01/06/2020,74.308266,74.949997,74.989998,73.1875,73.447502,118387200,,...,,,,,,,,,,


In [79]:
merge_price_trans6.to_csv('full_trans_sents_score_wstockprice.csv')

In [80]:
sent_test2= pd.read_csv('full_trans_sents_score_wstockprice.csv')
sent_test2.head(28)

Unnamed: 0.1,Unnamed: 0,Symbol,Name,stock_price_date,Adj_Close,Close,High,Low,Open,Volume,...,transcript,transcript_prep_remarks,url,Subjectivity,Polarity,Compound,Negative,Positive,Neutral,Label
0,0,AAPL,Apple Inc.,01/02/2020,74.444603,75.087502,75.150002,73.797501,74.059998,135480400,...,,,,,,,,,,
1,1,AAPL,Apple Inc.,01/03/2020,73.72084,74.357498,75.144997,74.125,74.287498,146322800,...,,,,,,,,,,
2,2,AAPL,Apple Inc.,01/06/2020,74.308266,74.949997,74.989998,73.1875,73.447502,118387200,...,,,,,,,,,,
3,3,AAPL,Apple Inc.,01/07/2020,73.958794,74.597504,75.224998,74.370003,74.959999,108872000,...,,,,,,,,,,
4,4,AAPL,Apple Inc.,01/08/2020,75.148521,75.797501,76.110001,74.290001,74.290001,132079200,...,,,,,,,,,,
5,5,AAPL,Apple Inc.,01/09/2020,76.744728,77.407501,77.607498,76.550003,76.809998,170108400,...,,,,,,,,,,
6,6,AAPL,Apple Inc.,01/10/2020,76.918221,77.582497,78.167503,77.0625,77.650002,140644800,...,,,,,,,,,,
7,7,AAPL,Apple Inc.,01/13/2020,78.561531,79.239998,79.267502,77.787498,77.910004,121532000,...,,,,,,,,,,
8,8,AAPL,Apple Inc.,01/14/2020,77.500702,78.169998,79.392502,78.042503,79.175003,161954400,...,,,,,,,,,,
9,9,AAPL,Apple Inc.,01/15/2020,77.168564,77.834999,78.875,77.387497,77.962502,121923600,...,,,,,,,,,,


In [81]:
merge_price_trans6.head(3)

Unnamed: 0,Symbol,Name,stock_price_date,Adj_Close,Close,High,Low,Open,Volume,call_date,...,transcript,transcript_prep_remarks,url,Subjectivity,Polarity,Compound,Negative,Positive,Neutral,Label
0,AAPL,Apple Inc.,01/02/2020,74.444603,75.087502,75.150002,73.797501,74.059998,135480400,,...,,,,,,,,,,
1,AAPL,Apple Inc.,01/03/2020,73.72084,74.357498,75.144997,74.125,74.287498,146322800,,...,,,,,,,,,,
2,AAPL,Apple Inc.,01/06/2020,74.308266,74.949997,74.989998,73.1875,73.447502,118387200,,...,,,,,,,,,,


In [82]:
transcribed_keep2=merge_price_trans6.drop(['Name','title','transcript',
       'transcript_prep_remarks', 'url', 'Volume', 'Compound'],1)
transcribed_keep2.head(3)

Unnamed: 0,Symbol,stock_price_date,Adj_Close,Close,High,Low,Open,call_date,Subjectivity,Polarity,Negative,Positive,Neutral,Label
0,AAPL,01/02/2020,74.444603,75.087502,75.150002,73.797501,74.059998,,,,,,,
1,AAPL,01/03/2020,73.72084,74.357498,75.144997,74.125,74.287498,,,,,,,
2,AAPL,01/06/2020,74.308266,74.949997,74.989998,73.1875,73.447502,,,,,,,


In [83]:
transcribed_keep2.dtypes

Symbol               object
stock_price_date     object
Adj_Close           float64
Close               float64
High                float64
Low                 float64
Open                float64
call_date            object
Subjectivity         object
Polarity             object
Negative             object
Positive             object
Neutral              object
Label                object
dtype: object

In [84]:
transcribed_keep2

Unnamed: 0,Symbol,stock_price_date,Adj_Close,Close,High,Low,Open,call_date,Subjectivity,Polarity,Negative,Positive,Neutral,Label
0,AAPL,01/02/2020,74.444603,75.087502,75.150002,73.797501,74.059998,,,,,,,
1,AAPL,01/03/2020,73.720840,74.357498,75.144997,74.125000,74.287498,,,,,,,
2,AAPL,01/06/2020,74.308266,74.949997,74.989998,73.187500,73.447502,,,,,,,
3,AAPL,01/07/2020,73.958794,74.597504,75.224998,74.370003,74.959999,,,,,,,
4,AAPL,01/08/2020,75.148521,75.797501,76.110001,74.290001,74.290001,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25549,XOM,12/24/2020,41.599998,41.599998,41.849998,41.380001,41.650002,,,,,,,
25550,XOM,12/28/2020,41.740002,41.740002,42.549999,41.520000,41.689999,,,,,,,
25551,XOM,12/29/2020,41.270000,41.270000,42.119999,41.200001,42.040001,,,,,,,
25552,XOM,12/30/2020,41.599998,41.599998,42.419998,41.270000,41.330002,,,,,,,


In [85]:
transcribed_keep3=transcribed_keep2.replace(r'^\s*$', np.nan, regex=True)
transcribed_keep3

Unnamed: 0,Symbol,stock_price_date,Adj_Close,Close,High,Low,Open,call_date,Subjectivity,Polarity,Negative,Positive,Neutral,Label
0,AAPL,01/02/2020,74.444603,75.087502,75.150002,73.797501,74.059998,,,,,,,
1,AAPL,01/03/2020,73.720840,74.357498,75.144997,74.125000,74.287498,,,,,,,
2,AAPL,01/06/2020,74.308266,74.949997,74.989998,73.187500,73.447502,,,,,,,
3,AAPL,01/07/2020,73.958794,74.597504,75.224998,74.370003,74.959999,,,,,,,
4,AAPL,01/08/2020,75.148521,75.797501,76.110001,74.290001,74.290001,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25549,XOM,12/24/2020,41.599998,41.599998,41.849998,41.380001,41.650002,,,,,,,
25550,XOM,12/28/2020,41.740002,41.740002,42.549999,41.520000,41.689999,,,,,,,
25551,XOM,12/29/2020,41.270000,41.270000,42.119999,41.200001,42.040001,,,,,,,
25552,XOM,12/30/2020,41.599998,41.599998,42.419998,41.270000,41.330002,,,,,,,


In [86]:
transcribed_keep4=transcribed_keep3.dropna()
transcribed_keep4

Unnamed: 0,Symbol,stock_price_date,Adj_Close,Close,High,Low,Open,call_date,Subjectivity,Polarity,Negative,Positive,Neutral,Label
17,AAPL,01/28/2020,78.742477,79.422501,79.599998,78.047501,78.150002,01/28/2020,0.447640,0.177873,0.022,0.154,0.825,1.0
82,AAPL,04/30/2020,72.993935,73.449997,73.632500,72.087502,72.489998,04/30/2020,0.432260,0.171578,0.024,0.146,0.830,0.0
145,AAPL,07/30/2020,95.851517,96.190002,96.297501,93.767502,94.187500,07/30/2020,0.459360,0.165622,0.023,0.170,0.807,1.0
209,AAPL,10/29/2020,115.121384,115.320000,116.930000,112.199997,112.370003,10/29/2020,0.475568,0.166736,0.032,0.151,0.818,1.0
278,ABBV,02/07/2020,87.605736,92.290001,92.980003,88.879997,89.739998,02/07/2020,0.451527,0.141549,0.028,0.160,0.813,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25270,WMT,11/17/2020,148.825989,149.369995,153.240005,149.179993,150.500000,11/17/2020,0.423592,0.191343,0.020,0.193,0.787,1.0
25321,XOM,01/31/2020,57.525826,62.119999,63.200001,61.860001,63.200001,01/31/2020,0.461173,0.141075,0.025,0.156,0.818,1.0
25384,XOM,05/01/2020,40.523052,43.139999,46.389999,43.000000,45.630001,05/01/2020,0.450083,0.178662,0.029,0.150,0.821,0.0
25447,XOM,07/31/2020,40.293766,42.080002,42.169998,40.910000,41.160000,07/31/2020,0.435841,0.124895,0.040,0.121,0.839,0.0


In [87]:
transcribed_keep4.columns

Index(['Symbol', 'stock_price_date', 'Adj_Close', 'Close', 'High', 'Low',
       'Open', 'call_date', 'Subjectivity', 'Polarity', 'Negative', 'Positive',
       'Neutral', 'Label'],
      dtype='object')

In [88]:
#create featured data set
X= transcribed_keep4
X=np.array(X.drop(['Label', 'Symbol', 'stock_price_date','call_date'], 1))

#create target data set
y=np.array(transcribed_keep4['Label'])

In [89]:
X

array([[7.87424774e+01, 7.94225006e+01, 7.95999985e+01, ...,
        2.20000000e-02, 1.54000000e-01, 8.25000000e-01],
       [7.29939346e+01, 7.34499969e+01, 7.36324997e+01, ...,
        2.40000000e-02, 1.46000000e-01, 8.30000000e-01],
       [9.58515167e+01, 9.61900024e+01, 9.62975006e+01, ...,
        2.30000000e-02, 1.70000000e-01, 8.07000000e-01],
       ...,
       [4.05230522e+01, 4.31399994e+01, 4.63899994e+01, ...,
        2.90000000e-02, 1.50000000e-01, 8.21000000e-01],
       [4.02937660e+01, 4.20800018e+01, 4.21699982e+01, ...,
        4.00000000e-02, 1.21000000e-01, 8.39000000e-01],
       [3.18515339e+01, 3.26199989e+01, 3.31399994e+01, ...,
        3.60000000e-02, 1.21000000e-01, 8.43000000e-01]])

In [90]:
y

array([1., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1.,
       1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 1., 0.,
       0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0., 1., 1., 1.,
       0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0.,
       0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0.,
       1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0.,
       1., 0., 1., 1., 1.

In [91]:
#split the data into 80% training and 20% testing data sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=0)

In [92]:
#create and train the model
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
model=LinearDiscriminantAnalysis().fit(x_train, y_train)

In [93]:
#get the models predictions and classifications
predictions= model.predict(x_test)
predictions

array([0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 1.,
       1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1.,
       0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1.,
       1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.,
       1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.])

In [94]:
#show the models metrics of 1 to positive values over 0.10 and 0 is under 0.10

#A classification report is one that details our model's accuracy among 3 metrics: precision, recall, and f1 score.

#Precision is the ratio of correctly predicted positive observations to the total PREDICTED positive observations.
#In other words, of the rows that our model predicted to be positive, what percentage actually were positive?

#Recall is the ratio of correctly predicted positive observations to the total ACTUAL positive observations.
#In other words, of the rows that were actually positive, what percentage did I predict to be positive?

#F1 Score is the harmonic average of precision and recall and can be used as a measure of model performance for classification
#An ideal F1 score is close to 1.0

from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96        44
         1.0       1.00      0.89      0.94        37

    accuracy                           0.95        81
   macro avg       0.96      0.95      0.95        81
weighted avg       0.95      0.95      0.95        81

