In [14]:
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
from sklearn.linear_model import LinearRegression
from fbprophet import Prophet
import numpy as np

In [39]:
## load the data and get all information for one company ##
data_set = pd.read_csv('../csv_data/reuters_news.csv', low_memory=False)
data_set = data_set.loc[data_set['ticker'] == 'AAPL']

## Data preperation ##

# delete missing prices data
data_set = data_set[data_set['prices'].notna()]
data_set = data_set[data_set['polarity'].notna()]
data_set = data_set[data_set['subjectivity'].notna()]

# delete unneeded columns: 'title' & 'description'
data_set.drop(['ticker', 'title', 'description'], axis=1, inplace=True)

# convert "Nan" values to 0 (when there is no news in a specific day, 
# the polarity and subjectivity would be 0 or nutural)
data_set['polarity'] = data_set['polarity'].fillna(0)
data_set['subjectivity'] = data_set['subjectivity'].fillna(0)

#  combine rows with the same date (some days have multipe news, so, compine them and take the average)
data_set = data_set.groupby(['date'],as_index=False).agg({'polarity': 'mean', 'subjectivity': 'mean', 'prices': 'mean',})

# print
data_set


Unnamed: 0,date,polarity,subjectivity,prices
0,2016-04-27,0.115939,0.445576,22.75
1,2016-04-28,0.119129,0.479735,22.05
2,2017-04-03,0.158730,0.340079,34.12
3,2018-04-11,0.095607,0.383708,41.59
4,2018-04-12,0.175000,0.775000,42.00
...,...,...,...,...
536,2021-04-05,0.031355,0.344949,125.90
537,2021-04-07,0.096050,0.357864,127.90
538,2021-04-08,0.033534,0.398072,130.36
539,2021-04-12,0.040675,0.215278,131.24


In [40]:
########### facebook PROPHET model ###############
## Docs: https://facebook.github.io/prophet/docs/quick_start.html 

prophet = data_set
prophet.head()

#train model
m = Prophet(interval_width=0.95, daily_seasonality=True)
m.add_regressor('polarity')
m.add_regressor('subjectivity')
prophet = prophet.rename(columns={'date':'ds', 'prices': 'y'})
prophet.reset_index(drop=True)
model = m.fit(prophet)
#forcast
future = m.make_future_dataframe(periods=100,freq='D')
future['polarity'] = prophet['polarity']
future['subjectivity'] = prophet['subjectivity']
forecast = m.predict(future)
# forecast.head()

# plot
plot1 = m.plot(forecast)
plt2 = m.plot_components(forecast)

Initial log joint probability = -15.8642
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       1465.49     0.0075905       219.474       0.373           1      119   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     199       1473.13   0.000175857       201.961   7.288e-07       0.001      285  LS failed, Hessian reset 
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     268       1474.68   0.000210267       221.141   5.108e-07       0.001      429  LS failed, Hessian reset 
     299       1475.36   7.72811e-05       64.0384      0.5309      0.5309      468   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     320       1475.47   8.78473e-05       104.289   5.463e-07       0.001      540  LS failed, Hessian reset 
     348       1475.56   4.24599e-05       80.8564    6.63e-07       0.001      609 

ValueError: Found NaN in column 'polarity'