## Import dependecies
Import modules for

In [1]:
import os, inspect, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.offline as ply
import plotly.graph_objs as go
import plotly.tools as tls

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    ExtraTreesClassifier
)
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    mean_squared_error,
    roc_curve,
    auc,
)
import matplotlib.pyplot as plt

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from xgboost import XGBClassifier

In [3]:
# get project root directory
curr_dir = os.path.dirname(inspect.getabsfile(inspect.currentframe()))
root_dir = os.path.dirname(curr_dir)
sys.path.insert(0, 0)

## Load data
We load the stock datasets to a pandas dataframe:

In [4]:
# load data
fname = os.path.join(root_dir, "stockomen", "data", "int", "stock.csv")
df = pd.read_csv(fname, index_col=False)
df.head(5)

Unnamed: 0,Date,Subjectivity,Objectivity,Positive,Negative,Neutral,Open,High,Low,Close,Volume,Adj Close,Month,Quarter,Label
0,2008-08-08,75.0,25.0,18.75,56.25,25.0,11432.089844,11759.959961,11388.040039,11734.320312,212830000.0,11734.320312,8.0,3.0,0
1,2008-08-11,83.333333,16.666667,41.666667,41.666667,16.666667,11729.669922,11867.110352,11675.530273,11782.349609,183190000.0,11782.349609,8.0,3.0,1
2,2008-08-12,56.25,43.75,18.75,37.5,43.75,11781.700195,11782.349609,11601.519531,11642.469727,173590000.0,11642.469727,8.0,3.0,0
3,2008-08-13,38.461538,61.538462,15.384615,23.076923,61.538462,11632.80957,11633.780273,11453.339844,11532.959961,182550000.0,11532.959961,8.0,3.0,0
4,2008-08-14,45.454545,54.545455,36.363636,9.090909,54.545455,11532.070312,11718.280273,11450.889648,11615.929688,159790000.0,11615.929688,8.0,3.0,1


##  Fill missing values (the NaN values) with the column mean

In [5]:
nan_list = ['Subjectivity', 'Objectivity', 'Positive', 'Negative', 'Neutral']
for col in nan_list:
    df[col] = df[col].fillna(df[col].mean())

In [6]:
# Recheck the count
print(df.count())

Date            1989
Subjectivity    1989
Objectivity     1989
Positive        1989
Negative        1989
Neutral         1989
Open            1989
High            1989
Low             1989
Close           1989
Volume          1989
Adj Close       1989
Month           1989
Quarter         1989
Label           1989
dtype: int64


# Prepare train data

In [7]:
x = df.loc[:, ["Subjectivity", 
               "Objectivity",
               "Positive",
               "Negative",
               "Neutral",
               "Open",
               "High",
               "Low",
               "Close",
               "Volume",
               "Adj Close",
               "Month",
               "Quarter"]]
y = df.loc[:,'Label']
x.head()

Unnamed: 0,Subjectivity,Objectivity,Positive,Negative,Neutral,Open,High,Low,Close,Volume,Adj Close,Month,Quarter
0,75.0,25.0,18.75,56.25,25.0,11432.089844,11759.959961,11388.040039,11734.320312,212830000.0,11734.320312,8.0,3.0
1,83.333333,16.666667,41.666667,41.666667,16.666667,11729.669922,11867.110352,11675.530273,11782.349609,183190000.0,11782.349609,8.0,3.0
2,56.25,43.75,18.75,37.5,43.75,11781.700195,11782.349609,11601.519531,11642.469727,173590000.0,11642.469727,8.0,3.0
3,38.461538,61.538462,15.384615,23.076923,61.538462,11632.80957,11633.780273,11453.339844,11532.959961,182550000.0,11532.959961,8.0,3.0
4,45.454545,54.545455,36.363636,9.090909,54.545455,11532.070312,11718.280273,11450.889648,11615.929688,159790000.0,11615.929688,8.0,3.0


In [8]:
# set valid ratio wrt the number of train data
valid_size = 0.2
# set test ratio wrt to the number of data
test_size = 0.3
# rescale data
# scaler = StandardScaler().fit(x)
scaler = RobustScaler().fit(x)
x_scaled = scaler.transform(x)
n_x = len(x.index)
n_train = int(len(x.index) * 0.7)
x_train, x_test = x_scaled[0:n_train+1, :], x_scaled[n_train: n_x, :]
y_train, y_test = y[0:n_train+1], y[n_train: n_x]
print("data: ", x.shape, "labels: ", y.shape )
print("train data: ", x_train.shape, "train labels: ", y_train.shape )
print("test data: ", x_test.shape, "test labels: ", y_test.shape )

data:  (1989, 13) labels:  (1989,)
train data:  (1393, 13) train labels:  (1393,)
test data:  (597, 13) test labels:  (597,)


In [9]:
# plt.plot(X_train['Objectivity'])
# plt.plot([None for i in x_train['Objectivity']] + [x for x in x_test['Objectivity']])
# plt.show()

In [10]:
# set train params
num_folds = 10
scoring = 'accuracy'

In [11]:
# define training models
models = {}
models.update({'LR': LogisticRegression()})
models.update({'LDA': LinearDiscriminantAnalysis()})
models.update({'KNN': KNeighborsClassifier()})
models.update({'CART': DecisionTreeClassifier()})
models.update({'NB': GaussianNB()})
models.update({'SVM': SVC()})
models.update({'RF': RandomForestClassifier(n_estimators=50)})
models.update({'XGBoost': XGBClassifier()})
models.update({'LDA': LinearDiscriminantAnalysis()})

In [12]:
x_train.shape

(1393, 13)

In [13]:
# train models
results = []
names = []
for name, model in models.items():
    clf = model
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    accu_score = accuracy_score(y_test, y_pred)
    print(name + ": " + str(accu_score))

LR: 0.7520938023450586
LDA: 0.9447236180904522
KNN: 0.5175879396984925
CART: 0.5728643216080402
NB: 0.507537688442211





Variables are collinear.





SVM: 0.541038525963149
RF: 0.5628140703517588
XGBoost: 0.576214405360134


In [14]:
# Explore featurs

In [15]:
features = df.drop(['Label', "Date"],axis=1).columns.values


x, y = (list(x) for x in zip(*sorted(zip(models["XGBoost"].feature_importances_, features), 

                                     reverse = False)))
ply.init_notebook_mode(connected=True)

trace2 = go.Bar(
    x=x ,
    y=y,
    marker=dict(
        color=x,
        colorscale = 'Viridis',
        reversescale = True
    ),
    name='Feature importance for XGBoost',
    orientation='h',
)

layout = dict(
    title='Barplot of Feature importances for XGBoost',
     width = 1000, height = 1000,
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
#         domain=[0, 0.85],
    ))

fig1 = go.Figure(data=[trace2])
fig1['layout'].update(layout)
ply.iplot(fig1, filename='plots')

In [16]:
x, y = (list(x) for x in zip(*sorted(zip(models["RF"].feature_importances_, features), 
                                                            reverse = False)))
trace2 = go.Bar(
    x=x ,
    y=y,
    marker=dict(
        color=x,
        colorscale = 'Viridis',
        reversescale = True
    ),
    name='Feature importance for Random Forests',
    orientation='h',
)

layout = dict(
    title='Barplot of Feature importances for Random Forests',
     width = 1000, height = 1000,
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
#         domain=[0, 0.85],
    ))

fig1 = go.Figure(data=[trace2])
fig1['layout'].update(layout)
ply.iplot(fig1, filename='plots')