# Apple, Inc. (AAPL) - Random Forest Classification Models

---------

###  Overview: 
- 1) [Importing the Data](#Importing)
- 2) [Data Preprocessing](#DPP)
- 3) [Split Datasets](#Splitting)
- 4) Classification Models:
    - 4a. [Random Forest Classification](#RFC)
    - 4b. [Bagging Classification](#Bagging)
    - 4c. [GridSearching](#Gridsearch)

--------


## Importing Libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

from datetime import datetime

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import sys
sys.path.append('..')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

-----

## Company Name

In [2]:
company_name = 'Apple'

-------
<a class="anchor" id="Importing"></a>

# Importing the Data

### Importing the Raw Dataframe:

In [3]:
def file_importer(company_name, file_name):
    company_name=company_name
    df = pd.read_csv(f'data/{company_name}_{file_name}.csv')
    df['Date'] = pd.to_datetime(df.Date)
    df.set_index('Date', inplace=True)
    df.sort_index(inplace=True, ascending=True)
    return df

### Importing the Engineered Dataframe:

In [4]:
df = file_importer(company_name, 'wSEC_Inner')
df.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,Adj_Low,...,PX14A6G,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1994-01-26,33.75,34.0,33.25,33.5,1480400.0,0.0,1.0,1.057482,1.065315,1.041815,...,0,0,0,0,0,0,0,0,0,0
1994-01-26,33.75,34.0,33.25,33.5,1480400.0,0.0,1.0,1.057482,1.065315,1.041815,...,0,0,0,0,0,0,0,0,0,0
1994-02-10,36.25,37.5,36.0,36.5,2696700.0,0.0,1.0,1.139548,1.178843,1.131689,...,0,0,0,0,0,0,1,0,0,0


--------
<a class="anchor" id="DPP"></a>

# Data Preprocessing:


### Shifting the Dates for the Engineered Dataframe:


In [5]:
from lib.helper import date_shifter

In [6]:
df_shifted = date_shifter(df)
df_shifted.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,Adj_Low,...,PX14A6G,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1994-01-26,33.75,34.0,33.25,33.5,1480400.0,0,1,1.05748,1.06532,1.04182,...,0,0,0,0,0,0,0,0,0,0
1994-02-10,33.75,34.0,33.25,33.5,1480400.0,0,1,1.05748,1.06532,1.04182,...,0,0,0,0,0,0,0,0,0,0
1994-02-17,36.25,37.5,36.0,36.5,2696700.0,0,1,1.13955,1.17884,1.13169,...,0,0,0,0,0,0,1,0,0,0


### Setting the Label:

In [7]:
df_shifted['Target'] = df_shifted.Adj_Close_Diff.apply(lambda x: str(1) if x >= 0 else str(0))

### Dropping the Continuous Data and Keeping the Categorical:

In [8]:
new_df = df_shifted.loc[:, 'document_type':'Target']

### Converting All Values into Integers:

In [9]:
new_df = new_df.apply(pd.to_numeric, errors='ignore')

In [10]:
new_df.tail(3)

Unnamed: 0_level_0,document_type,10-K,10-K405,10-Q,424B2,424B3,424B5,8-A12B,8-K,CERTNYS,...,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-02-12,10-Q,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2018-02-14,SC 13G,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2018-03-07,8-K,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


--------
<a class="anchor" id="Splitting"></a>

# Importing the Training and Test Set:

In [11]:
def datasets_importer(company_name, file_name):
    company_name=company_name
    df = pd.read_csv(f'../stocks/data/modeling_data/{company_name}_{file_name}.csv')
    df['Date'] = pd.to_datetime(df.Date)
    df.set_index('Date', inplace=True)
    df.sort_index(inplace=True, ascending=True)
    return df

-----
### Importing the Train Set:

In [12]:
X_train = datasets_importer(company_name, 'SEC_X_Train')
X_train.drop('document_type', 1, inplace=True)
X_train.head(3)

Unnamed: 0_level_0,10-K,10-K405,10-Q,424B2,424B3,424B5,8-A12B,8-K,CERTNYS,CORRESP,...,PX14A6G,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1994-01-26,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1994-02-10,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1994-02-17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


-------
### Importing the Test Data:

In [13]:
X_test = datasets_importer(company_name, 'SEC_X_Test')
X_test.drop('document_type', 1, inplace=True)
X_test.head(3)

Unnamed: 0_level_0,10-K,10-K405,10-Q,424B2,424B3,424B5,8-A12B,8-K,CERTNYS,CORRESP,...,PX14A6G,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-06,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2017-01-06,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2017-01-19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


---------

## Setting up the Target/Labels:

In [14]:
y_train = new_df[X_train.index[0] : X_train.index[-1]].Target.values

In [15]:
y_test = new_df[X_test.index[0] : X_test.index[-1]].Target.values

-----
<a class="anchor" id="RFC"></a>


# Random Forest Classification Model

### Setting up the Random Forest (RF) Classification:

In [16]:
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', 
                            max_depth=15, min_samples_leaf=3, bootstrap=True, 
                            n_jobs=3, random_state=42, class_weight='balanced_subsample')

### Fitting the Data with the RF Model:

In [17]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='entropy', max_depth=15, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=3,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=3, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

### Scoring the Training Set Using the RF Model:

In [18]:
rf.score(X_train, y_train)

0.5777351247600768

### Scoring the Test Set using the RF Model:

In [19]:
rf.score(X_test, y_test)

0.46551724137931033

### Inspecting the Average Prediction:

In [20]:
y_test.mean()

0.603448275862069

In [21]:
rf.predict(X_test).mean()

0.5172413793103449

-------
<a class="anchor" id="Bagging"></a>

## Bagging Classification Model using Random Forest:

In [22]:
bc = BaggingClassifier(base_estimator=rf, n_estimators=100, 
                   max_features=1.0, n_jobs=3, random_state=42)

### Fitting the Data with the Bagging Classification:

In [23]:
bc.fit(X_train, y_train)

BaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='entropy', max_depth=15, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=3,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=3, oob_score=False, random_state=42,
            verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=100, n_jobs=3, oob_score=False,
         random_state=42, verbose=0, warm_start=False)

### Scoring the Training Set:

In [24]:
bc.score(X_train, y_train)

0.5758157389635317

### Scoring the Test Set:

In [25]:
bc.score(X_test, y_test)

0.43103448275862066

### Inspecting the Average Prediction:

In [26]:
y_test.mean()

0.603448275862069

In [27]:
bc.predict(X_test).mean()

0.3793103448275862

-----
<a class="anchor" id="Gridsearch"></a>


# Grid Searching

## Setting up the Random Forest Pipeline:

In [163]:
pipe = Pipeline([
    ('pca', PCA(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42))
])

### Setting the Parameters for the Random Forest Model:

In [164]:
# Number of trees in random forest
n_estimators = [int(x) for x in range(2, 16, 2)]

# Maximum number of levels in tree
max_depth = [int(x) for x in range(6, 18, 2)]

# Minimum number of samples required to split a node
min_samples_split = [int(x) for x in range(2, 12, 2)]

# Minimum number of samples required at each leaf node
min_samples_leaf = [int(x) for x in range(2, 10, 3)]

# Method of selecting samples for training each tree
bootstrap = False

# PCA
n_components = [int(x) for x in range(2, 10, 3)]

criterion = ['entropy']

In [165]:
params = {'pca__n_components': n_components,
          'rf__criterion' : criterion,
           'rf__n_estimators': n_estimators,
           'rf__max_depth': max_depth,
           'rf__min_samples_split': min_samples_split,
           'rf__min_samples_leaf': min_samples_leaf
         }
print(params)

{'pca__n_components': [2, 5, 8], 'rf__criterion': ['entropy'], 'rf__n_estimators': [2, 4, 6, 8, 10, 12, 14], 'rf__max_depth': [6, 8, 10, 12, 14, 16], 'rf__min_samples_split': [2, 4, 6, 8, 10], 'rf__min_samples_leaf': [2, 5, 8]}


### Setting up a Custom Cross Validation for Sequential Data:

In [166]:
time_cv = TimeSeriesSplit(n_splits=5).split(X_train)

### GridSearching the Model:

In [167]:
rf_search = GridSearchCV(pipe, params, n_jobs=3, cv=time_cv)

### Fitting the Training Set using GridSearch:

In [168]:
rf_search.fit(X_train, y_train)

GridSearchCV(cv=<generator object TimeSeriesSplit.split at 0x11a7335c8>,
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=42,
  svd_solver='auto', tol=0.0, whiten=False)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            ...stimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=3,
       param_grid={'pca__n_components': [2, 5, 8], 'rf__criterion': ['entropy'], 'rf__n_estimators': [2, 4, 6, 8, 10, 12, 14], 'rf__max_depth': [6, 8, 10, 12, 14, 16], 'rf__min_samples_split': [2, 4, 6, 8, 10], 'rf__min_samples_leaf': [2, 5, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

### Scoring on the Training Set:

In [169]:
rf_search.score(X_train, y_train)

0.54510556621881

### Scoring the Test Set:

In [170]:
rf_search.score(X_test, y_test)

0.5172413793103449

### Looking at the Best Parameters:

In [171]:
rf_search.best_estimator_.named_steps['rf'].get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 6,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 2,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [172]:
feature_importance = pd.DataFrame(rf_search.best_estimator_.named_steps['rf']\
                                  .feature_importances_, index=X_train.columns[:8])\
                                  .sort_values(0, ascending=False).sort_values(0,ascending=False)
feature_importance

Unnamed: 0,0
10-Q,0.239136
424B3,0.180925
8-A12B,0.177197
10-K405,0.123888
10-K,0.119019
424B5,0.110845
424B2,0.04899
8-K,0.0


---------

## Setting up the Bagging Classification Pipeline:

In [179]:
pipe2 = Pipeline([
#     ('pca', PCA(random_state=42)),
    ('br', BaggingClassifier(random_state=42))
])

### Setting the Parameters for the Bagging Classification Model:

In [180]:
# Number of trees in random forest
n_estimators = [int(x) for x in range(5, 100, 20)]

# Maximum number of levels in tree
max_features = [x for x in np.linspace(.10, 1, 5)]

n_components = [int(x) for x in range(2, 10, 3)]

criterion = ['entropy']

In [181]:
params2 = {
#     'br__base_estimator' : rf,
    'br__n_estimators' : n_estimators,
    'br__max_features' : max_features,   
}
print(params2)

{'br__n_estimators': [5, 25, 45, 65, 85], 'br__max_features': [0.1, 0.325, 0.55, 0.775, 1.0]}


### Setting up a Custom Cross Validation for Sequential Data:

In [182]:
time_cv2 = TimeSeriesSplit(n_splits=4).split(X_train)

### GridSearching the Bagging Classification Model:

In [183]:
grid = GridSearchCV(pipe2, params2, n_jobs=3, cv=time_cv2)

### Fitting the Training Set using GridSearch:

In [184]:
grid.fit(X_train, y_train)

GridSearchCV(cv=<generator object TimeSeriesSplit.split at 0x11af84b48>,
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('br', BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
         verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=3,
       param_grid={'br__n_estimators': [5, 25, 45, 65, 85], 'br__max_features': [0.1, 0.325, 0.55, 0.775, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

### Scoring on the Training Set:

In [185]:
grid.score(X_train, y_train)

0.5834932821497121

### Scoring on the Test Set:

In [186]:
grid.score(X_test, y_test)

0.43103448275862066

In [188]:
grid.best_estimator_.named_steps['br'].get_params()

{'base_estimator': None,
 'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 45,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}