# Apple, Inc. (AAPL)

## Importing Libraries:

In [107]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.ensemble import (
    BaggingClassifier,
    RandomForestClassifier,
    RandomForestRegressor,
)
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler


import sys
sys.path.append('..')

%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


-----

## Company Name

In [60]:
company_name = 'Apple'

## Importing the Data:

In [61]:
def data_reader(company_name):
    """
    Load a company's cleaned price history as a date-indexed data frame.

    Reads ``data/{company_name}_Clean.csv`` (relative to the working
    directory), parses its ``Date`` column to datetimes, makes that column
    the index, and sorts the rows in ascending date order.

    Parameters
    ----------
    company_name : str
        Prefix of the CSV file name, e.g. ``'Apple'``.

    Returns
    -------
    pd.DataFrame
        The CSV contents indexed by ``Date``, oldest row first.
    """
    df = pd.read_csv(f'data/{company_name}_Clean.csv')
    df['Date'] = pd.to_datetime(df['Date'])
    return df.set_index('Date').sort_index(ascending=True)

In [62]:
# def engineered_data_reader(company_name):
#     company_name=company_name
#     df = pd.read_csv(f'data/{company_name}_Engineered.csv')
#     df['Date'] = pd.to_datetime(df.Date)
#     df.set_index('Date', inplace=True)
#     df.sort_index(inplace=True, ascending=True)
#     return df

In [63]:
df = data_reader(company_name)
df.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,Adj_Low,Adj_Close,Adj_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1980-12-12,28.75,28.87,28.75,28.75,2093900.0,0.0,1.0,0.422706,0.42447,0.422706,0.422706,117258400.0
1980-12-15,27.38,27.38,27.25,27.25,785200.0,0.0,1.0,0.402563,0.402563,0.400652,0.400652,43971200.0
1980-12-16,25.37,25.37,25.25,25.25,472000.0,0.0,1.0,0.37301,0.37301,0.371246,0.371246,26432000.0


In [64]:
# df_engineered = engineered_data_reader(company_name)

In [65]:
# df_engineered.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,Adj_Low,...,Low_Long_EMA,Close_Long_EMA,Volume_Long_EMA,Ex_Dividend_Long_EMA,Split_Ratio_Long_EMA,Adj_Open_Long_EMA,Adj_High_Long_EMA,Adj_Low_Long_EMA,Adj_Close_Long_EMA,Adj_Volume_Long_EMA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1980-12-12,28.75,28.87,28.75,28.75,2093900.0,0.0,1.0,0.422706,0.42447,0.422706,...,28.75,28.75,2093900.0,0.0,1.0,0.422706,0.42447,0.422706,0.422706,117258400.0
1980-12-15,27.38,27.38,27.25,27.25,785200.0,0.0,1.0,0.402563,0.402563,0.400652,...,28.715116,28.715116,2063465.0,0.0,1.0,0.422237,0.423961,0.422193,0.422193,115554000.0
1980-12-16,25.37,25.37,25.25,25.25,472000.0,0.0,1.0,0.37301,0.37301,0.371246,...,28.634532,28.634532,2026454.0,0.0,1.0,0.421093,0.422776,0.421008,0.421008,113481400.0


In [66]:
# def reader(company_name):
#     company_name = company_name
    
#     stock = pd.read_csv(f'data/{company_name}_Engineered.csv')
#     stock['Date'] = pd.to_datetime(stock.Date)
#     stock.set_index('Date', inplace=True)
    
#     sec_filings = pd.read_csv(f'../sec/data/{company_name}_SEC_clean.csv')

#     sec_filings.rename({'date':'Date'}, axis=1, inplace=True)
#     sec_filings['Date'] = pd.to_datetime(sec_filings.Date)
#     sec_filings.set_index('Date', inplace=True)

#     data = pd.merge(stock, sec_filings, on='Date', how='inner')
#     data.reset_index(level=0, inplace=True)
#     return data

In [67]:
# df = reader(company_name)

In [68]:
# df.head()

-----

# Splitting the Data into a Training and Testing Set

## Creating a Function for the Training Set with Feature Engineering:

In [69]:
def shift_dates(df):
    """
    Lag every row of ``df`` by one observation.

    The values at position ``i`` are re-labelled with the index entry at
    position ``i + 1``. The first index label and the last row of values
    are dropped, so the result has one row fewer than the input.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose rows are already in the desired (chronological) order.

    Returns
    -------
    pd.DataFrame
        A new frame with the same columns and per-column dtypes as ``df``;
        the input is not modified.
    """
    # Slice-and-relabel instead of rebuilding from `.values`: going through a
    # single 2-D array would coerce all columns to one common dtype.
    shifted_df = df.iloc[:-1].copy()
    shifted_df.index = df.index[1:]
    return shifted_df

In [70]:
temp_df = df.copy()
temp_df = shift_dates(temp_df)
temp_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,Adj_Low,Adj_Close,Adj_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1980-12-15,28.75,28.87,28.75,28.75,2093900.0,0.0,1.0,0.422706,0.42447,0.422706,0.422706,117258400.0
1980-12-16,27.38,27.38,27.25,27.25,785200.0,0.0,1.0,0.402563,0.402563,0.400652,0.400652,43971200.0
1980-12-17,25.37,25.37,25.25,25.25,472000.0,0.0,1.0,0.37301,0.37301,0.371246,0.371246,26432000.0
1980-12-18,25.87,26.0,25.87,25.87,385900.0,0.0,1.0,0.380362,0.382273,0.380362,0.380362,21610400.0
1980-12-19,26.63,26.75,26.63,26.63,327900.0,0.0,1.0,0.391536,0.3933,0.391536,0.391536,18362400.0


In [39]:
temp_df1 = temp_df.diff()
temp_df1.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,Adj_Low,...,Low_Long_EMA,Close_Long_EMA,Volume_Long_EMA,Ex_Dividend_Long_EMA,Split_Ratio_Long_EMA,Adj_Open_Long_EMA,Adj_High_Long_EMA,Adj_Low_Long_EMA,Adj_Close_Long_EMA,Adj_Volume_Long_EMA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1980-12-15,,,,,,,,,,,...,,,,,,,,,,
1980-12-16,-1.37,-1.49,-1.5,-1.5,-1308700.0,0.0,0.0,-0.020143,-0.021907,-0.022054,...,-0.034884,-0.034884,-30434.883721,0.0,0.0,-0.000468,-0.000509,-0.000513,-0.000513,-1704353.0
1980-12-17,-2.01,-2.01,-2.0,-2.0,-313200.0,0.0,0.0,-0.029553,-0.029553,-0.029406,...,-0.080584,-0.080584,-37010.816658,0.0,0.0,-0.001145,-0.001185,-0.001185,-0.001185,-2072606.0


In [71]:
temp_df2 = temp_df.pct_change()
temp_df2.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,Adj_Low,Adj_Close,Adj_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1980-12-15,,,,,,,,,,,,
1980-12-16,-0.047652,-0.051611,-0.052174,-0.052174,-0.625006,,0.0,-0.047652,-0.051611,-0.052174,-0.052174,-0.625006
1980-12-17,-0.073411,-0.073411,-0.073394,-0.073394,-0.398879,,0.0,-0.073411,-0.073411,-0.073394,-0.073394,-0.398879


In [80]:
def engineer_this(dataframe, short_window=12, mid_window=26, long_window=85):
    """
    Return a date-indexed data frame extended with engineered features.

    For every input column the result keeps the original values and adds,
    in this order, columns with the following suffixes:

    * ``_PCT_Change`` : row-over-row percent change
    * ``_Diff``       : row-over-row first difference
    * ``_Short_SMA``, ``_Mid_SMA``, ``_Long_SMA`` : simple moving averages
    * ``_Short_EMA``, ``_Mid_EMA``, ``_Long_EMA`` : exponential moving
      averages (``adjust=False``)

    Parameters
    ---------------------------------------------------------------------------------------------------------
    dataframe : pd.DataFrame
        Price data, either with a 'Date' column or already indexed by date.
        The input is not modified.
    short_window : int, default 12
        Window / span of the short moving averages.
    mid_window : int, default 26
        Window / span of the medium moving averages.
    long_window : int, default 85
        Window / span of the long moving averages.

    Returns
    ---------------------------------------------------------------------------------------------------------
    pd.DataFrame
        The engineered frame, sorted by date in ascending order.
    """
    # Setting the Date as the index (unless it already is) and sorting in
    # ascending order; both calls return new objects, so the caller's frame
    # is left untouched.
    if 'Date' in dataframe.columns:
        temp_df = dataframe.set_index('Date')
    else:
        temp_df = dataframe
    temp_df = temp_df.sort_index(ascending=True)

    windows = (('Short', short_window), ('Mid', mid_window), ('Long', long_window))

    # Original columns first, then the percent change and difference per row.
    features = [
        temp_df,
        temp_df.pct_change().add_suffix('_PCT_Change'),
        temp_df.diff().add_suffix('_Diff'),
    ]

    # Simple moving averages, then exponential moving averages.
    features.extend(
        temp_df.rolling(window=window).mean().add_suffix(f'_{label}_SMA')
        for label, window in windows
    )
    features.extend(
        temp_df.ewm(span=window, adjust=False).mean().add_suffix(f'_{label}_EMA')
        for label, window in windows
    )

    # Every feature frame shares temp_df's index, so a column-wise concat
    # lines them up row for row. Unlike a chain of index merges, it does not
    # multiply rows when a date occurs more than once.
    return pd.concat(features, axis=1)

In [81]:
df = df.reset_index()

In [84]:
# Build the engineered feature frame from the reset-index price data.
# NOTE(review): `.head()` keeps only the first five rows, so `engineered_df`
# holds a preview rather than the full engineered frame — confirm this is
# intended before the result is saved or used for modelling.
engineered_df = engineer_this(df).head()

In [85]:
# engineered_df.to_csv(f'../stocks/data/{company_name}_Engineered_pctChange.csv', index=False)

In [None]:
# temp_df['Date'] = pd.to_datetime(df.Date)
# temp_df.set_index('Date', inplace=True)
# temp_df.sort_index(inplace=True, ascending=True)

## Taking a Look at the Time-Shifted Data Set:

In [10]:
df_shift.tail(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,Adj_Low,...,Low_Long_EMA,Close_Long_EMA,Volume_Long_EMA,Ex_Dividend_Long_EMA,Split_Ratio_Long_EMA,Adj_Open_Long_EMA,Adj_High_Long_EMA,Adj_Low_Long_EMA,Adj_Close_Long_EMA,Adj_Volume_Long_EMA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-03-23,170.0,172.68,168.6,168.845,41051076.0,0.0,1.0,170.0,172.68,168.6,...,169.691723,171.097703,33582580.0,0.000518,1.0,171.126419,172.657153,169.671634,171.077463,33582580.0
2018-03-26,168.39,169.92,164.94,164.94,40248954.0,0.0,1.0,168.39,169.92,164.94,...,169.581218,170.9545,33737610.0,0.000506,1.0,171.062781,172.593499,169.561596,170.934731,33737610.0
2018-03-27,168.07,173.1,166.44,172.77,36272617.0,0.0,1.0,168.07,173.1,166.44,...,169.508167,170.996721,33796570.0,0.000494,1.0,170.993182,172.605278,169.489,170.977412,33796570.0


## Split the Data to Predict 2017+

In [11]:
df_shift[:'2017-01-03'].tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,Adj_Low,...,Low_Long_EMA,Close_Long_EMA,Volume_Long_EMA,Ex_Dividend_Long_EMA,Split_Ratio_Long_EMA,Adj_Open_Long_EMA,Adj_High_Long_EMA,Adj_Low_Long_EMA,Adj_Close_Long_EMA,Adj_Volume_Long_EMA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-12-27,115.59,116.52,115.59,116.52,14249484.0,0.0,1.0,114.162295,115.080808,114.162295,...,110.425003,111.333832,33740190.0,0.00748,1.0,109.439013,110.369718,108.756434,109.651716,33740210.0
2016-12-28,116.52,117.8,116.49,117.26,18296855.0,0.0,1.0,115.080808,116.344998,115.051178,...,110.56605,111.47165,33381050.0,0.007306,1.0,109.570218,110.508678,108.902823,109.794971,33381060.0
2016-12-29,117.52,118.0166,116.2,116.76,20905892.0,0.0,1.0,116.068456,116.558923,114.76476,...,110.697072,111.594635,33090930.0,0.007136,1.0,109.72134,110.649382,109.039147,109.923409,33090940.0
2016-12-30,116.45,117.1095,116.4,116.73,15039519.0,0.0,1.0,115.011672,115.663027,114.96229,...,110.829698,111.714062,32671130.0,0.00697,1.0,109.844371,110.765978,109.176895,110.048172,32671140.0
2017-01-03,116.65,117.2,115.43,115.82,30586265.0,0.0,1.0,115.209202,115.752409,114.004271,...,110.936682,111.809549,32622640.0,0.006808,1.0,109.969134,110.881942,109.289159,110.149132,32622660.0


In [12]:
df_shift['2017-01-03':].head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,Adj_Low,...,Low_Long_EMA,Close_Long_EMA,Volume_Long_EMA,Ex_Dividend_Long_EMA,Split_Ratio_Long_EMA,Adj_Open_Long_EMA,Adj_High_Long_EMA,Adj_Low_Long_EMA,Adj_Close_Long_EMA,Adj_Volume_Long_EMA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-03,116.65,117.2,115.43,115.82,30586265.0,0.0,1.0,115.209202,115.752409,114.004271,...,110.936682,111.809549,32622640.0,0.006808,1.0,109.969134,110.881942,109.289159,110.149132,32622660.0
2017-01-04,115.8,116.33,114.76,116.15,28781865.0,0.0,1.0,114.369701,114.893155,113.342546,...,111.025596,111.91049,32533320.0,0.00665,1.0,110.071473,110.975226,109.383424,110.255324,32533340.0
2017-01-05,115.85,116.51,115.75,116.02,21118116.0,0.0,1.0,114.419083,115.070931,114.320318,...,111.135466,112.00606,32267850.0,0.006495,1.0,110.17258,111.070475,109.498236,110.35606,32267870.0
2017-01-06,115.92,116.8642,115.81,116.61,22193587.0,0.0,1.0,114.488219,115.420756,114.379577,...,111.244176,112.113128,32033570.0,0.006344,1.0,110.272944,111.171644,109.611755,110.468005,32033580.0
2017-01-09,116.78,118.16,116.47,117.91,31751900.0,0.0,1.0,115.337596,116.700551,115.031425,...,111.365707,112.247939,32027020.0,0.006197,1.0,110.390726,111.300223,109.737794,110.607206,32027030.0


In [13]:
X_train, X_test = df_shift[:'2016-12-30'], df_shift['2017-01-03':]

In [49]:
# Cast every column to integers in one frame-level call.
# NOTE(review): the cast truncates fractional parts, so values below 1
# (e.g. early adjusted prices) become 0 — confirm this loss is acceptable.
df_shift = df_shift.astype(int)
df_shift.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,Adj_Low,...,Low_Long_EMA,Close_Long_EMA,Volume_Long_EMA,Ex_Dividend_Long_EMA,Split_Ratio_Long_EMA,Adj_Open_Long_EMA,Adj_High_Long_EMA,Adj_Low_Long_EMA,Adj_Close_Long_EMA,Adj_Volume_Long_EMA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1981-04-15,27,28,27,27,29700,0,1,0,0,0,...,27,27,395955,0,1,0,0,0,0,22173534
1981-04-16,26,26,26,26,152000,0,1,0,0,0,...,27,27,390282,0,1,0,0,0,0,21855823
1981-04-20,25,25,25,25,106600,0,1,0,0,0,...,27,27,383685,0,1,0,0,0,0,21486376
1981-04-21,25,25,25,25,157800,0,1,0,0,0,...,27,27,378432,0,1,0,0,0,0,21192200
1981-04-22,27,27,27,27,127400,0,1,0,0,0,...,27,27,372594,0,1,0,0,0,0,20865275


## Computing Average Uniqueness: 

-----

# Normalizing the Data with a MinMaxScaler

## Instantiating the Scaler:

In [39]:
scaler = MinMaxScaler(feature_range=(-1, 1))

## Scaling the Training Set:

In [40]:
X_train_sc = scaler.fit_transform(X_train.values)

## Scaling the Testing Set:

In [41]:
X_test_sc = scaler.transform(X_test.values)

## Setting the y Training Set:

In [42]:
y_train = df[X_train.index[0]:X_train.index[-1]].Close.values

## Setting the y Testing Set

In [43]:
y_test = df[X_test.index[0]:X_test.index[-1]].Close.values

-----

# Random Forest Classification Model

## Setting up the Random Forest (RF) Classification:

In [50]:
# Random-forest classifier: 100 entropy-criterion trees, depth capped at 15,
# at least 3 samples per leaf, bootstrap sampling, fixed seed.
# NOTE(review): a classifier needs discrete class labels, but the fit further
# down is called with `Close` prices and fails with
# "Unknown label type: 'continuous'" — confirm the intended target (e.g. an
# up/down label) or switch to a regressor.
rf = RandomForestClassifier(n_estimators=100, criterion='entropy', 
                            max_depth=15, min_samples_leaf=3, bootstrap=True, 
                            n_jobs=3, random_state=42, class_weight='balanced_subsample')

In [51]:
# Bagging ensemble of 100 copies of the forest `rf`, each using all features.
# NOTE(review): scikit-learn documents a float `max_samples` as a fraction of
# the training set, so 7.02 looks out of range — presumably a placeholder for
# the average-uniqueness value this notebook intends to compute; confirm.
bc = BaggingClassifier(base_estimator=rf, n_estimators=100, max_samples=7.02, 
                   max_features=1.0, n_jobs=3, random_state=42)

### Fitting the Scaled Data with the RF Model:

In [52]:
bc.fit(X_train_sc, y_train)

ValueError: Unknown label type: 'continuous'

### Scoring on the Training Data:

In [None]:
bc.score(X_train_sc, y_train)

### Scoring on the Testing Data

In [None]:
bc.score(X_test_sc, y_test)

### Inspecting the Average Prediction:

In [None]:
y_test.mean()

In [None]:
bc.predict(X_test_sc).mean()

In [None]:
error.error

-----

# Grid Searching a Random Forest Regression Model:

In [None]:
from sklearn.model_selection import GridSearchCV

### Creating a Pipeline

In [None]:
pipe = Pipeline([
    ('rf', RandomForestRegressor())
])

### Setting up the Parameters:

In [None]:
np.linspace(40, 100, 5)

In [None]:
# Tree counts to try: five evenly spaced values from 40 to 100.
n_estimators = np.linspace(40, 100, num=5).astype(int).tolist()

# Candidate rules for how many features each split may consider.
max_features = ['auto', 'sqrt']

# Depth limits to try: the endpoints 2 and 80, plus None.
max_depth = [*np.linspace(2, 80, num=2).astype(int).tolist(), None]

# Smallest node size that may still be split.
min_samples_split = [2, 5, 10]

# Smallest number of samples allowed in a leaf.
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

In [None]:
params = {'rf__n_estimators': n_estimators,
           'rf__max_features': max_features,
           'rf__max_depth': max_depth,
           'rf__min_samples_split': min_samples_split,
           'rf__min_samples_leaf': min_samples_leaf,
           'rf__bootstrap': bootstrap}
print(params)

### Gridsearching the Parameters:

In [None]:
# The rows are time-ordered, so validate with forward-chaining splits: each
# fold is scored only on observations later than the ones it was trained on.
rf_search = GridSearchCV(pipe, params, cv=TimeSeriesSplit(n_splits=3), n_jobs=3)

### Fitting the Scaled Data with the Model:

In [None]:
rf_search.fit(X_train_sc, y_train)

### Scoring the Training Data:

In [None]:
rf_search.score(X_train_sc, y_train)

### Scoring the Test Data

In [None]:
rf_search.score(X_test_sc, y_test)

In [None]:
plt.scatter(y_test, rf_search.predict(X_test_sc))
plt.xlabel('Actual')
plt.ylabel('Predicted')

- `'rf__n_estimators': [40, 55, 70, 85, 100],` => `n_estimators=10, `


- `'rf__max_features': ['auto', 'sqrt'],` => `max_features='auto',`


- `'rf__max_depth': [2, 80, None],` => `max_depth=None,`


- `'rf__min_samples_split': [2, 5, 10],` => `min_samples_split=2,`


- `'rf__min_samples_leaf': [1, 2, 4],` => `min_samples_leaf=1,`


- `'rf__bootstrap': [True, False]` => `bootstrap=True,`

--------

## GridSearching a Random Forest with Weights:

In [None]:
pipe_w = Pipeline([
    ('pca', PCA()),
    ('rf', RandomForestRegressor())
])

### Setting the Params

In [None]:
# Number of trees in random forest
n_estimators_w = list(range(6, 16, 2))

# Number of features to consider at every split
max_features_w = ['auto', 'log2']

# Maximum number of levels in tree
max_depth_w = [*range(1, 5), None]

# Minimum number of samples required to split a node.
# A node needs at least two samples to be split, so the search starts at 2;
# an integer value of 1 is rejected by scikit-learn.
min_samples_split_w = list(range(2, 5))

# Minimum number of samples required at each leaf node
min_samples_leaf_w = list(range(1, 5))

# Method of selecting samples for training each tree
bootstrap_w = [True, False]

# Candidate numbers of principal components for the PCA step
pca_n_components = list(range(2, 24, 4))

# Candidate SVD solvers for the PCA step
pca_svd_solver = ['auto', 'full', 'arpack', 'randomized']

### Setting up the Parameters with PCA Weights:

In [None]:
# Search space for the PCA + random-forest pipeline; every key carries the
# `rf__` prefix of the pipeline step named 'rf'.
# NOTE(review): `pca_n_components` and `pca_svd_solver` are defined above but
# are not part of this grid, so no PCA setting is searched — confirm whether
# `pca__...` entries were meant to be included.
params_w = {'rf__n_estimators': n_estimators_w,
          'rf__max_features': max_features_w,
          'rf__max_depth': max_depth_w,
          'rf__min_samples_split': min_samples_split_w,
          'rf__min_samples_leaf': min_samples_leaf_w,
          'rf__bootstrap': bootstrap_w}
print(params_w)

### Gridsearching the Parameters with PCA:

In [None]:
# The rows are time-ordered, so validate with forward-chaining splits: each
# fold is scored only on observations later than the ones it was trained on.
grid = GridSearchCV(pipe_w, params_w, cv=TimeSeriesSplit(n_splits=3), n_jobs=3)

### Fitting the Scaled Data with a Weighted Model:

In [None]:
grid.fit(X_train_sc, y_train)

### Scoring the Training Data:

In [None]:
grid.score(X_train_sc, y_train)

### Scoring the Test Data:

In [None]:
grid.score(X_test_sc, y_test)

In [None]:
plt.scatter(y_test, grid.predict(X_test_sc))
plt.xlabel('Actual')
plt.ylabel('Predicted')

----

# Modeling

## Attempting a Simple Linear Regression Model:

In [None]:
# from sklearn.linear_model import LinearRegression

In [None]:
# lr = LinearRegression()

In [None]:
# lr.fit(X_train_sc, y_train)

In [None]:
# lr.score(X_train_sc, y_train)

In [None]:
# lr.score(X_test_sc, y_test)

In [None]:
# plt.scatter(y_test, lr.predict(X_test_sc))
# plt.xlabel('Actual')
# plt.ylabel('Predicted')

In [None]:
# coef_weights = pd.DataFrame(lr.coef_, index=X_train.columns, columns=['weight'])

In [None]:
# coef_weights.sort_values('weight').tail()

# Time Series Split

`TimeSeriesSplit(n_splits=3, max_train_size=None)`

A Time Series cross-validator providing both a train and test index to split time series data observed at fixed time intervals. During each split, the test indices must be higher (in time) than before; therefore, random shuffling is inappropriate.

A variation of K-Fold: in the $k$-th split, it returns the first $k$ folds as the training set and the $(k+1)$-th fold as the test set. However, unlike standard cross-validation methods, successive training sets are supersets of those that come before them.

## Scaling the Data

### Fitting and Transforming the Training Set:

In [None]:
# X_train_scaled = scaler.fit_transform(X.values)

### Checking the Shape:

In [None]:
# X_train_scaled.shape

In [None]:
# X_train_scaled

### Transforming the Prediction Label: 

In [None]:
# X_test_transformed = scaler.transform(test.values)

### Checking the Shape:

In [None]:
# X_test_transformed.shape

In [None]:
# X_test_transformed

## Splitting the Data using TimeSeriesSplit:

In [None]:
# tss = TimeSeriesSplit()  

# for train_index, test_index in tss.split(X_train_sc):
#     print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_test = X_train_sc[train_index], X_train_sc[test_index]
#     y_train, y_test = X_train_sc[train_index], X_train_sc[test_index]

### Inspecting the Shape:

In [None]:
# for train_index, test_index in tss.split(X_train_sc):
#     print("TRAIN:", train_index, "TEST:", test_index)
#     X_train, X_test = X_train_sc[train_index], X_train_sc[test_index]
#     y_train, y_test = X_train_sc[train_index], X_train_sc[test_index]

### Inspecting the Shape:

### Inspecting the Shape:

In [None]:
# print('Shapes: ', '\n'
#     'X_train: ', X_train.shape, '\n'
#      'X_test: ', X_test.shape, '\n'
#       '\n'
#      'y_train: ',y_train.shape, '\n'
#       'y_test: ', y_test.shape)