# Preprocessing the Data for Classification Models

---------

###  Overview: 
- Importing Data:
    - [Importing the Stock Data](#Importing)
    - [Importing the SEC Data](#Importing)
- Preparing the Data:
    - [Shifting the Dates](#Shifting)
- [Splitting the Data](#Splitting)

--------


## Importing Libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

from datetime import datetime

import sys
sys.path.append('..')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

-----

## Company Name

In [2]:
# Company whose files are processed in this notebook; the name is interpolated
# into the 'data/{company_name}_*.csv' paths used for loading and saving.
company_name = 'Apple'

-------

<a class="anchor" id="Importing"></a>
# Importing the Data

### Importing the Raw Dataframe:

In [3]:
def file_importer(company_name, file_name):
    """
    Imports a company's CSV file as a date-indexed dataframe.

    Reads ``data/{company_name}_{file_name}.csv`` (relative to the current
    working directory), converts the ``Date`` column to datetimes, sets it as
    the index and sorts the rows in ascending date order.

    Parameters
    ------------
    company_name : str
        The company's name; used as the prefix of the file name.

    file_name : str
        The rest of the file name, without the ``.csv`` extension.

    Returns
    ------------
    pandas.DataFrame
        The file's contents indexed by ``Date`` and sorted ascending.
    """
    df = pd.read_csv(f'data/{company_name}_{file_name}.csv')
    # Parse the dates so the index supports date-based slicing and sorting.
    df['Date'] = pd.to_datetime(df.Date)
    # Build the indexed, sorted frame as a new object instead of mutating in place.
    return df.set_index('Date').sort_index(ascending=True)

### Importing the Engineered Dataframe:

In [4]:
# Load 'data/{company_name}_wSEC_Inner.csv' as a date-indexed, date-sorted
# dataframe and preview its first three rows.
df = file_importer(company_name, 'wSEC_Inner')
df.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,Adj_Low,...,PX14A6G,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1994-01-26,33.75,34.0,33.25,33.5,1480400.0,0.0,1.0,1.057482,1.065315,1.041815,...,0,0,0,0,0,0,0,0,0,0
1994-01-26,33.75,34.0,33.25,33.5,1480400.0,0.0,1.0,1.057482,1.065315,1.041815,...,0,0,0,0,0,0,0,0,0,0
1994-02-10,36.25,37.5,36.0,36.5,2696700.0,0.0,1.0,1.139548,1.178843,1.131689,...,0,0,0,0,0,0,1,0,0,0


--------
<a class="anchor" id="Shifting"></a>
# Preparing the Data:


### Shifting the Dates for the Engineered Dataframe:


In [5]:
from lib.helper import date_shifter

In [6]:
# date_shifter is a project helper from lib.helper; its implementation is not
# shown in this notebook.
# NOTE(review): comparing the two previews, each row's values appear to be moved
# onto the following date in the index — confirm against lib/helper.py.
df_shifted = date_shifter(df)
df_shifted.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,Adj_Low,...,PX14A6G,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1994-01-26,33.75,34.0,33.25,33.5,1480400.0,0,1,1.05748,1.06532,1.04182,...,0,0,0,0,0,0,0,0,0,0
1994-02-10,33.75,34.0,33.25,33.5,1480400.0,0,1,1.05748,1.06532,1.04182,...,0,0,0,0,0,0,0,0,0,0
1994-02-17,36.25,37.5,36.0,36.5,2696700.0,0,1,1.13955,1.17884,1.13169,...,0,0,0,0,0,0,1,0,0,0


### Setting the Label:

In [7]:
def _direction_label(diff):
    """Return '1' when the adjusted-close difference is non-negative, else '0'."""
    if diff >= 0:
        return '1'
    # Anything failing the `>= 0` comparison (negative values, NaN) is labelled '0'.
    return '0'


# The label is stored as a string here; a later cell converts it to a number.
df_shifted['Target'] = df_shifted['Adj_Close_Diff'].apply(_direction_label)

### Dropping the Continuous Data and Keeping the Categorical:

In [8]:
# Label-based column slice: keep every column from 'document_type' through
# 'Target' (both ends included, in the dataframe's column order), which drops
# the columns that come before 'document_type'.
new_df = df_shifted.loc[:, 'document_type':'Target']

### Converting Numeric-Like Columns to Numbers (Text Columns Are Left Unchanged):

In [9]:
def _numeric_if_possible(column):
    """Return *column* converted to a numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Non-numeric columns (e.g. text) are passed through untouched.
        return column


# `pd.to_numeric(..., errors='ignore')` is deprecated in recent pandas releases;
# catching the conversion error explicitly keeps the same behaviour: columns that
# parse as numbers are converted, all others are left as they are.
new_df = new_df.apply(_numeric_if_possible)

In [10]:
# Preview the last five rows to check the converted columns and the Target label.
new_df.tail()

Unnamed: 0_level_0,document_type,10-K,10-K405,10-Q,424B2,424B3,424B5,8-A12B,8-K,CERTNYS,...,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-02-01,SC 13G,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2018-02-02,8-K,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2018-02-12,10-Q,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2018-02-14,SC 13G,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2018-03-07,8-K,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


--------
<a class="anchor" id="Splitting"></a>
# Splitting the Data into a Training and Test Set:

In [11]:
from lib.helper import data_splitter

In [12]:
# data_splitter is a project helper from lib.helper; its implementation is not
# shown in this notebook. It returns two dataframes, unpacked as train and test.
# NOTE(review): the two dates look like the last training date and the first
# test date — confirm the boundary semantics against lib/helper.py.
X_train, X_test = data_splitter(new_df, '2016-12-30', '2017-01-03')

### Taking a Look at the Train Set:

In [13]:
# Preview the first three training rows (the Target column is still present here).
X_train.head(3)

Unnamed: 0_level_0,document_type,10-K,10-K405,10-Q,424B2,424B3,424B5,8-A12B,8-K,CERTNYS,...,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1994-01-26,424B5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1994-02-10,10-Q,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1994-02-17,SC 13G,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


### Dropping the Target Label from the Train Set:

In [14]:
# Remove the label column so X_train holds features only.
# `columns=` already identifies the axis, so no `axis` argument is needed, and
# rebinding to the returned frame avoids mutating in place an object that may be
# a slice of `new_df` (which can trigger pandas' SettingWithCopyWarning).
X_train = X_train.drop(columns='Target')
X_train.head(3)

Unnamed: 0_level_0,document_type,10-K,10-K405,10-Q,424B2,424B3,424B5,8-A12B,8-K,CERTNYS,...,PX14A6G,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1994-01-26,424B5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1994-02-10,10-Q,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1994-02-17,SC 13G,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


### Taking a Look at the Test Set:

In [15]:
# Preview the first five test rows (the Target column is still present here).
X_test.head()

Unnamed: 0_level_0,document_type,10-K,10-K405,10-Q,424B2,424B3,424B5,8-A12B,8-K,CERTNYS,...,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-06,8-K,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2017-01-06,DEF 14A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2017-01-19,DEFA14A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2017-01-31,SC 13G,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2017-01-31,PX14A6G,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Dropping the Target Label from the Test Set:

In [16]:
# Remove the label column so X_test holds features only.
# `columns=` already identifies the axis, so no `axis` argument is needed, and
# rebinding to the returned frame avoids mutating in place an object that may be
# a slice of `new_df` (which can trigger pandas' SettingWithCopyWarning).
X_test = X_test.drop(columns='Target')
X_test.head(3)

Unnamed: 0_level_0,document_type,10-K,10-K405,10-Q,424B2,424B3,424B5,8-A12B,8-K,CERTNYS,...,PX14A6G,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-06,8-K,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2017-01-06,DEF 14A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2017-01-19,DEFA14A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Training labels: the Target values of every row in `new_df` whose date lies
# between the first and last training dates (label-based slice, both ends included).
# NOTE(review): this matches X_train row-for-row only if X_train is a contiguous
# date range of `new_df` — verify against data_splitter.
train_start, train_end = X_train.index[0], X_train.index[-1]
y_train = new_df.loc[train_start:train_end, 'Target'].values

In [18]:
# Test labels: the Target values of every row in `new_df` whose date lies
# between the first and last test dates (label-based slice, both ends included).
# NOTE(review): this matches X_test row-for-row only if X_test is a contiguous
# date range of `new_df` — verify against data_splitter.
test_start, test_end = X_test.index[0], X_test.index[-1]
y_test = new_df.loc[test_start:test_end, 'Target'].values

-------

### Saving the Data onto a CSV:

In [19]:
# Write the training features to CSV, keeping the Date index as a column.
train_csv_path = f'data/{company_name}_SEC_X_Train.csv'
X_train.to_csv(train_csv_path, index=True)

In [21]:
# Write the test features to CSV, keeping the Date index as a column.
test_csv_path = f'data/{company_name}_SEC_X_Test.csv'
X_test.to_csv(test_csv_path, index=True)