# Apple, Inc. (AAPL) - Preparing the Data for Classification Models

---------

###  Overview: 
- 1) Importing Data:
    - [Importing the Stock Data](#Stock)
    - [Importing the SEC Data](#SEC)
- 2) [Feature Engineering](#FE)
- 3) [Importing & Merging the Engineered Stock Data and the SEC Data](#PCT)
- 4) [Filling the Null Values](#Nulls)

--------


## Importing Libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

from datetime import datetime

import sys
sys.path.append('..')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

-----

## Company Name

In [2]:
# Prefix of every CSV file name read by the importer functions in this
# notebook (e.g. data/{company_name}_Clean.csv).
company_name = 'Apple'

-------
<a class="anchor" id="Importing"></a>

# Importing the Data


## Creating a Function to Import the Stock Data:

In [2]:
def file_importer(company_name, file_name):
    """
    Load one of a company's CSV files as a date-indexed dataframe.

    The file is read from ``data/{company_name}_{file_name}.csv``. Its
    ``Date`` column is parsed into datetimes and used as the index, and the
    rows are ordered from the oldest date to the newest.

    Parameters
    ------------
    company_name : str
        Company name used as the prefix of the file name.
    file_name : str
        Suffix of the file name (without the ``.csv`` extension).

    Returns
    ------------
    pd.DataFrame
        The file's contents indexed by ``Date`` in ascending order.
    """
    csv_path = f'data/{company_name}_{file_name}.csv'
    frame = pd.read_csv(csv_path)

    # Parse the dates first so that the index sorts chronologically rather
    # than as plain strings.
    frame['Date'] = pd.to_datetime(frame['Date'])
    return frame.set_index('Date').sort_index(ascending=True)

----
<a class="anchor" id="Stock"></a>

## Importing the Raw Stock Dataframe:

In [4]:
# Load data/{company_name}_Clean.csv as a date-indexed, date-sorted frame.
df_raw = file_importer(company_name, 'Clean')
# Preview the first three rows.
df_raw.head(3)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,Adj_Low,Adj_Close,Adj_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1980-12-12,28.75,28.87,28.75,28.75,2093900.0,0.0,1.0,0.422706,0.42447,0.422706,0.422706,117258400.0
1980-12-15,27.38,27.38,27.25,27.25,785200.0,0.0,1.0,0.402563,0.402563,0.400652,0.400652,43971200.0
1980-12-16,25.37,25.37,25.25,25.25,472000.0,0.0,1.0,0.37301,0.37301,0.371246,0.371246,26432000.0


------
<a class="anchor" id="SEC"></a>

## Importing the SEC Data:

In [5]:
def sec_file_importer(company_name):
    """
    Load a company's cleaned SEC filings as a date-indexed dataframe.

    The file is read from ``../sec/data/{company_name}_SEC_clean.csv``. Its
    ``date`` column is renamed to ``Date``, parsed into datetimes and used
    as the index, and the rows are ordered from the oldest date to the
    newest.

    Parameter
    -----------
    company_name : str
        Company name used as the prefix of the file name.

    Returns
    -----------
    pd.DataFrame
        The SEC filings indexed by ``Date`` in ascending order.
    """
    filings = pd.read_csv(f'../sec/data/{company_name}_SEC_clean.csv')

    # Rename 'date' to 'Date' so the column name matches the one used by
    # the rest of this function.
    filings = filings.rename(columns={'date': 'Date'})
    filings['Date'] = pd.to_datetime(filings['Date'])
    return filings.set_index('Date').sort_index(ascending=True)

### Importing the SEC Clean (and Engineered) Data:

In [6]:
# Load ../sec/data/{company_name}_SEC_clean.csv as a date-indexed, date-sorted frame.
sec_df = sec_file_importer(company_name)
# Preview the first three rows.
sec_df.head(3)

Unnamed: 0_level_0,document_type,10-K,10-K405,10-Q,424B2,424B3,424B5,8-A12B,8-K,CERTNYS,...,PX14A6G,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1994-01-26,424B5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1994-01-26,10-Q,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1994-02-10,SC 13G,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


------
<a class="anchor" id="FE"></a>

# Feature Engineering:

In [7]:
def engineer_data(dataframe):
    """
    Return the data frame extended with day-to-day changes and moving averages.

    For every column of the input (other than ``Date``) eight derived
    columns are appended, named ``<column><suffix>``:

    - ``_PCT_Change`` : percent change from the previous row
    - ``_Diff``       : difference from the previous row
    - ``_Short_SMA``, ``_Mid_SMA``, ``_Long_SMA`` : simple moving averages
      over 12, 26 and 85 rows
    - ``_Short_EMA``, ``_Mid_EMA``, ``_Long_EMA`` : exponential moving
      averages with spans of 12, 26 and 85 (``adjust=False``)

    Parameters
    ---------------------------------------------------------------------------------------------------------
    dataframe : pd.DataFrame()
        Must contain a ``Date`` column; it becomes the index of the result.
        The input itself is not modified.

    Returns
    ---------------------------------------------------------------------------------------------------------
    pd.DataFrame
        Indexed by ``Date`` in ascending order, holding the original columns
        followed by the derived columns in the order listed above. The
        result has exactly one row per input row.
    """
    base = dataframe.copy()

    # Setting the Date as the index and sorting in ascending order, so that
    # "previous row" means "previous date" for every calculation below.
    base.set_index('Date', inplace=True)
    base.sort_index(inplace=True, ascending=True)

    # Every derived frame shares base's index and column names; the suffix
    # tells the columns apart once they sit side by side.
    derived = [
        # Percent change and difference from one row to the next.
        ('_PCT_Change', base.pct_change()),
        ('_Diff', base.diff()),
        # Simple moving averages.
        ('_Short_SMA', base.rolling(window=12).mean()),
        ('_Mid_SMA', base.rolling(window=26).mean()),
        ('_Long_SMA', base.rolling(window=85).mean()),
        # Exponential moving averages.
        ('_Short_EMA', base.ewm(span=12, adjust=False).mean()),
        ('_Mid_EMA', base.ewm(span=26, adjust=False).mean()),
        ('_Long_EMA', base.ewm(span=85, adjust=False).mean()),
    ]

    # Concatenate column-wise instead of merging on the index: an index
    # merge is a many-to-many join, so any repeated date would multiply the
    # rows at every step. All frames share an identical index, so this is a
    # purely positional side-by-side placement.
    pieces = [base]
    pieces.extend(frame.add_suffix(suffix) for suffix, frame in derived)
    return pd.concat(pieces, axis=1)

### Resetting the Index to pass through the Engineer Data Function:

In [8]:
# engineer_data() calls set_index('Date') itself, so Date has to be a regular
# column again rather than the index.
df_raw = df_raw.reset_index()

### Engineering and Taking a Look:

In [9]:
# Add the percent-change, difference, SMA and EMA columns to the raw stock data.
engineered_df = engineer_data(df_raw)
# Preview the first five rows.
engineered_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,Adj_Low,...,Low_Long_EMA,Close_Long_EMA,Volume_Long_EMA,Ex_Dividend_Long_EMA,Split_Ratio_Long_EMA,Adj_Open_Long_EMA,Adj_High_Long_EMA,Adj_Low_Long_EMA,Adj_Close_Long_EMA,Adj_Volume_Long_EMA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1980-12-12,28.75,28.87,28.75,28.75,2093900.0,0.0,1.0,0.422706,0.42447,0.422706,...,28.75,28.75,2093900.0,0.0,1.0,0.422706,0.42447,0.422706,0.422706,117258400.0
1980-12-15,27.38,27.38,27.25,27.25,785200.0,0.0,1.0,0.402563,0.402563,0.400652,...,28.715116,28.715116,2063465.0,0.0,1.0,0.422237,0.423961,0.422193,0.422193,115554000.0
1980-12-16,25.37,25.37,25.25,25.25,472000.0,0.0,1.0,0.37301,0.37301,0.371246,...,28.634532,28.634532,2026454.0,0.0,1.0,0.421093,0.422776,0.421008,0.421008,113481400.0
1980-12-17,25.87,26.0,25.87,25.87,385900.0,0.0,1.0,0.380362,0.382273,0.380362,...,28.570241,28.570241,1988302.0,0.0,1.0,0.420145,0.421834,0.420063,0.420063,111344900.0
1980-12-18,26.63,26.75,26.63,26.63,327900.0,0.0,1.0,0.391536,0.3933,0.391536,...,28.525119,28.525119,1949688.0,0.0,1.0,0.41948,0.42117,0.4194,0.4194,109182500.0


-------

### Saving the Engineered Data Frame:

In [1]:
# engineered_df.to_csv(f'../stocks/data/{company_name}_Engineered_pctChange.csv', index=True) 

-----

----------
<a class="anchor" id="PCT"></a>


## Importing & Merging the Engineered Stock Data (with Percent Change) and the SEC Data:

In [11]:
def merged_data_importer(company_name, how):
    """
    Read the engineered stock data and the SEC filings and merge them on Date.

    The stock data comes from
    ``../stocks/data/{company_name}_Engineered_pctChange.csv`` and the SEC
    data from ``../sec/data/{company_name}_SEC_clean.csv``. The SEC file's
    ``date`` column is renamed to ``Date`` before merging.

    Parameters
    -------------
    company_name : str
        Company name used as the prefix of both file names.

    how : str
        Type of merge, passed to ``pd.merge`` (for example ``'inner'`` or
        ``'outer'``).

    Returns
    -------------
    pd.DataFrame
        The merged data with ``Date`` as a regular column.
    """
    stock_path = f'../stocks/data/{company_name}_Engineered_pctChange.csv'
    sec_path = f'../sec/data/{company_name}_SEC_clean.csv'

    # Engineered stock data, indexed by its parsed Date column.
    prices = pd.read_csv(stock_path)
    prices['Date'] = pd.to_datetime(prices['Date'])
    prices = prices.set_index('Date')

    # SEC filings, with 'date' renamed so both frames share the 'Date' key.
    filings = pd.read_csv(sec_path).rename(columns={'date': 'Date'})
    filings['Date'] = pd.to_datetime(filings['Date'])
    filings = filings.set_index('Date')

    # 'Date' is the index name of both frames here; reset_index afterwards
    # turns it back into a regular column.
    merged = pd.merge(prices, filings, on='Date', how=f'{how}')
    return merged.reset_index(level=0)

-------
### Taking a Look at the Inner Merged Dataframe:

In [12]:
# Inner merge: keeps only the dates present in both the stock and SEC data.
df_inner_merged = merged_data_importer(company_name, 'inner')
# Preview the first five rows.
df_inner_merged.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,...,PX14A6G,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD
0,1994-01-26,33.75,34.0,33.25,33.5,1480400.0,0.0,1.0,1.057482,1.065315,...,0,0,0,0,0,0,0,0,0,0
1,1994-01-26,33.75,34.0,33.25,33.5,1480400.0,0.0,1.0,1.057482,1.065315,...,0,0,0,0,0,0,0,0,0,0
2,1994-02-10,36.25,37.5,36.0,36.5,2696700.0,0.0,1.0,1.139548,1.178843,...,0,0,0,0,0,0,1,0,0,0
3,1994-02-17,37.25,37.88,36.25,37.0,1296000.0,0.0,1.0,1.170984,1.190788,...,0,0,0,0,0,0,1,0,0,0
4,1994-02-18,36.5,37.0,36.25,36.25,1331000.0,0.0,1.0,1.147407,1.163125,...,0,0,0,0,0,0,1,0,0,0


### Saving the Inner Merged Data: 

In [13]:
# df_inner_merged.to_csv(f'../stocks/data/{company_name}_wSEC_Inner.csv', index=False)

--------

### Taking a Look at the Outer Merged Dataframe:

In [14]:
# Outer merge: keeps every date from either source; columns from the source
# that has no row for a date are left as NaN.
df_outer_merged = merged_data_importer(company_name, 'outer')
# Preview the first three rows.
df_outer_merged.head(3)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,...,PX14A6G,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD
0,1980-12-12,28.75,28.87,28.75,28.75,2093900.0,0.0,1.0,0.422706,0.42447,...,,,,,,,,,,
1,1980-12-15,27.38,27.38,27.25,27.25,785200.0,0.0,1.0,0.402563,0.402563,...,,,,,,,,,,
2,1980-12-16,25.37,25.37,25.25,25.25,472000.0,0.0,1.0,0.37301,0.37301,...,,,,,,,,,,


-----
<a class="anchor" id="Nulls"></a>

# Filling the Null Values with Zeros:

In [15]:
# Replace every NaN in the outer-merged frame with 0, in place. The fill is
# applied to all columns, not only the SEC filing columns.
# NOTE(review): this presumably also zeroes the NaNs at the start of the
# engineered columns (moving-average warm-up, first-row changes) - confirm
# that is intended.
df_outer_merged.fillna(value=0, inplace=True)
# Preview the first three rows.
df_outer_merged.head(3)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Ex_Dividend,Split_Ratio,Adj_Open,Adj_High,...,PX14A6G,S-3,S-3ASR,S-4,S-8,SC 13D,SC 13G,SC TO-I,SD,UPLOAD
0,1980-12-12,28.75,28.87,28.75,28.75,2093900.0,0.0,1.0,0.422706,0.42447,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1980-12-15,27.38,27.38,27.25,27.25,785200.0,0.0,1.0,0.402563,0.402563,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1980-12-16,25.37,25.37,25.25,25.25,472000.0,0.0,1.0,0.37301,0.37301,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Saving the Outer Merged Data:

In [16]:
# df_outer_merged.to_csv(f'../stocks/data/{company_name}_wSEC_Outer.csv', index=False)