# Intro

This is the main notebook for the Capstone Project course.

In [197]:
# Standard Libraries
import pandas as pd
import numpy as np
import os
import warnings
import json

# Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Set theme.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and later
# releases removed the old "seaborn" name, so use whichever one this install provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_theme(style="whitegrid")
# NOTE: this silences every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

In [198]:
# Set Paths
# Build the folders with os.path.join so the separators match the host OS. The empty
# final component keeps the trailing separator, because other cells append a file
# name directly onto these strings.
path = os.getcwd()
sourceDataPath = os.path.join(path, 'SourceData', '')
exportPath = os.path.join(path, 'Export', '')
trimmedCSVPath = os.path.join(path, 'SourceData', '')  # trimmed CSVs live next to the source data
print("Source Data:",sourceDataPath)
print("Exports:",exportPath)
print("Trimmed CSV:",trimmedCSVPath)

Source Data: C:\Users\TheCu\OneDrive\Documents\Grad-School-Docs\CapstoneProject\Repo\SourceData\
Exports: C:\Users\TheCu\OneDrive\Documents\Grad-School-Docs\CapstoneProject\Repo\Export\
Trimmed CSV: C:\Users\TheCu\OneDrive\Documents\Grad-School-Docs\CapstoneProject\Repo\SourceData\


# Reusable Functions

In [199]:
# Create Large CSV Trim Function

def largeCSVTrim(largeCSVPath, CSVName, trimQuery, chunksize=10000):
    """Read a large CSV in chunks and keep only the rows that match a query.

    Parameters
    ----------
    largeCSVPath : str
        Folder that contains the CSV; a trailing separator is optional.
    CSVName : str
        File name of the CSV inside ``largeCSVPath``.
    trimQuery : str
        Expression handed to ``DataFrame.query`` for every chunk,
        e.g. ``"tic == 'SAM'"``.
    chunksize : int, default 10000
        Number of rows read per chunk.

    Returns
    -------
    pandas.DataFrame
        The matching rows from all chunks, re-indexed from 0.
    """
    csvFile = os.path.join(largeCSVPath, CSVName)
    # The chunked reader keeps the file open while iterating; the ``with`` block
    # closes it even when a chunk fails to parse or the query raises.
    with pd.read_csv(csvFile, chunksize=chunksize) as reader:
        df = pd.concat((chunk.query(trimQuery) for chunk in reader), ignore_index=True)
    return df

In [200]:
# Takes a DF and Produces a Report of Null Values
def nullAnalysis(df):
    """Summarise missing values per column and return the columns that have any.

    Prints the number of columns, how many have no nulls and how many have at
    least one null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that contains at least one null, sorted by
        null count (descending). Columns are ``Variable``, ``Observations``,
        ``Nulls``, ``Null_Per`` (percentage, 3 decimals), ``Num_Unique`` and
        ``Type``. The index is the column's position in ``df``.
    """
    summaryCols = ["Variable","Observations","Nulls","Null_Per","Num_Unique","Type"]
    countOfObs = len(df)
    rows = []
    for var, values in df.items():
        countOfNull = int(values.isnull().sum())
        # Guard the percentage so a frame with zero rows does not divide by zero.
        perOfNull = round((100 * countOfNull) / countOfObs, 3) if countOfObs else 0.0
        rows.append([var, countOfObs, countOfNull, perOfNull, values.nunique(), values.dtype])
    # Build the summary in one go; enlarging a frame row by row with .loc copies
    # the data on every insert.
    null_summaryDF = pd.DataFrame(rows, columns=summaryCols)
    # Get Summary Stats
    countofVars = len(null_summaryDF)
    countofNotNull = (null_summaryDF['Nulls'] == 0).sum()
    countofNull = (null_summaryDF['Nulls'] != 0).sum()
    print("Count of Variables:",countofVars)
    print("Count of Variables without nulls:",countofNotNull)
    print("Count of Variables with null :",countofNull)
    # Format Summary: only the columns that actually have nulls, worst first
    summaryDF = null_summaryDF[null_summaryDF['Nulls'] != 0].sort_values(by=['Nulls'],ascending=False)
    return summaryDF

In [201]:
# Drop columns whose share of missing values exceeds `thresh` -- prints which variables are dropped
def dropCols_nullThresh(df,thresh):
    """Drop the columns of ``df`` that have too many missing values.

    Prints how many columns were dropped and their names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to reduce; the caller's object is not modified.
    thresh : float
        Largest tolerated fraction of nulls per column (0.30 means "drop the
        columns that are more than 30% null").

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the columns that met the threshold.
    """
    # A column survives when at least (1 - thresh) of its rows are populated.
    # Counts are whole numbers, so rounding the requirement up keeps the same cut-off.
    minNonNull = int(np.ceil(df.shape[0] * (1 - thresh)))
    column_names_before = df.columns.to_list()
    # Only `thresh` is passed: current pandas raises TypeError when `how` and
    # `thresh` are given together.
    df = df.dropna(thresh=minNonNull, axis=1)
    kept = set(df.columns)
    columns_dropped = [name for name in column_names_before if name not in kept]
    total_dropped = len(column_names_before) - df.shape[1]
    print(f"dropped: {total_dropped}")
    print(f"columns that where dropped where: {columns_dropped}")
    return df

In [202]:
# Drop near-constant columns (at most `thresh` distinct values) -- prints which variables are dropped
def dropCols_uniqueThresh(df,thresh):
    """Drop the columns of ``df`` that hold too few distinct values.

    Prints how many columns were dropped and their names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to reduce; the caller's object is not modified.
    thresh : int
        Largest number of distinct non-null values for which a column is still
        dropped. ``thresh=1`` removes constant columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns.
    """
    uniqueCounts = df.nunique()
    # Entirely-null columns (0 distinct values) are not dropped here; only columns
    # with between 1 and `thresh` distinct values are removed.
    dropMask = ((uniqueCounts >= 1) & (uniqueCounts <= thresh)).to_numpy()
    columns_dropped = df.columns[dropMask].to_list()
    df = df.loc[:, ~dropMask]
    total_dropped = len(columns_dropped)
    print(f"dropped: {total_dropped}")
    print(f"columns that where dropped where: {columns_dropped}")
    return df

# Source Data

## Prep -- Trim Large CSV
(Note: Creates trimmed versions of the large CSVs for import.
These cells only need to be run once; afterwards, comment them out to save time.)

In [203]:
# Trim Securities_Full
# Keeps only the rows of Securities_Full.csv where tic == 'SAM' and exports them
# as Securities_SAM.csv into trimmedCSVPath.
# NOTE(review): largeCSVPath is an absolute path on one machine; adjust it before running elsewhere.
largeCSVPath = 'C:/Users/TheCu/OneDrive/Documents/Grad-School-Docs/CapstoneProject/SourceData_Orig/'
CSVName = 'Securities_Full.csv'
trimQuery = "tic == 'SAM'"

print("Starting...")
# The trim/export/preview lines are commented out after the first run; re-enable them to rebuild the trimmed CSV.
# Securities_SAM = largeCSVTrim(largeCSVPath,CSVName,trimQuery) # Get Trimmed Data
# Securities_SAM.to_csv(trimmedCSVPath+'Securities_SAM.csv') # Export Trimmed Data

#View Trimmed Dataframe
print("Securities_SAM:")
# Securities_SAM.sample(3)

Starting...
Securities_SAM:


In [204]:
# Trim Fundamentals_Full
# Keeps only the rows of Fundamentals_Full.csv where gsector == 30 and exports them
# as Fundamentals_30.csv into trimmedCSVPath.
# NOTE(review): largeCSVPath is an absolute path on one machine; adjust it before running elsewhere.
largeCSVPath = 'C:/Users/TheCu/OneDrive/Documents/Grad-School-Docs/CapstoneProject/SourceData_Orig/'
CSVName = 'Fundamentals_Full.csv'
trimQuery = "gsector == 30"

print("Starting...")
# The trim/export/preview lines are commented out after the first run; re-enable them to rebuild the trimmed CSV.
#Fundamentals = largeCSVTrim(largeCSVPath,CSVName,trimQuery) # Get Trimmed Data
#Fundamentals.to_csv(trimmedCSVPath+'Fundamentals_30.csv') # Export Trimmed Data

#View Trimmed Dataframe
# NOTE(review): the label says "Fundamentals_SAM" but this cell filters on gsector == 30, not on the SAM ticker -- confirm the intended label.
print("Fundamentals_SAM:")
#Fundamentals.sample(3)

Starting...
Fundamentals_SAM:


In [205]:
# Trim Stocks_DS
# Keeps only the rows of Stocks_DS.csv where tic == 'SAM' and exports them
# as Stocks_SAM.csv into trimmedCSVPath.
# NOTE(review): largeCSVPath is an absolute path on one machine; adjust it before running elsewhere.
largeCSVPath = 'C:/Users/TheCu/OneDrive/Documents/Grad-School-Docs/CapstoneProject/SourceData_Orig/'
CSVName = 'Stocks_DS.csv'
trimQuery = "tic == 'SAM'"

print("Starting...")
# The trim/export/preview lines are commented out after the first run; re-enable them to rebuild the trimmed CSV.
# Stocks_SAM = largeCSVTrim(largeCSVPath,CSVName,trimQuery) # Get Trimmed Data
# Stocks_SAM.to_csv(trimmedCSVPath+'Stocks_SAM.csv') # Export Trimmed Data

#View Trimmed Dataframe
print("Stocks_SAM:")
# Stocks_SAM.sample(3)

Starting...
Stocks_SAM:


## Get Source Data
(Note: Loads the trimmed CSVs created in the Prep step above.
The Prep cells only need to be run once; these cells read the saved files.)

In [206]:
# Get Fundamentals
# Reads the trimmed fundamentals file (gsector == 30 subset) from sourceDataPath.
file = 'Fundamentals_30.csv'
# skipinitialspace=True makes the parser skip spaces that follow a delimiter.
fundamentals = pd.read_csv(sourceDataPath+file,skipinitialspace = True)
# Preview the first rows only.
fundamentals.head()

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,conm,acctchg,...,priusa,sic,spcindcd,spcseccd,spcsrc,state,stko,weburl,dldte,ipodate
0,1659,12/31/2010,2010,INDL,C,D,SUMM_STD,ANDE,ANDERSONS INC,,...,1,5150,112.0,970.0,A-,OH,0,www.andersonsinc.com,,
1,1659,12/31/2010,2010,INDL,C,D,STD,ANDE,ANDERSONS INC,,...,1,5150,112.0,970.0,A-,OH,0,www.andersonsinc.com,,
2,1659,12/31/2011,2011,INDL,C,D,SUMM_STD,ANDE,ANDERSONS INC,,...,1,5150,112.0,970.0,A-,OH,0,www.andersonsinc.com,,
3,1659,12/31/2011,2011,INDL,C,D,STD,ANDE,ANDERSONS INC,,...,1,5150,112.0,970.0,A-,OH,0,www.andersonsinc.com,,
4,1659,12/31/2012,2012,INDL,C,D,SUMM_STD,ANDE,ANDERSONS INC,,...,1,5150,112.0,970.0,A-,OH,0,www.andersonsinc.com,,


In [207]:
# Get Securities

In [208]:
# Get Stocks

In [209]:
# Get Ratings


# Data Clean-Up

## Fundamentals

In [210]:
# Initial Column Reduce -- Drop Obvious -- Nulls
# Run Initial -- Null Report
# nullAnalysis prints the column counts and returns only the columns that contain
# at least one null, sorted by null count (highest first).
fundamentals_nullReport = nullAnalysis(fundamentals)
fundamentals_nullReport

Count of Variables: 1768
Count of Variables without nulls: 35
Count of Variables with null : 1733


Unnamed: 0,Variable,Observations,Nulls,Null_Per,Num_Unique,Type
880,xstfo,282,282,100.000,0,float64
1342,iaeqmi_dc,282,282,100.000,0,float64
731,tstkme,282,282,100.000,0,float64
1355,idilc_dc,282,282,100.000,0,float64
1354,idilb_dc,282,282,100.000,0,float64
...,...,...,...,...,...,...
356,ibmii,282,1,0.355,171,float64
92,at,282,1,0.355,161,float64
673,sale,282,1,0.355,179,float64
717,teq,282,1,0.355,153,float64


In [211]:
# Initial Column Reduce -- Drop Obvious -- Nulls
# Initial Null Drop -- More than 75% -- Will Re-Run w/ lower threshold after rows combined
# NOTE: this rebinds `fundamentals` to the reduced frame, so re-running the cell
# operates on the already-reduced data rather than on the frame loaded from CSV.
fundamentals = dropCols_nullThresh(fundamentals,0.75)
fundamentals

dropped: 1348
columns that where dropped where: ['acctchg', 'acqmeth', 'adrr', 'bspr', 'compst', 'curuscn', 'ogm', 'stalt', 'udpl', 'pdate', 'acco', 'acoxar', 'acqao', 'acqcshi', 'acqgdwl', 'acqic', 'acqintan', 'acqinvt', 'acqlntal', 'acqniintc', 'acqppe', 'acqsc', 'adpac', 'aedi', 'afudcc', 'afudci', 'amc', 'amdc', 'amgw', 'apb', 'apc', 'apofs', 'aqa', 'aqd', 'aqeps', 'aqi', 'aqp', 'aqs', 'arb', 'arc', 'arce', 'arced', 'arceeps', 'artfs', 'autxr', 'balr', 'banlr', 'bast', 'bastr', 'batr', 'bcef', 'bclr', 'bcltbl', 'bcnlr', 'bcrbl', 'bct', 'bctbl', 'bctr', 'bltbl', 'ca', 'capr1', 'capr2', 'capr3', 'cb', 'cbi', 'cdpac', 'cdvc', 'cfbd', 'cfere', 'cfo', 'cfpdo', 'cga', 'cgri', 'cgti', 'cgui', 'chs', 'clfc', 'clfx', 'clg', 'clis', 'cll', 'cllc', 'clo', 'clrll', 'clt', 'cmp', 'cnltbl', 'cpcbl', 'cpdoi', 'cpnli', 'cppbl', 'cprei', 'crv', 'crvnli', 'cshrc', 'cshrp', 'cshrso', 'cshrt', 'cshrw', 'dbi', 'dcs', 'depc', 'dfpac', 'dfs', 'dfxa', 'dltsub', 'dpacb', 'dpacc', 'dpacli', 'dpacls', 'dpacm

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,conm,acctstd,...,phone,priusa,sic,spcindcd,spcseccd,spcsrc,state,stko,weburl,ipodate
0,1659,12/31/2010,2010,INDL,C,D,SUMM_STD,ANDE,ANDERSONS INC,,...,419-893-5050,1,5150,112.0,970.0,A-,OH,0,www.andersonsinc.com,
1,1659,12/31/2010,2010,INDL,C,D,STD,ANDE,ANDERSONS INC,DS,...,419-893-5050,1,5150,112.0,970.0,A-,OH,0,www.andersonsinc.com,
2,1659,12/31/2011,2011,INDL,C,D,SUMM_STD,ANDE,ANDERSONS INC,,...,419-893-5050,1,5150,112.0,970.0,A-,OH,0,www.andersonsinc.com,
3,1659,12/31/2011,2011,INDL,C,D,STD,ANDE,ANDERSONS INC,DS,...,419-893-5050,1,5150,112.0,970.0,A-,OH,0,www.andersonsinc.com,
4,1659,12/31/2012,2012,INDL,C,D,SUMM_STD,ANDE,ANDERSONS INC,,...,419-893-5050,1,5150,112.0,970.0,A-,OH,0,www.andersonsinc.com,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,66073,12/31/2010,2010,INDL,C,D,STD,PHLI,PACIFICHEALTH LABORATORIES,DS,...,732-739-2900,1,2833,280.0,905.0,C,NJ,3,www.pacifichealthlabs.com,12/19/1997
278,66073,12/31/2011,2011,INDL,C,D,SUMM_STD,PHLI,PACIFICHEALTH LABORATORIES,,...,732-739-2900,1,2833,280.0,905.0,C,NJ,3,www.pacifichealthlabs.com,12/19/1997
279,277487,6/30/2011,2011,INDL,C,D,STD,LND,BRASILAGRO CIA BRAS DE PROP,DI,...,55 11 3035 5350,90,100,,,,,0,www.brasil-agro.com,5/2/2006
280,277487,6/30/2012,2012,INDL,C,D,STD,LND,BRASILAGRO CIA BRAS DE PROP,DI,...,55 11 3035 5350,90,100,,,,,0,www.brasil-agro.com,5/2/2006


In [212]:
# Initial Column Reduce -- Drop Obvious -- Nulls
# Re-Run Null Report
# Same report as before, now computed on the frame left after the 75% null drop.
fundamentals_nullReport = nullAnalysis(fundamentals)
fundamentals_nullReport

Count of Variables: 420
Count of Variables without nulls: 35
Count of Variables with null : 385


Unnamed: 0,Variable,Observations,Nulls,Null_Per,Num_Unique,Type
218,optfvgr,282,208,73.759,73,float64
289,tfvce,282,208,73.759,2,float64
380,oprepsx_fn,282,206,73.050,3,object
225,optprcex,282,203,71.986,78,float64
233,pifo,282,203,71.986,79,float64
...,...,...,...,...,...,...
273,reuna,282,1,0.355,155,float64
157,ibmii,282,1,0.355,171,float64
276,seq,282,1,0.355,155,float64
275,sale,282,1,0.355,179,float64


In [213]:
# Initial Column Reduce -- Drop Obvious -- Cols w/ 1 Value
# Count the distinct values in every column to spot the ones that never vary.

fundamentals.nunique()

gvkey       39
datadate    35
fyear        5
indfmt       1
consol       1
            ..
spcsrc       8
state       15
stko         3
weburl      39
ipodate     12
Length: 420, dtype: int64

In [214]:
# Spot-check one column that nunique() reported as having a single distinct value.
fundamentals['indfmt'].value_counts()

INDL    282
Name: indfmt, dtype: int64

In [215]:
# Drop the columns that hold a single distinct value (such as indfmt).
# NOTE: this rebinds `fundamentals` to the reduced frame, so re-running the cell
# operates on the already-reduced data.
fundamentals = dropCols_uniqueThresh(fundamentals,1)
fundamentals



dropped: 42
columns that where dropped where: ['indfmt', 'consol', 'popsrc', 'curcd', 'final', 'ltcm', 'pddur', 'scf', 'upd', 'acchg', 'aldo', 'aocisecgl', 'drlt', 'esopdlt', 'esopnr', 'esopr', 'esopt', 'itcb', 'mib', 'rdip', 'rdipa', 'rdipd', 'rdipeps', 'tstkp', 'xi', 'xintopt', 'xoptd', 'xopteps', 'rank', 'cogs_fn', 'pncad_fn', 'pncaeps_fn', 'pnca_fn', 'pncwia_fn', 'pncwid_fn', 'pncwieps_fn', 'rdipa_fn', 'rdipd_fn', 'rdipeps_fn', 'spce_fn', 'stkco_fn', 'gsector']


Unnamed: 0,gvkey,datadate,fyear,datafmt,tic,conm,acctstd,ajex,ajp,curncd,...,phone,priusa,sic,spcindcd,spcseccd,spcsrc,state,stko,weburl,ipodate
0,1659,12/31/2010,2010,SUMM_STD,ANDE,ANDERSONS INC,,,,,...,419-893-5050,1,5150,112.0,970.0,A-,OH,0,www.andersonsinc.com,
1,1659,12/31/2010,2010,STD,ANDE,ANDERSONS INC,DS,1.5,1.5,USD,...,419-893-5050,1,5150,112.0,970.0,A-,OH,0,www.andersonsinc.com,
2,1659,12/31/2011,2011,SUMM_STD,ANDE,ANDERSONS INC,,,,,...,419-893-5050,1,5150,112.0,970.0,A-,OH,0,www.andersonsinc.com,
3,1659,12/31/2011,2011,STD,ANDE,ANDERSONS INC,DS,1.5,1.5,USD,...,419-893-5050,1,5150,112.0,970.0,A-,OH,0,www.andersonsinc.com,
4,1659,12/31/2012,2012,SUMM_STD,ANDE,ANDERSONS INC,,,,,...,419-893-5050,1,5150,112.0,970.0,A-,OH,0,www.andersonsinc.com,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,66073,12/31/2010,2010,STD,PHLI,PACIFICHEALTH LABORATORIES,DS,1.0,1.0,USD,...,732-739-2900,1,2833,280.0,905.0,C,NJ,3,www.pacifichealthlabs.com,12/19/1997
278,66073,12/31/2011,2011,SUMM_STD,PHLI,PACIFICHEALTH LABORATORIES,,,,,...,732-739-2900,1,2833,280.0,905.0,C,NJ,3,www.pacifichealthlabs.com,12/19/1997
279,277487,6/30/2011,2011,STD,LND,BRASILAGRO CIA BRAS DE PROP,DI,1.0,1.0,BRR,...,55 11 3035 5350,90,100,,,,,0,www.brasil-agro.com,5/2/2006
280,277487,6/30/2012,2012,STD,LND,BRASILAGRO CIA BRAS DE PROP,DI,1.0,1.0,BRR,...,55 11 3035 5350,90,100,,,,,0,www.brasil-agro.com,5/2/2006
