# Intro:

This notebook is the second in the series for the Capstone Project. "Capstone_Data-Prep" covers pulling in several
data sources, trimming them to the needed observations/variables, joining them together, and producing the source data for this
notebook.

In [117]:
# Standard Libraries
import pandas as pd
import numpy as np
import os
import warnings
import json
import math

# Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns

#sklearn
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

#sklearn missing imputer:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor


# Set theme.
# matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8' and later
# removed the old name, so prefer the new name whenever this matplotlib provides it and
# fall back to the legacy name on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_theme(style="whitegrid")
# NOTE(review): this silences every warning for the rest of the notebook, including
# deprecation and convergence warnings from pandas / scikit-learn.
warnings.filterwarnings('ignore')

In [118]:
# Set Paths
# Build the folders with os.path.join so the separator matches the host OS instead of
# hard-coding Windows backslashes. The trailing '' component keeps the trailing
# separator, because file names are appended with plain string concatenation.
path = os.getcwd()
sourceDataPath = os.path.join(path, 'CleanData', '')
exportPath = os.path.join(path, 'Export', '')

print("Source Data:", sourceDataPath)
print("Exports:", exportPath)

Source Data: C:\Users\TheCu\OneDrive\Documents\Grad-School-Docs\CapstoneProject\Repo\CleanData\
Exports: C:\Users\TheCu\OneDrive\Documents\Grad-School-Docs\CapstoneProject\Repo\Export\


# Reusable Functions:

In [119]:
# Takes a DF and produces a report of null values
def nullAnalysis(df):
    """Summarise missing values for every column of ``df``.

    Prints three counts (all variables, variables without nulls, variables
    with nulls) and returns one row per variable that has at least one null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns: Variable, Observations, Nulls, Null_Per (percentage, rounded
        to 3 decimals), Num_Unique, Type, MedVal, MinVal, MaxVal. Only
        variables with nulls are kept, sorted by Nulls descending. The index
        is the position of the variable in ``df``.
    """
    summaryCols = ["Variable", "Observations", "Nulls", "Null_Per",
                   "Num_Unique", "Type", "MedVal", "MinVal", "MaxVal"]

    # Collect one record per column and build the frame once, rather than
    # enlarging a DataFrame row by row.
    records = []
    for var in df:
        series = df[var]
        countOfObs = len(series)
        countOfNull = series.isnull().sum()
        perOfNull = round((100 * countOfNull) / countOfObs, 3)

        # A median is not defined for every dtype (e.g. strings); report NaN
        # for those columns instead of aborting the whole report.
        try:
            medVal = series.median()
        except (TypeError, ValueError):
            medVal = float("nan")

        records.append([var, countOfObs, countOfNull, perOfNull, series.nunique(),
                        series.dtypes, medVal, series.min(), series.max()])

    null_summaryDF = pd.DataFrame(records, columns=summaryCols)

    # Get Summary Stats
    hasNulls = null_summaryDF['Nulls'] != 0
    print("Count of Variables:", len(null_summaryDF))
    print("Count of Variables without nulls:", (null_summaryDF['Nulls'] == 0).sum())
    print("Count of Variables with null :", hasNulls.sum())

    # Format Summary: only variables with nulls, most nulls first
    summaryDF = null_summaryDF[hasNulls].sort_values(by=['Nulls'], ascending=False)
    return summaryDF

In [120]:
# apply the z-score method in Pandas using the .mean() and .std() methods
def z_score(df, scaleList):
    """Return a copy of ``df`` with the columns in ``scaleList`` standardised.

    Each listed column becomes ``(x - mean) / std`` using pandas' ``.mean()``
    and ``.std()``, rounded to 5 decimals. Columns not listed are copied
    through unchanged, and ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    scaleList : iterable of column labels
        Columns to standardise.

    Returns
    -------
    pandas.DataFrame
        Scaled copy of ``df``.
    """
    # copy the dataframe so the caller's frame is left untouched
    df_std = df.copy()
    for column in scaleList:
        values = df_std[column]
        spread = values.std()
        # A constant column has std == 0; dividing would turn every value into
        # NaN (0/0). Divide by 1 instead so the centred column is all zeros.
        if spread == 0:
            spread = 1.0
        df_std[column] = ((values - values.mean()) / spread).round(5)
    return df_std

# Get and Prep Data

In [121]:
# Load the prepared data set and drop the 'Unnamed: 0' column that comes in with the CSV
# (presumably the saved row index -- confirm against the data-prep notebook)
file = 'completeDF.csv'
origDF = (
    pd.read_csv(sourceDataPath + file, skipinitialspace=True)
      .drop(columns=['Unnamed: 0'])
)
origDF


Unnamed: 0,gvkey,tic,curncd,exesign,currtr,src,auop,gsubind,aoloch,aoloch_std,...,st_volatility,sec_ajexm,sec_ajpm,sec_trfm_mean,sec_trfm_std,sec_trt1m_mean,sec_trt1m_std,rat_spcsrc,lawsuit,SettlementAmount
0,1239,ACV,USD,1,1.000000,5,2,30302010,1.9000,,...,0.14,1.000000,1.000000,6.001949,0.026279,1.631413,6.320251,6.0,1.0,0.0
1,1266,ALCO,USD,1,1.000000,5,1,30202010,1.2535,1.830697,...,0.27,1.000000,1.000000,1.803951,0.017539,0.991502,9.322428,3.0,0.0,
2,1408,BEAM,USD,1,1.000000,8,1,30201020,-45.6000,97.666281,...,0.89,1.000000,1.000000,19.151070,2.316089,2.017711,6.373380,4.0,0.0,
3,1429,2388B,USD,1,1.000000,3,1,30202030,24.1015,58.012527,...,,,,,,,,0.0,0.0,
4,1659,ANDE,USD,1,1.000000,5,1,30101020,2.3535,175.842783,...,0.29,1.453704,1.453704,1.274316,0.020235,2.559811,9.131795,6.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,221545,IBA,MXN,1,0.076348,53,2,30202030,-12.6520,25.701164,...,0.32,1.000000,1.000000,1.756264,0.051788,1.996371,6.788384,0.0,0.0,
344,222519,GMK,MXN,1,0.076348,53,2,30202030,84.9260,90.800682,...,0.70,1.000000,1.000000,1.204377,0.000000,4.203464,11.024610,0.0,0.0,
345,241637,BUD,USD,1,1.000000,53,1,30201010,524.5000,318.249902,...,0.28,1.000000,1.000000,1.043322,0.030272,1.827778,5.822912,0.0,0.0,
346,264393,IVFH,USD,1,1.000000,5,1,30101020,-0.1110,0.151167,...,1.49,0.473704,0.473704,1.000000,0.000000,6.590749,28.817567,0.0,0.0,


In [122]:
# Keep the identifier / currency columns in their own frame: handy for look-ups and for
# interpreting results, but not used for modeling
orig_lookUps = origDF.loc[:, ['gvkey','tic','curncd','currtr']].copy()

# Drop the look-up-only columns from origDF (gvkey stays behind as the key)
origDF = origDF.drop(['tic','curncd','currtr'], axis=1)
orig_lookUps



Unnamed: 0,gvkey,tic,curncd,currtr
0,1239,ACV,USD,1.000000
1,1266,ALCO,USD,1.000000
2,1408,BEAM,USD,1.000000
3,1429,2388B,USD,1.000000
4,1659,ANDE,USD,1.000000
...,...,...,...,...
343,221545,IBA,MXN,0.076348
344,222519,GMK,MXN,0.076348
345,241637,BUD,USD,1.000000
346,264393,IVFH,USD,1.000000


In [123]:
# Scale Data -- build the list of columns to scale

## Columns that are left unscaled
scaleException = [
    'gvkey', 'gsubind', 'exesign', 'src', 'auop',
    'rest_count', 'rest_count_of_diffs', 'rest_a_count_of_diffs',
    'st_per_growth', 'st_per_currentToMax', 'st_per_lowToStart', 'st_volatility',
    'rat_spcsrc',
    'lawsuit', 'SettlementAmount',
    'invch',
]

## Start from every column and strike out the exceptions
## (list.remove raises ValueError if an exception is not a column of origDF)
orig_scaleList = list(origDF.columns)
for col in scaleException:
    orig_scaleList.remove(col)

# Scale Data -- z-score the remaining columns
origDF_scaled = z_score(origDF, orig_scaleList)
origDF_scaled

Unnamed: 0,gvkey,exesign,src,auop,gsubind,aoloch,aoloch_std,at,at_std,bkvlps,...,st_volatility,sec_ajexm,sec_ajpm,sec_trfm_mean,sec_trfm_std,sec_trt1m_mean,sec_trt1m_std,rat_spcsrc,lawsuit,SettlementAmount
0,1239,1,5,2,30302010,-0.04005,,-0.24066,,-0.07855,...,0.14,-0.12927,-0.12920,1.16874,-0.18568,-0.12335,-0.14995,6.0,1.0,0.0
1,1266,1,5,1,30202010,-0.04529,-0.39016,-0.32871,-0.37853,-0.07853,...,0.27,-0.12927,-0.12920,-0.06238,-0.19987,-0.12808,-0.14500,3.0,0.0,
2,1408,1,8,1,30201020,-0.42531,0.12443,0.10983,0.83463,-0.07843,...,0.89,-0.12927,-0.12920,5.02489,3.53195,-0.12050,-0.14987,4.0,0.0,
3,1429,1,3,1,30202030,0.14002,-0.08850,-0.29217,-0.34639,,...,,,,,,,,0.0,0.0,
4,1659,1,5,1,30101020,-0.03637,0.54420,-0.23650,-0.22382,-0.07842,...,0.29,0.21630,0.21636,-0.21770,-0.19550,-0.11650,-0.14531,6.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,221545,1,53,2,30202030,-0.15808,-0.26199,-0.23758,-0.23000,-0.07844,...,0.32,-0.12927,-0.12920,-0.07636,-0.14427,-0.12066,-0.14918,0.0,0.0,
344,222519,1,53,2,30202030,0.63334,0.08756,-0.17066,-0.21925,-0.07860,...,0.70,-0.12927,-0.12920,-0.23821,-0.22835,-0.10436,-0.14218,0.0,0.0,
345,241637,1,53,1,30201010,4.19856,1.30885,5.82959,6.74613,-0.07847,...,0.28,-0.12927,-0.12920,-0.28544,-0.17920,-0.12190,-0.15077,0.0,0.0,
346,264393,1,5,1,30101020,-0.05636,-0.39918,-0.33830,-0.38165,-0.07865,...,1.49,-0.53012,-0.53005,-0.29814,-0.22835,-0.08673,-0.11280,0.0,0.0,


In [124]:
# Null report for the scaled data: prints the variable counts and keeps the
# per-variable summary of columns that contain nulls
nullAnalysisDF = nullAnalysis(origDF_scaled)
nullAnalysisDF

Count of Variables: 164
Count of Variables without nulls: 42
Count of Variables with null : 122


Unnamed: 0,Variable,Observations,Nulls,Null_Per,Num_Unique,Type,MedVal,MinVal,MaxVal
163,SettlementAmount,348,302,86.782,12,float64,0.000000,0,4.75e+07
138,xacc_std,348,83,23.851,252,float64,-0.352410,-0.38644,7.48606
62,esubc_std,348,73,20.977,70,float64,-0.241590,-0.24159,6.72302
137,xacc,348,50,14.368,263,float64,-0.287175,-0.29895,11.7714
76,intpn_std,348,49,14.080,264,float64,-0.313340,-0.33014,11.4846
...,...,...,...,...,...,...,...,...,...
111,pstk,348,1,0.287,48,float64,-0.104900,-1.20429,16.9075
107,ppegt,348,1,0.287,279,float64,-0.265400,-0.28037,14.961
45,dpact,348,1,0.287,280,float64,-0.319190,-0.33252,11.9431
15,ceq,348,1,0.287,326,float64,-0.291460,-0.60197,8.63945


In [125]:
# Imputer TODO:
## - Determine a method to test how well it did
## - Tune params
## - Create visuals to explain what it did

## One option: https://scikit-learn.org/stable/modules/impute.html#impute

## Work on a copy of the scaled data; park the response variables (with gvkey as the
## key) in a temp frame so they can be rejoined after the imputer runs
completeDF_Imputed = origDF_scaled.copy()
temp_Imputed_Resp = completeDF_Imputed.loc[:, ['gvkey','lawsuit','SettlementAmount']]
completeDF_Imputed = completeDF_Imputed.drop(['lawsuit','SettlementAmount'], axis=1)
temp_Imputed_Resp

Unnamed: 0,gvkey,lawsuit,SettlementAmount
0,1239,1.0,0.0
1,1266,0.0,
2,1408,0.0,
3,1429,0.0,
4,1659,0.0,
...,...,...,...
343,221545,0.0,
344,222519,0.0,
345,241637,0.0,
346,264393,0.0,


In [None]:
print("Starting Imputer...")
# Introduce IterativeImputer with an estimator
## Used as a guide, which states that ExtraTreesRegressor performs best:
## https://towardsdatascience.com/going-beyond-the-simpleimputer-for-missing-data-imputation-dd8ba168d505
imp = IterativeImputer(estimator=ExtraTreesRegressor(), max_iter=10, random_state=42)

# Fit and impute in a single pass. A separate fit() followed by transform() on the same
# data runs the whole imputation sequence twice, which doubles the run time of this cell.
imputedValues = imp.fit_transform(completeDF_Imputed)

# Rebuild the frame with the original column labels AND row index so the rows stay
# aligned with the response frame that was set aside earlier.
completeDF_Imputed = pd.DataFrame(imputedValues,
                                  columns=completeDF_Imputed.columns,
                                  index=completeDF_Imputed.index)
print("Done w/ Imputer")

# Rejoin Temp -- align on the row index instead of merging on gvkey: the imputer returns
# gvkey as float, and a key merge would multiply rows if a gvkey ever appeared twice.
completeDF_Imputed = completeDF_Imputed.join(temp_Imputed_Resp.drop(columns=['gvkey']))
completeDF_Imputed.shape

Starting Imputer...


In [None]:
# Null report for the scaled data again, this time ordered by the largest MaxVal first
nullAnalysisDF_scale = nullAnalysis(origDF_scaled).sort_values(by='MaxVal', ascending=False)
nullAnalysisDF_scale

In [None]:
# Re-run the null report on the imputed data; the returned frame lists only the
# variables that still contain nulls (this overwrites the earlier nullAnalysisDF)
nullAnalysisDF = nullAnalysis(completeDF_Imputed)
nullAnalysisDF

In [None]:
# Display the imputed data set with the response columns rejoined
completeDF_Imputed

In [None]:
# Exclude:
# NOTE(review): the bare string below is only a note; as the cell's last expression it is
# simply echoed back as output -- it does not exclude anything from any frame.
'SettlementAmount'

# Fix at Source:

