# Intro:

This notebook is the second in the series for the Capstone Project.  "Capstone_Data-Prep" covers pulling in several
data sources, trimming to the needed observations/variables, joining them together, and producing the source data for this
notebook.

In [130]:
# Standard Libraries
import pandas as pd
import numpy as np
import os
import warnings
import json
import math

# Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns

#sklearn
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler


# set Theme
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# Matplotlib 3.8 removed the old names, so use whichever name this install has.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_theme(style="whitegrid")
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [101]:
# Set Paths
# os.path.join picks the separator of the host OS (the original hard-coded the
# Windows '\\'). The trailing '' keeps the final separator, because later cells
# build file names with plain `sourceDataPath + file` concatenation.
path = os.getcwd()
sourceDataPath = os.path.join(path, 'CleanData', '')
exportPath = os.path.join(path, 'Export', '')

print(f"Source Data: {sourceDataPath}")
print(f"Exports: {exportPath}")

Source Data: C:\Users\TheCu\OneDrive\Documents\Grad-School-Docs\CapstoneProject\Repo\CleanData\
Exports: C:\Users\TheCu\OneDrive\Documents\Grad-School-Docs\CapstoneProject\Repo\Export\


# Reusable Functions:

In [102]:
# Takes a DF and produces a report of null values
def nullAnalysis(df):
    """Summarise missing values per column of ``df``.

    Prints three counts (all columns, columns without nulls, columns with at
    least one null) and returns the per-column details for the columns that
    contain nulls.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one null, sorted by
        ``Nulls`` in descending order (ties keep the column order of ``df``).
        Columns: ``Variable`` (column label), ``Observations`` (row count),
        ``Nulls`` (null count), ``Null_Per`` (percentage of nulls, rounded to
        3 decimals), ``Num_Unique`` (``Series.nunique()``) and ``Type``
        (dtype). The index is the position of the column in ``df``.
    """
    summaryColumns = ["Variable", "Observations", "Nulls", "Null_Per", "Num_Unique", "Type"]
    countOfObs = len(df)

    # Collect one record per column and build the summary frame once; growing
    # a DataFrame row by row with .loc re-allocates on every insert.
    records = []
    for var, series in df.items():
        countOfNull = int(series.isnull().sum())
        # Guard the percentage so an empty frame reports 0 instead of 0/0.
        perOfNull = round((100 * countOfNull) / countOfObs, 3) if countOfObs else 0.0
        records.append([var, countOfObs, countOfNull, perOfNull, series.nunique(), series.dtype])
    null_summaryDF = pd.DataFrame(records, columns=summaryColumns)

    # Get Summary Stats
    hasNulls = null_summaryDF['Nulls'] != 0
    countofVars = len(null_summaryDF)
    countofNull = int(hasNulls.sum())
    countofNotNull = countofVars - countofNull
    print(f"Count of Variables: {countofVars}")
    print(f"Count of Variables without nulls: {countofNotNull}")
    print(f"Count of Variables with null : {countofNull}")

    # Format Summary: only columns with nulls, worst first. mergesort is stable,
    # so columns with the same null count come out in a reproducible order.
    summaryDF = null_summaryDF[hasNulls].sort_values(by=['Nulls'], ascending=False, kind='mergesort')
    return summaryDF

In [None]:
# Apply the z-score method with the pandas .mean() and .std() methods
def z_score(df,scaleList):
    """Return a copy of ``df`` with the columns in ``scaleList`` standardised.

    Each listed column becomes ``(x - mean) / std`` using pandas' ``.mean()``
    and ``.std()`` of that column. Columns not listed are copied unchanged and
    ``df`` itself is not modified.

    A column with at most one distinct non-null value has no spread, so the
    plain formula would divide 0 by 0 and turn the whole column into NaN.
    Such columns are set to 0 instead (existing NaNs stay NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    scaleList : iterable of column labels
        Columns to standardise; a label missing from ``df`` raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Scaled copy of ``df``.
    """
    # copy the dataframe so the caller's frame keeps its raw values
    df_std = df.copy()
    for column in scaleList:
        values = df_std[column]
        if values.nunique() <= 1:
            # No spread: emit zeros; multiplying keeps NaN where data is missing.
            df_std[column] = values * 0.0
        else:
            df_std[column] = (values - values.mean()) / values.std()
    return df_std

# Get and Prep Data

In [103]:
# Get Source Data
## Read the prepared CSV and discard its 'Unnamed: 0' column
## (presumably the index written out by the data-prep notebook -- confirm)
file = 'completeDF.csv'
origDF = pd.read_csv(
    sourceDataPath + file,
    skipinitialspace=True,
).drop(columns=['Unnamed: 0'])
origDF


Unnamed: 0,gvkey,tic,curncd,ceoso,cfoso,currtr,src,auop,aoloch,aoloch_std,...,st_volatility,sec_ajexm,sec_ajpm,sec_trfm_mean,sec_trfm_std,sec_trt1m_mean,sec_trt1m_std,rat_spcsrc,lawsuit,SettlementAmount
0,1239,ACV,USD,Y,Y,1.000000,5,4.0,2010.0,1.900000,...,0.14,1.000000,1.000000,6.001949,0.026279,1.631413,6.320251,6.0,1.0,0.0
1,1266,ALCO,USD,Y,Y,1.000000,5,1.0,2011.5,1.605750,...,0.27,1.000000,1.000000,1.803951,0.017539,0.991502,9.322428,3.0,,
2,1408,BEAM,USD,Y,Y,1.000000,8,1.0,2011.5,-34.975000,...,0.89,1.000000,1.000000,19.151070,2.316089,2.017711,6.373380,4.0,,
3,1429,2388B,USD,Y,Y,1.000000,3,1.0,2011.5,17.646250,...,,,,,,,,0.0,,
4,1659,ANDE,USD,Y,Y,1.000000,5,1.0,2011.5,48.783750,...,0.29,1.453704,1.453704,1.274316,0.020235,2.559811,9.131795,6.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,221545,IBA,MXN,Y,Y,0.076348,53,4.0,2011.5,-6.723750,...,0.32,1.000000,1.000000,1.756264,0.051788,1.996371,6.788384,0.0,,
344,222519,GMK,MXN,Y,Y,0.076348,53,4.0,2011.5,47.253500,...,0.70,1.000000,1.000000,1.204377,0.000000,4.203464,11.024610,0.0,,
345,241637,BUD,USD,Y,Y,1.000000,53,1.0,2011.5,447.500000,...,0.28,1.000000,1.000000,1.043322,0.030272,1.827778,5.822912,0.0,,
346,264393,IVFH,USD,Y,Y,1.000000,5,1.0,2011.5,-0.133000,...,1.49,0.473704,0.473704,1.000000,0.000000,6.590749,28.817567,0.0,,


In [104]:
# Keep the identifier / currency columns in their own frame: they may help when
# looking up or interpreting results, but they are not used for modeling
orig_lookUps = origDF.loc[:, ['gvkey','tic','curncd','currtr']].copy()

# Remove them from origDF ('gvkey' stays in both frames)
origDF = origDF.drop(['tic','curncd','currtr'], axis=1)
orig_lookUps


Unnamed: 0,gvkey,tic,curncd,currtr
0,1239,ACV,USD,1.000000
1,1266,ALCO,USD,1.000000
2,1408,BEAM,USD,1.000000
3,1429,2388B,USD,1.000000
4,1659,ANDE,USD,1.000000
...,...,...,...,...
343,221545,IBA,MXN,0.076348
344,222519,GMK,MXN,0.076348
345,241637,BUD,USD,1.000000
346,264393,IVFH,USD,1.000000


In [105]:
# Selective null fill: in these columns a missing value is replaced with 0
colsToFillNa = [
    'lawsuit',
    'rat_spcsrc',
    'rest_count',
    'rest_sum_diff',
    'rest_count_of_diffs',
    'rest_a_sum_diff',
    'rest_a_count_of_diffs',
]
origDF[colsToFillNa] = origDF[colsToFillNa].fillna(value=0)

In [106]:
# Report which columns of origDF still contain nulls (prints the counts and
# returns the per-column details, sorted by number of nulls)
nullAnalysisDF = nullAnalysis(origDF)
nullAnalysisDF

Count of Variables: 166
Count of Variables without nulls: 43
Count of Variables with null : 123


Unnamed: 0,Variable,Observations,Nulls,Null_Per,Num_Unique,Type
165,SettlementAmount,348,302,86.782,12,float64
141,xint,348,83,23.851,257,float64
104,opeps_std,348,73,20.977,70,float64
70,gdwl_std,348,50,14.368,285,float64
111,prca,348,49,14.080,276,float64
...,...,...,...,...,...,...
44,dp_std,348,1,0.287,347,float64
56,epsfx_std,348,1,0.287,331,float64
58,epspi_std,348,1,0.287,56,float64
69,gdwl,348,1,0.287,347,float64


In [107]:
# Executive Sign-Off
## CEO and CFO sign-off values were confirmed to always be the same,
## so keep a single column and give it a neutral name
origDF = (
    origDF
    .drop(columns=['cfoso'])
    .rename(columns={'ceoso':'exesign'})
)

# Map the flag to numeric
## Three values occur: Yes (Y), No (N), Empty (E).  Y becomes 1; N and E become 0.
origDF['exesign'] = origDF['exesign'].map({
    'Y': 1,
    'N': 0,
    'E': 0,
})
origDF['exesign'].value_counts(dropna=False)

1    327
0     21
Name: exesign, dtype: int64

In [108]:
# Auditor Opinion
## Three possible values:
##   0 = unaudited
##   1 = 'Financial statements reflect no unresolvable restrictions and auditor
##        has no significant exceptions'
##   4 = 'Auditor has expressed an unqualified opinion regarding the financial
##        statements but has added explanatory language'

## Recode so the values are consecutive integers (0, 1, 2)
origDF['auop'] = origDF['auop'].map({
    0: 0,
    1: 1,
    4: 2,
})
origDF['auop'].value_counts(dropna=False)

1    251
2     93
0      4
Name: auop, dtype: int64

In [None]:
# Fill NaN
#TODO

## One option: https://scikit-learn.org/stable/modules/impute.html#impute
## Per Amie look at gsubind

In [138]:
# Scale Data -- Create List for Scaling

## Columns that must NOT be scaled
scaleException = ['gvkey','exesign','src','auop','rest_count','rest_count_of_diffs','rest_a_count_of_diffs',
                  'st_per_growth','st_per_currentToMax','st_per_lowToStart','st_volatility','rat_spcsrc',
                  'lawsuit','SettlementAmount',
                  'invch']

## Fail early and name the offending columns. The previous list.remove() loop
## also raised ValueError here, but without saying which column was missing.
missingExceptions = [col for col in scaleException if col not in origDF.columns]
if missingExceptions:
    raise ValueError(f"scaleException columns not found in origDF: {missingExceptions}")

## Every remaining column gets scaled; frame order is preserved
orig_scaleList = [col for col in origDF.columns.tolist() if col not in scaleException]
orig_scaleList


['aoloch',
 'aoloch_std',
 'at',
 'at_std',
 'bkvlps',
 'bkvlps_std',
 'caps',
 'caps_std',
 'capx',
 'capx_std',
 'ceq',
 'ceq_std',
 'ceqt',
 'ceqt_std',
 'ch',
 'ch_std',
 'chech',
 'chech_std',
 'ci',
 'ci_std',
 'cibegni',
 'cibegni_std',
 'cogs',
 'cogs_std',
 'cshpri',
 'cshpri_std',
 'dcpstk',
 'dcpstk_std',
 'defrev',
 'defrev_std',
 'dilavx',
 'dilavx_std',
 'dlc',
 'dlc_std',
 'dltr',
 'dltr_std',
 'dltt',
 'dltt_std',
 'dp',
 'dp_std',
 'dpact',
 'dpact_std',
 'dvc',
 'dvc_std',
 'dvt',
 'dvt_std',
 'ebit',
 'ebit_std',
 'epsfi',
 'epsfi_std',
 'epsfx',
 'epsfx_std',
 'epspi',
 'epspi_std',
 'esopct',
 'esopct_std',
 'esubc',
 'esubc_std',
 'fincf',
 'fincf_std',
 'fopo',
 'fopo_std',
 'fyear',
 'fyear_std',
 'gdwl',
 'gdwl_std',
 'gp',
 'gp_std',
 'icapt',
 'icapt_std',
 'intan',
 'intan_std',
 'intpn',
 'intpn_std',
 'invch_std',
 'invt',
 'invt_std',
 'ivncf',
 'ivncf_std',
 'lct',
 'lct_std',
 'lse',
 'lse_std',
 'lt',
 'lt_std',
 'mii',
 'mii_std',
 'msa',
 'msa_std',


In [140]:
# Scale Data -- Call the z_score function
# z_score works on a copy, so origDF keeps the unscaled values and
# origDF_scaled holds the standardised version of the columns in orig_scaleList
origDF_scaled = z_score(origDF,orig_scaleList)
origDF_scaled

Unnamed: 0,gvkey,exesign,src,auop,aoloch,aoloch_std,at,at_std,bkvlps,bkvlps_std,...,st_volatility,sec_ajexm,sec_ajpm,sec_trfm_mean,sec_trfm_std,sec_trt1m_mean,sec_trt1m_std,rat_spcsrc,lawsuit,SettlementAmount
0,1239,1,5,2,-2.165979,-0.045147,-0.240872,-0.091618,-0.111907,-0.251756,...,0.14,-0.129267,-0.129203,1.161057,-0.156014,-0.123352,-0.149953,6.0,1.0,0.0
1,1266,1,5,1,0.389142,-0.047593,-0.329164,-0.091594,-0.221375,-0.265453,...,0.27,-0.129267,-0.129203,-0.069325,-0.170802,-0.128078,-0.144995,3.0,0.0,
2,1408,1,8,1,0.389142,-0.351636,0.149313,-0.091485,-0.023377,-0.079706,...,0.89,-0.129267,-0.129203,5.014906,3.718561,-0.120499,-0.149865,4.0,0.0,
3,1429,1,3,1,0.389142,0.085729,-0.293072,,,-0.206168,...,,,,,,,,0.0,0.0,
4,1659,1,5,1,0.389142,0.344530,-0.235958,-0.091487,-0.182390,-0.145044,...,0.29,0.216296,0.216358,-0.224555,-0.166241,-0.116496,-0.145310,6.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,221545,1,53,2,0.389142,-0.116824,-0.237998,-0.091511,-0.211344,-0.220378,...,0.32,-0.129267,-0.129203,-0.083302,-0.112849,-0.120657,-0.149180,0.0,0.0,
344,222519,1,53,2,0.389142,0.331811,-0.163536,-0.091668,-0.223481,-0.131400,...,0.70,-0.129267,-0.129203,-0.245053,-0.200480,-0.104358,-0.142184,0.0,0.0,
345,241637,1,53,1,0.389142,3.658484,6.074357,-0.091524,3.910234,3.385005,...,0.28,-0.129267,-0.129203,-0.292256,-0.149257,-0.121902,-0.150774,0.0,0.0,
346,264393,1,5,1,0.389142,-0.062044,-0.338837,-0.091721,-0.222324,-0.281115,...,1.49,-0.530120,-0.530054,-0.304953,-0.200480,-0.086729,-0.112799,0.0,0.0,
