# Intro:

This notebook is the second in the series for the Capstone Project.  The first notebook, "Capstone_Data-Prep", covers
pulling in several data sources, trimming them to the needed observations/variables, joining them together, and
producing the source data for this notebook.

In [83]:
# Standard Libraries
import pandas as pd
import numpy as np
import os
import warnings
import json

# Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns

#sklearn
from sklearn.preprocessing import OrdinalEncoder

# set Theme
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and later
# releases dropped the old names, so use whichever name this installation provides.
seabornStyle = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seabornStyle)
sns.set_theme(style="whitegrid")
# NOTE: this silences every warning for the rest of the notebook, including deprecation notices.
warnings.filterwarnings('ignore')


In [84]:
# Set Paths
# Build the folders with os.path.join so the notebook is not tied to Windows '\\' separators.
# The trailing '' component keeps a trailing separator on each path, so later cells can
# still concatenate a file name directly (e.g. sourceDataPath + file).
path = os.getcwd()
sourceDataPath = os.path.join(path, 'CleanData', '')
exportPath = os.path.join(path, 'Export', '')

print("Source Data:", sourceDataPath)
print("Exports:", exportPath)

Source Data: C:\Users\TheCu\OneDrive\Documents\Grad-School-Docs\CapstoneProject\Repo\CleanData\
Exports: C:\Users\TheCu\OneDrive\Documents\Grad-School-Docs\CapstoneProject\Repo\Export\


# Reusable Functions:

In [85]:
# Takes a DF and Produces a Report of Null Values
def nullAnalysis(df):
    """Summarise the missing values in each column of ``df``.

    Prints three counts (total variables, variables without nulls, variables
    with nulls) and returns a per-variable report.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that contains at least one null, sorted by
        ``Nulls`` in descending order.  Columns are ``Variable``,
        ``Observations``, ``Nulls``, ``Null_Per`` (percentage of nulls, rounded
        to 3 decimals), ``Num_Unique`` and ``Type``.  The index of each row is
        the position of that variable among the columns of ``df``.
    """
    reportCols = ["Variable","Observations","Nulls","Null_Per","Num_Unique","Type"]

    # Collect one record per column and build the frame once, rather than
    # enlarging a DataFrame row by row inside the loop.
    records = []
    for var in df:
        series = df[var]
        countOfObs = len(series)
        countOfNull = int(series.isnull().sum())
        # A frame with no rows has no meaningful percentage; report 0 instead of dividing by zero.
        perOfNull = round((100 * countOfNull) / countOfObs, 3) if countOfObs else 0.0
        records.append([var, countOfObs, countOfNull, perOfNull, series.nunique(), series.dtypes])
    null_summaryDF = pd.DataFrame(records, columns=reportCols)

    # Get Summary Stats
    hasNulls = null_summaryDF['Nulls'] != 0
    countofVars = len(null_summaryDF)
    countofNull = int(hasNulls.sum())
    print("Count of Variables:", countofVars)
    print("Count of Variables without nulls:", countofVars - countofNull)
    print("Count of Variables with null :", countofNull)

    # Format Summary: keep only the variables that have nulls, worst first
    summaryDF = null_summaryDF[hasNulls].sort_values(by=['Nulls'], ascending=False)
    return summaryDF

# Get and Prep Data

In [86]:
# Read the prepared source file and discard its 'Unnamed: 0' column
# (presumably the index saved with the CSV -- confirm against the data-prep notebook)
file = 'completeDF.csv'
origDF = (
    pd.read_csv(sourceDataPath + file, skipinitialspace=True)
    .drop(columns=['Unnamed: 0'])
)
origDF


Unnamed: 0,gvkey,tic,curncd,ceoso,cfoso,currtr,src,auop,acominc,acominc_std,...,st_volatility,sec_ajexm,sec_ajpm,sec_trfm_mean,sec_trfm_std,sec_trt1m_mean,sec_trt1m_std,rat_spcsrc,lawsuit,SettlementAmount
0,1239,ACV,USD,Y,Y,1.000000,5,4.0,2010.0,-70.29700,...,0.14,1.000000,1.000000,6.001949,0.026279,1.631413,6.320251,6.0,1.0,0.0
1,1266,ALCO,USD,Y,Y,1.000000,5,1.0,2011.5,0.00000,...,0.27,1.000000,1.000000,1.803951,0.017539,0.991502,9.322428,3.0,,
2,1408,BEAM,USD,Y,Y,1.000000,8,1.0,2011.5,-202.22500,...,0.89,1.000000,1.000000,19.151070,2.316089,2.017711,6.373380,4.0,,
3,1429,2388B,USD,Y,Y,1.000000,3,1.0,2011.5,-77.61875,...,,,,,,,,0.0,,
4,1659,ANDE,USD,Y,Y,1.000000,5,1.0,2011.5,-34.61225,...,0.29,1.453704,1.453704,1.274316,0.020235,2.559811,9.131795,6.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,221545,IBA,MXN,Y,Y,0.076348,53,4.0,2011.5,-2.70650,...,0.32,1.000000,1.000000,1.756264,0.051788,1.996371,6.788384,0.0,,
344,222519,GMK,MXN,Y,Y,0.076348,53,4.0,2011.5,-7.69025,...,0.70,1.000000,1.000000,1.204377,0.000000,4.203464,11.024610,0.0,,
345,241637,BUD,USD,Y,Y,1.000000,53,1.0,2011.5,790.00000,...,0.28,1.000000,1.000000,1.043322,0.030272,1.827778,5.822912,0.0,,
346,264393,IVFH,USD,Y,Y,1.000000,5,1.0,2011.5,0.00000,...,1.49,0.473704,0.473704,1.000000,0.000000,6.590749,28.817567,0.0,,


In [87]:
# Keep a copy of the fields that help when reading/interpreting results but are not modeling inputs
lookupCols = ['gvkey', 'tic', 'curncd', 'currtr']
orig_lookUps = origDF[lookupCols].copy()

# Remove them from origDF; 'gvkey' stays in both frames
origDF = origDF.drop(columns=[col for col in lookupCols if col != 'gvkey'])
orig_lookUps


Unnamed: 0,gvkey,tic,curncd,currtr
0,1239,ACV,USD,1.000000
1,1266,ALCO,USD,1.000000
2,1408,BEAM,USD,1.000000
3,1429,2388B,USD,1.000000
4,1659,ANDE,USD,1.000000
...,...,...,...,...
343,221545,IBA,MXN,0.076348
344,222519,GMK,MXN,0.076348
345,241637,BUD,USD,1.000000
346,264393,IVFH,USD,1.000000


In [88]:
# Selectively fill nulls: only the columns listed here have NaN replaced with 0
colsToFillNa = [
    'lawsuit',
    'rat_spcsrc',
    'rest_count',
    'rest_sum_diff',
    'rest_count_of_diffs',
    'rest_a_sum_diff',
    'rest_a_count_of_diffs',
]
zeroFilled = origDF[colsToFillNa].fillna(0)
origDF[colsToFillNa] = zeroFilled

In [89]:
# Report the variables that still contain nulls after the selective fill (most nulls first)
nullAnalysisDF = nullAnalysis(origDF)
nullAnalysisDF

Count of Variables: 184
Count of Variables without nulls: 46
Count of Variables with null : 138


Unnamed: 0,Variable,Observations,Nulls,Null_Per,Num_Unique,Type
183,SettlementAmount,348,302,86.782,12,float64
158,xint_std,348,83,23.851,257,float64
154,wcap_std,348,81,23.276,125,float64
117,pnca,348,73,20.977,70,float64
75,fopo,348,54,15.517,135,float64
...,...,...,...,...,...,...
63,epsfi,348,1,0.287,56,float64
29,chech,348,1,0.287,327,float64
65,epsfx,348,1,0.287,334,float64
78,fyear_std,348,1,0.287,347,float64


In [90]:
# Executive Sign-Off
## CEO and CFO sign-off values were confirmed to always match, so keep a single column named 'exesign'
origDF = origDF.drop(columns=['cfoso']).rename(columns={'ceoso': 'exesign'})

# Map Boolean Values to Numeric
## Three possible values: Yes, No, Empty.  Y becomes 1; N and E both become 0.
exesignCodes = {'Y': 1, 'N': 0, 'E': 0}
origDF['exesign'] = origDF['exesign'].map(exesignCodes)
origDF['exesign'].value_counts(dropna=False)

1    327
0     21
Name: exesign, dtype: int64

In [91]:
# Auditor Opinion
## Three possible values:
##   0 = unaudited
##   1 = 'Financial statements reflect no unresolvable restrictions and auditor has no significant exceptions'
##   4 = 'Auditor has expressed an unqualified opinion regarding the financial statements but has added
##        explanatory language'

## Recode so the categories use consecutive integers (0, 1, 2)
auopCodes = {0: 0, 1: 1, 4: 2}
origDF['auop'] = origDF['auop'].map(auopCodes)
origDF['auop'].value_counts(dropna=False)



1    251
2     93
0      4
Name: auop, dtype: int64

In [93]:
# Scale Data
