# Intro

This is the main notebook for the Capstone Project course

In [667]:
# Standard Libraries
import pandas as pd
import numpy as np
import os
import warnings
import json

# Plotting Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# set Theme
plt.style.use('seaborn')
sns.set_theme(style="whitegrid")
warnings.filterwarnings('ignore')

In [668]:
# Set Paths
path = os.getcwd()
sourceDataPath = path +'\\SourceData\\'
exportPath = path +'\\Export\\'
trimmedCSVPath = path +'\\SourceData\\'
print(f"Source Data:",sourceDataPath)
print(f"Exports:",exportPath)
print(f"Trimmed CSV:",trimmedCSVPath)

Source Data: C:\Users\TheCu\OneDrive\Documents\Grad-School-Docs\CapstoneProject\Repo\SourceData\
Exports: C:\Users\TheCu\OneDrive\Documents\Grad-School-Docs\CapstoneProject\Repo\Export\
Trimmed CSV: C:\Users\TheCu\OneDrive\Documents\Grad-School-Docs\CapstoneProject\Repo\SourceData\


# Resuable Functions

In [669]:
# Create Large CSV Trim Function

def largeCSVTrim(largeCSVPath, CSVName, trimQuery):
    tempDF = pd.read_csv(largeCSVPath+CSVName)
    df = pd.concat((x.query(trimQuery) for x in tempDF), ignore_index=True)
    return df

In [670]:
# Takes a DF and Producecs a Report of Null Values
def nullAnalysis(df):
    row = []
    null_summaryDF = pd.DataFrame(columns = ["Variable","Observations","Nulls","Null_Per","Num_Unique","Type"])
    for column in df:
        var = column
        countOfObs = len(df[var])
        countOfNull = df[var].isnull().sum()
        perOfNull = round((100 * countOfNull) / countOfObs,3)
        numUnique = df[var].nunique()
        type  = df[var].dtypes
        # Append to Summary DF
        row = [var, countOfObs,countOfNull,perOfNull,numUnique,type]
        df_length = len(null_summaryDF)
        null_summaryDF.loc[df_length] = row
    # Get Summary Stats
    countofVars = len(null_summaryDF)
    countofNotNull = (null_summaryDF['Nulls'] == 0).sum()
    countofNull = (null_summaryDF['Nulls'] != 0).sum()
    print(f"Count of Variables:",countofVars)
    print(f"Count of Variables without nulls:",countofNotNull)
    print(f"Count of Variables with null :",countofNull)
    # Format Summary
    summaryDF = null_summaryDF[null_summaryDF['Nulls'] != 0].sort_values(by=['Nulls'],ascending=False)
    return summaryDF

In [671]:
#statRow = []
cleanActionsDF = pd.DataFrame(columns = ['DF','ActionDesc','List-of-Cols','ResultShape'])

def cleanActionsReport (df,df_name,actionDesc, cols):
    # Collect Stats
    row =[]
    actionDesc = actionDesc
    listCols = cols
    resultShape = df.shape
    # Collect Results
    row = [df_name,actionDesc, listCols, resultShape]
    # append stats to Summary Stats DF
    df = cleanActionsDF
    df_length = len(df)
    df.loc[df_length] = row
    return df

In [672]:
# Dropping Missing Values over 30% -- Will print out a table that shows what variables are dropped
def dropCols_nullThresh(df,thresh,df_name,actionDesc):
    thresh = 1-thresh
    column_names_before = df.columns.to_list()
    before = df.shape[1]
    df = df.dropna(thresh=df.shape[0]*thresh,how='all',axis=1)
    after = df.shape[1]
    column_names_after = df.columns.to_list()
    columns_dropped = []
    for i in column_names_before:
        if i not in column_names_after:
            columns_dropped.append(i)
    total_dropped = before - after
    df = df
    print(f"dropped: {total_dropped}")
    print(f"columns that where dropped where: {columns_dropped}")
    # record actions in summary report
    cleanActionsReport(df,df_name,actionDesc,columns_dropped)
    return df

In [673]:
# Dropping Missing Values over 30% -- Will print out a table that shows what variables are dropped
def dropCols_uniqueThresh(df,thresh,df_name,actionDesc):
    thresh = thresh
    column_names_before = df.columns.to_list()
    before = df.shape[1]
    df = df.drop(columns=df.columns[df.nunique()==1])
    after = df.shape[1]
    column_names_after = df.columns.to_list()
    columns_dropped = []
    for i in column_names_before:
        if i not in column_names_after:
            columns_dropped.append(i)
    total_dropped = before - after
    df = df
    print(f"dropped: {total_dropped}")
    print(f"columns that where dropped where: {columns_dropped}")
    # record actions in summary report
    cleanActionsReport(df,df_name,actionDesc,columns_dropped)
    return df


# Source Data

## Prep -- Trim Large CSV
(Note: Creates trimmed version of CSV's for import.
Only need to run this once, and then comment out to save time)

In [674]:
# Trim Securities_Full
largeCSVPath = 'C:/Users/TheCu/OneDrive/Documents/Grad-School-Docs/CapstoneProject/SourceData_Orig/'
CSVName = 'Securities_Full.csv'
trimQuery = "tic == 'SAM'"

print("Starting...")
# Securities_SAM = largeCSVTrim(largeCSVPath,CSVName,trimQuery) # Get Trimmed Data
# Securities_SAM.to_csv(trimmedCSVPath+'Securities_SAM.csv') # Export Trimmed Data

#View Trimmed Dataframe
print("Securities_SAM:")
# Securities_SAM.sample(3)

Starting...
Securities_SAM:


In [675]:
# Trim Fundamentals_Full
largeCSVPath = 'C:/Users/TheCu/OneDrive/Documents/Grad-School-Docs/CapstoneProject/SourceData_Orig/'
CSVName = 'Fundamentals_Full.csv'
trimQuery = "gsector == 30"

print("Starting...")
fundamentals = pd.read_csv(largeCSVPath+CSVName)
fundamentals['gsector'] = fundamentals['gsector'].astype(str)
fundamentals = fundamentals[fundamentals['gsector'] == '30']
fundamentals.to_csv(trimmedCSVPath+'Fundamentals_30.csv') # Export Trimmed Data

#View Trimmed Dataframe
print("Fundamentals_30:")
fundamentals.shape

Starting...
Fundamentals_30:


(2323, 1768)

In [676]:
# Trim Stocks_DS
largeCSVPath = 'C:/Users/TheCu/OneDrive/Documents/Grad-School-Docs/CapstoneProject/SourceData_Orig/'
CSVName = 'Stocks_DS.csv'
trimQuery = "tic == 'SAM'"

print("Starting...")
# Stocks_SAM = largeCSVTrim(largeCSVPath,CSVName,trimQuery) # Get Trimmed Data
# Stocks_SAM.to_csv(trimmedCSVPath+'Stocks_SAM.csv') # Export Trimmed Data

#View Trimmed Dataframe
print("Stocks_SAM:")
# Stocks_SAM.sample(3)

Starting...
Stocks_SAM:


## Get Source Data
(Note: Creates trimmed version of CSV's for import.
Only need to run this once, and then comment out to save time)

In [677]:
# Get Fundamentals
file = 'Fundamentals_30.csv'
fundamentals = pd.read_csv(sourceDataPath+file,skipinitialspace = True)
fundamentals.head()

Unnamed: 0.1,Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,conm,...,priusa,sic,spcindcd,spcseccd,spcsrc,state,stko,weburl,dldte,ipodate
0,218,1239,9/30/2010,2010,INDL,C,D,SUMM_STD,ACV,ALBERTO-CULVER CO,...,1,2844,215.0,978.0,A-,IL,0,www.alberto.com,5/11/2011,
1,219,1239,9/30/2010,2010,INDL,C,D,STD,ACV,ALBERTO-CULVER CO,...,1,2844,215.0,978.0,A-,IL,0,www.alberto.com,5/11/2011,
2,242,1266,9/30/2010,2010,INDL,C,D,SUMM_STD,ALCO,ALICO INC,...,1,100,112.0,970.0,B-,FL,0,www.alicoinc.com,,
3,243,1266,9/30/2010,2010,INDL,C,D,STD,ALCO,ALICO INC,...,1,100,112.0,970.0,B-,FL,0,www.alicoinc.com,,
4,244,1266,9/30/2011,2011,INDL,C,D,SUMM_STD,ALCO,ALICO INC,...,1,100,112.0,970.0,B-,FL,0,www.alicoinc.com,,


In [678]:
# Get Securities

In [679]:
# Get Stocks

In [680]:
# Get Ratings


# Data Clean-Up

## Fundamentals

In [681]:
# Drop Obvious Un-needed 1
cleanActionsDesc = "Dropped obvious low value columns -- 1"
df_name = "Fundamentals"

colsToDrop = ['datadate','apdedate','fdate','pdate','fyr','add1','addzip','busdesc','city','conml',
              'ein','fax','fyrc','incorp','loc','phone','state','weburl']

fundamentals = fundamentals.drop(columns=colsToDrop)
cleanActionsReport(fundamentals,df_name,cleanActionsDesc,colsToDrop)

Unnamed: 0,DF,ActionDesc,List-of-Cols,ResultShape
0,Fundamentals,Dropped obvious low value columns -- 1,"[datadate, apdedate, fdate, pdate, fyr, add1, ...","(2323, 1751)"


In [682]:
# Initial Column Reduce -- Drop Obvious -- Nulls
# Run Initial -- Null Report
fundamentals_nullReport = nullAnalysis(fundamentals)
fundamentals_nullReport

Count of Variables: 1751
Count of Variables without nulls: 25
Count of Variables with null : 1726


Unnamed: 0,Variable,Observations,Nulls,Null_Per,Num_Unique,Type
876,xstfo,2323,2323,100.000,0,float64
1387,isfi_dc,2323,2323,100.000,0,float64
704,tdscd,2323,2323,100.000,0,float64
705,tdsce,2323,2323,100.000,0,float64
706,tdsg,2323,2323,100.000,0,float64
...,...,...,...,...,...,...
346,ib,2323,67,2.884,1336,float64
713,teq,2323,60,2.583,1316,float64
88,at,2323,58,2.497,1310,float64
669,sale,2323,58,2.497,1328,float64


In [683]:
# Initial Column Reduce -- Drop Obvious -- Nulls
# Initial Null Drop -- More than 75% -- Will Re-Run w/ lower threshold after rows combined
cleanActionsDesc = "Initial Null Drop -- columns w/ 63% Null or more."
df_name = "Fundamentals"

fundamentals = dropCols_nullThresh(fundamentals,0.63,df_name,cleanActionsDesc)
fundamentals

dropped: 1418
columns that where dropped where: ['acctchg', 'acqmeth', 'adrr', 'bspr', 'compst', 'curuscn', 'ltcm', 'ogm', 'stalt', 'udpl', 'acco', 'acoxar', 'acqao', 'acqcshi', 'acqgdwl', 'acqic', 'acqintan', 'acqinvt', 'acqlntal', 'acqniintc', 'acqppe', 'acqsc', 'adpac', 'aedi', 'afudcc', 'afudci', 'amc', 'amdc', 'amgw', 'ano', 'aol2', 'apalch', 'apb', 'apc', 'apofs', 'aqa', 'aqd', 'aqeps', 'aqi', 'aqp', 'aqpl1', 'aqs', 'arb', 'arc', 'arce', 'arced', 'arceeps', 'artfs', 'aul3', 'autxr', 'balr', 'banlr', 'bast', 'bastr', 'batr', 'bcef', 'bclr', 'bcltbl', 'bcnlr', 'bcrbl', 'bct', 'bctbl', 'bctr', 'bltbl', 'ca', 'capr1', 'capr2', 'capr3', 'cb', 'cbi', 'cdpac', 'cdvc', 'cfbd', 'cfere', 'cfo', 'cfpdo', 'cga', 'cgri', 'cgti', 'cgui', 'chs', 'clfc', 'clfx', 'clg', 'clis', 'cll', 'cllc', 'clo', 'clrll', 'clt', 'cmp', 'cnltbl', 'cpcbl', 'cpdoi', 'cpnli', 'cppbl', 'cprei', 'crv', 'crvnli', 'cshrc', 'cshrp', 'cshrso', 'cshrt', 'cshrw', 'dbi', 'dcs', 'depc', 'dfpac', 'dfs', 'dfxa', 'dlcch', 'dlt

Unnamed: 0.1,Unnamed: 0,gvkey,fyear,indfmt,consol,popsrc,datafmt,tic,conm,acctstd,...,gsector,gsubind,idbflag,naics,priusa,sic,spcindcd,spcseccd,spcsrc,stko
0,218,1239,2010,INDL,C,D,SUMM_STD,ACV,ALBERTO-CULVER CO,,...,30,30302010,D,325620,1,2844,215.0,978.0,A-,0
1,219,1239,2010,INDL,C,D,STD,ACV,ALBERTO-CULVER CO,DS,...,30,30302010,D,325620,1,2844,215.0,978.0,A-,0
2,242,1266,2010,INDL,C,D,SUMM_STD,ALCO,ALICO INC,,...,30,30202010,D,111310,1,100,112.0,970.0,B-,0
3,243,1266,2010,INDL,C,D,STD,ALCO,ALICO INC,DS,...,30,30202010,D,111310,1,100,112.0,970.0,B-,0
4,244,1266,2011,INDL,C,D,SUMM_STD,ALCO,ALICO INC,,...,30,30202010,D,111310,1,100,112.0,970.0,B-,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2318,60832,264393,2013,INDL,C,D,SUMM_STD,IVFH,INNOVATIVE FOOD HOLDINGS,,...,30,30101020,D,4244,1,5140,,,,3
2319,60833,264393,2013,INDL,C,D,STD,IVFH,INNOVATIVE FOOD HOLDINGS,DS,...,30,30101020,D,4244,1,5140,,,,3
2320,61135,277487,2011,INDL,C,D,STD,LND,BRASILAGRO CIA BRAS DE PROP,DI,...,30,30202010,B,1111,90,100,,,,0
2321,61136,277487,2012,INDL,C,D,STD,LND,BRASILAGRO CIA BRAS DE PROP,DI,...,30,30202010,B,1111,90,100,,,,0


In [684]:
# Initial Column Reduce -- Drop Obvious -- Nulls
# Re-Run Null Report
fundamentals_nullReport = nullAnalysis(fundamentals)
fundamentals_nullReport

Count of Variables: 333
Count of Variables without nulls: 25
Count of Variables with null : 308


Unnamed: 0,Variable,Observations,Nulls,Null_Per,Num_Unique,Type
264,txds,2323,1462,62.936,443,float64
114,dxd5,2323,1459,62.807,318,float64
260,txdfed,2323,1458,62.764,547,float64
196,optex,2323,1457,62.721,644,float64
152,invfg,2323,1454,62.591,769,float64
...,...,...,...,...,...,...
141,ib,2323,67,2.884,1336,float64
247,teq,2323,60,2.583,1316,float64
40,at,2323,58,2.497,1310,float64
235,sale,2323,58,2.497,1328,float64


In [685]:
# Initial Column Reduce -- Drop Obvious -- Cols w/ 1 Value
cleanActionsDesc = "Dropped columns with only 1 unique value "
df_name = "Fundamentals"

fundamentals = dropCols_uniqueThresh(fundamentals,1,df_name,cleanActionsDesc)
fundamentals


dropped: 16
columns that where dropped where: ['indfmt', 'consol', 'popsrc', 'curcd', 'final', 'scf', 'acchg', 'aocisecgl', 'esopr', 'itcb', 'xoptd', 'xopteps', 'rank', 'rdipa_fn', 'stkco_fn', 'gsector']


Unnamed: 0.1,Unnamed: 0,gvkey,fyear,datafmt,tic,conm,acctstd,ajex,ajp,curncd,...,gind,gsubind,idbflag,naics,priusa,sic,spcindcd,spcseccd,spcsrc,stko
0,218,1239,2010,SUMM_STD,ACV,ALBERTO-CULVER CO,,,,,...,303020,30302010,D,325620,1,2844,215.0,978.0,A-,0
1,219,1239,2010,STD,ACV,ALBERTO-CULVER CO,DS,1.0,1.0,USD,...,303020,30302010,D,325620,1,2844,215.0,978.0,A-,0
2,242,1266,2010,SUMM_STD,ALCO,ALICO INC,,,,,...,302020,30202010,D,111310,1,100,112.0,970.0,B-,0
3,243,1266,2010,STD,ALCO,ALICO INC,DS,1.0,1.0,USD,...,302020,30202010,D,111310,1,100,112.0,970.0,B-,0
4,244,1266,2011,SUMM_STD,ALCO,ALICO INC,,,,,...,302020,30202010,D,111310,1,100,112.0,970.0,B-,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2318,60832,264393,2013,SUMM_STD,IVFH,INNOVATIVE FOOD HOLDINGS,,,,,...,301010,30101020,D,4244,1,5140,,,,3
2319,60833,264393,2013,STD,IVFH,INNOVATIVE FOOD HOLDINGS,DS,1.0,1.0,USD,...,301010,30101020,D,4244,1,5140,,,,3
2320,61135,277487,2011,STD,LND,BRASILAGRO CIA BRAS DE PROP,DI,1.0,1.0,BRR,...,302020,30202010,B,1111,90,100,,,,0
2321,61136,277487,2012,STD,LND,BRASILAGRO CIA BRAS DE PROP,DI,1.0,1.0,BRR,...,302020,30202010,B,1111,90,100,,,,0


In [686]:
cleanActionsDF

Unnamed: 0,DF,ActionDesc,List-of-Cols,ResultShape
0,Fundamentals,Dropped obvious low value columns -- 1,"[datadate, apdedate, fdate, pdate, fyr, add1, ...","(2323, 1751)"
1,Fundamentals,Initial Null Drop -- columns w/ 63% Null or more.,"[acctchg, acqmeth, adrr, bspr, compst, curuscn...","(2323, 333)"
2,Fundamentals,Dropped columns with only 1 unique value,"[indfmt, consol, popsrc, curcd, final, scf, ac...","(2323, 317)"


### Fundamentals Restatements

In [687]:
# make table w/ just restatement rows to get meaningful cols for a diff
## note came from a lengthy analysis -- analysis docs stored in repo
fund_restatement = fundamentals[['gvkey','fyear','datafmt','tic',
                                 'cogs','dp','mii','nopi','pi','txt',
                                 'wcap','xsga']]

# Make copy w/ cols to rejoin after diff
fund_restatement_rejoin = fund_restatement[['gvkey','fyear','datafmt','tic']]
# Drop string columns that can't be used for diff
fund_restatement = fund_restatement.drop(columns =['datafmt','tic'])
fund_restatement

Unnamed: 0,gvkey,fyear,cogs,dp,mii,nopi,pi,txt,wcap,xsga
0,1239,2010,734.501,26.658,0.000,-13.684,217.844,62.708,520.680,601.963
1,1239,2010,734.501,26.658,0.000,1.490,217.844,62.708,520.680,601.963
2,1266,2010,62.648,7.221,0.000,-1.412,-1.824,-1.201,29.529,6.458
3,1266,2010,62.648,7.221,0.000,1.688,-1.824,-1.201,29.529,6.458
4,1266,2011,67.832,7.327,0.000,-0.563,12.527,5.430,17.354,8.196
...,...,...,...,...,...,...,...,...,...,...
2318,264393,2013,16.854,0.263,0.000,0.000,-1.486,0.000,-0.174,5.420
2319,264393,2013,16.854,0.263,0.000,0.000,-1.486,0.000,-0.174,5.420
2320,277487,2011,30.511,9.560,-0.051,30.513,12.728,3.325,88.979,18.166
2321,277487,2012,54.097,14.382,-0.511,17.423,-9.679,-6.394,53.942,15.819


In [688]:
# Run Diff
fund_restatement_diff = fund_restatement.diff()
fund_restatement_diff = fund_restatement_diff.rename(columns={'gvkey':'gvkey_diff','fyear':'fyear_diff'})
fund_restatement_diff = fund_restatement_rejoin.join(fund_restatement_diff)
fund_restatement_diff


Unnamed: 0,gvkey,fyear,datafmt,tic,gvkey_diff,fyear_diff,cogs,dp,mii,nopi,pi,txt,wcap,xsga
0,1239,2010,SUMM_STD,ACV,,,,,,,,,,
1,1239,2010,STD,ACV,0.0,0.0,0.000,0.000,0.000,15.174,0.000,0.000,0.000,0.000
2,1266,2010,SUMM_STD,ALCO,27.0,0.0,-671.853,-19.437,0.000,-2.902,-219.668,-63.909,-491.151,-595.505
3,1266,2010,STD,ALCO,0.0,0.0,0.000,0.000,0.000,3.100,0.000,0.000,0.000,0.000
4,1266,2011,SUMM_STD,ALCO,0.0,1.0,5.184,0.106,0.000,-2.251,14.351,6.631,-12.175,1.738
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2318,264393,2013,SUMM_STD,IVFH,0.0,1.0,3.060,0.138,0.000,1.282,-3.516,0.000,-1.082,1.315
2319,264393,2013,STD,IVFH,0.0,0.0,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
2320,277487,2011,STD,LND,13094.0,-2.0,13.657,9.297,-0.051,30.513,14.214,3.325,89.153,12.746
2321,277487,2012,STD,LND,0.0,1.0,23.586,4.822,-0.460,-13.090,-22.407,-9.719,-35.037,-2.347


In [689]:
# Trim Diff -- only rows that compare same gvkey and same year
fund_restatement_diff = fund_restatement_diff[fund_restatement_diff['gvkey_diff'] == 0]
fund_restatement_diff = fund_restatement_diff[fund_restatement_diff['fyear_diff'] == 0]

# Add Sum Column
#TODO: Make Diffs Absolute so positive/negative don't cancel each other
colsToSum = ['cogs','dp','mii','nopi','pi','txt','wcap','xsga']
fund_restatement_diff['sum_diff'] = fund_restatement_diff[colsToSum].sum(axis=1)

#Drop Columns Where Sum of Diffs = 0
fund_restatement_diff = fund_restatement_diff[fund_restatement_diff['sum_diff'] > 0]

fund_restatement_diff

Unnamed: 0,gvkey,fyear,datafmt,tic,gvkey_diff,fyear_diff,cogs,dp,mii,nopi,pi,txt,wcap,xsga,sum_diff
1,1239,2010,STD,ACV,0.0,0.0,0.0,0.0,0.0,15.174,0.0,0.0,0.0,0.0,15.174
3,1266,2010,STD,ALCO,0.0,0.0,0.0,0.0,0.0,3.100,0.0,0.0,0.0,0.0,3.100
5,1266,2011,STD,ALCO,0.0,0.0,0.0,0.0,0.0,1.685,0.0,0.0,0.0,0.0,1.685
7,1266,2012,STD,ALCO,0.0,0.0,0.0,0.0,0.0,1.918,0.0,0.0,0.0,0.0,1.918
11,1408,2010,STD,BEAM,0.0,0.0,2697.5,141.2,8.4,15.800,241.9,55.4,,1325.1,4485.300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2270,199456,2012,STD,RNDY,0.0,0.0,0.0,0.0,0.0,137.017,0.0,0.0,0.0,0.0,137.017
2272,199456,2013,STD,RNDY,0.0,0.0,0.0,0.0,0.0,2.494,0.0,0.0,0.0,0.0,2.494
2305,241637,2010,STD,BUD,0.0,0.0,0.0,0.0,0.0,1193.000,0.0,0.0,0.0,0.0,1193.000
2307,241637,2011,STD,BUD,0.0,0.0,0.0,0.0,0.0,818.000,0.0,0.0,0.0,0.0,818.000


In [690]:
# Create Diff Summary
fund_restatement_summary = fund_restatement_diff

## Add Column that counts non-zero's per row
fund_restatement_summary['count_of_diffs'] = fund_restatement_summary[colsToSum].gt(0).sum(axis=1)

# Drop Individual Counts, _diff columns, tic,
fund_restatement_summary = fund_restatement_summary.drop(columns=['datafmt','tic','gvkey_diff','fyear_diff',
                                                                  'cogs','dp','mii','nopi','pi','txt','wcap','xsga'])

# Group By -- Turn Year into count of years
fund_restatement_summary = fund_restatement_summary.groupby('gvkey')\
    .agg({'fyear':'count','sum_diff':'sum', 'count_of_diffs': 'sum'}).reset_index()
fund_restatement_summary = fund_restatement_summary.rename(columns={'fyear':'count_of_restatements'})
fund_restatement_summary

Unnamed: 0,gvkey,count_of_restatements,sum_diff,count_of_diffs
0,1239,1,15.174,1
1,1266,3,6.703,3
2,1408,3,4929.500,11
3,1429,1,0.063,2
4,1659,2,6.200,4
...,...,...,...,...
233,193157,3,1.372,7
234,196258,1,0.033,2
235,197956,3,6.665,13
236,199456,2,139.511,2


In [691]:
# Export Diff Reports to Excel

with pd.ExcelWriter(exportPath + 'Fundamentals_Diff_Reports.xlsx') as writer:
    fund_restatement_diff.to_excel(writer, sheet_name='All_Diffs')
    fund_restatement_summary.to_excel(writer, sheet_name='Summary')


### Fundamentals -- Resume Data Clean-up
Many columns could not be dropped until restatement was calculated

In [692]:
# drop restatement rows
## no longer useful since restatement data has been summarized
fundamentals = fundamentals[fundamentals['datafmt'] == 'STD']

cleanActionsDesc = 'Dropping all that are not standard statements -- value of others captured in restatement summary'
df_name = 'Fundamentals'
cols = 'applies to all cols, roughly 50% of rows dropped'

cleanActionsReport(fundamentals,df_name,cleanActionsDesc, cols)

Unnamed: 0,DF,ActionDesc,List-of-Cols,ResultShape
0,Fundamentals,Dropped obvious low value columns -- 1,"[datadate, apdedate, fdate, pdate, fyr, add1, ...","(2323, 1751)"
1,Fundamentals,Initial Null Drop -- columns w/ 63% Null or more.,"[acctchg, acqmeth, adrr, bspr, compst, curuscn...","(2323, 333)"
2,Fundamentals,Dropped columns with only 1 unique value,"[indfmt, consol, popsrc, curcd, final, scf, ac...","(2323, 317)"
3,Fundamentals,Dropping all that are not standard statements ...,"applies to all cols, roughly 50% of rows dropped","(1243, 317)"


In [693]:
# Initial Column Reduce -- Drop Obvious -- Nulls
# Initial Null Drop -- More than 75% -- Will Re-Run w/ lower threshold after rows combined
cleanActionsDesc = "Second Null Drop -- columns w/ 20% Null or more."
df_name = "Fundamentals"

fundamentals = dropCols_nullThresh(fundamentals,0.20,df_name,cleanActionsDesc)
fundamentals

dropped: 45
columns that where dropped where: ['cld2', 'cld3', 'cld4', 'cld5', 'cshr', 'dd3', 'dd4', 'dd5', 'dltp', 'dxd2', 'dxd3', 'dxd4', 'dxd5', 'fatb', 'fate', 'fato', 'fatp', 'invfg', 'invo', 'itci', 'mrc2', 'mrc3', 'mrc4', 'mrc5', 'mrct', 'optca', 'optex', 'optexd', 'optgr', 'optosby', 'optosey', 'recd', 'sppe', 'stkco', 'txc', 'txdfed', 'txdfo', 'txds', 'txfed', 'txs', 'xrent', 'mkvalt', 'spcindcd', 'spcseccd', 'spcsrc']


Unnamed: 0.1,Unnamed: 0,gvkey,fyear,datafmt,tic,conm,acctstd,ajex,ajp,curncd,...,rdipd_fn,rdipeps_fn,ggroup,gind,gsubind,idbflag,naics,priusa,sic,stko
1,219,1239,2010,STD,ACV,ALBERTO-CULVER CO,DS,1.0,1.0,USD,...,NR,NR,3030,303020,30302010,D,325620,1,2844,0
3,243,1266,2010,STD,ALCO,ALICO INC,DS,1.0,1.0,USD,...,NR,NR,3020,302020,30202010,D,111310,1,100,0
5,245,1266,2011,STD,ALCO,ALICO INC,DS,1.0,1.0,USD,...,NR,NR,3020,302020,30202010,D,111310,1,100,0
7,247,1266,2012,STD,ALCO,ALICO INC,DS,1.0,1.0,USD,...,NR,NR,3020,302020,30202010,D,111310,1,100,0
8,248,1266,2013,STD,ALCO,ALICO INC,DS,1.0,1.0,USD,...,NR,NR,3020,302020,30202010,D,111310,1,100,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2317,60831,264393,2012,STD,IVFH,INNOVATIVE FOOD HOLDINGS,DS,1.0,1.0,USD,...,NR,NR,3010,301010,30101020,D,4244,1,5140,3
2319,60833,264393,2013,STD,IVFH,INNOVATIVE FOOD HOLDINGS,DS,1.0,1.0,USD,...,NR,NR,3010,301010,30101020,D,4244,1,5140,3
2320,61135,277487,2011,STD,LND,BRASILAGRO CIA BRAS DE PROP,DI,1.0,1.0,BRR,...,NR,NR,3020,302020,30202010,B,1111,90,100,0
2321,61136,277487,2012,STD,LND,BRASILAGRO CIA BRAS DE PROP,DI,1.0,1.0,BRR,...,NR,NR,3020,302020,30202010,B,1111,90,100,0


In [694]:
cleanActionsDF

Unnamed: 0,DF,ActionDesc,List-of-Cols,ResultShape
0,Fundamentals,Dropped obvious low value columns -- 1,"[datadate, apdedate, fdate, pdate, fyr, add1, ...","(2323, 1751)"
1,Fundamentals,Initial Null Drop -- columns w/ 63% Null or more.,"[acctchg, acqmeth, adrr, bspr, compst, curuscn...","(2323, 333)"
2,Fundamentals,Dropped columns with only 1 unique value,"[indfmt, consol, popsrc, curcd, final, scf, ac...","(2323, 317)"
3,Fundamentals,Dropping all that are not standard statements ...,"applies to all cols, roughly 50% of rows dropped","(1243, 317)"
4,Fundamentals,Second Null Drop -- columns w/ 20% Null or more.,"[cld2, cld3, cld4, cld5, cshr, dd3, dd4, dd5, ...","(1243, 272)"


In [695]:
# Export Cleaned Data to CSV
fundamentals.to_csv(trimmedCSVPath+'fundamental_cleaned.csv') # Export Trimmed Data