In [None]:
"""Progetto_gruppo3.ipynb
"""

#%% IMPORT SECTION
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn import ensemble 
from sklearn import tree

#%% DIRECTORY SETTINGS
# Change work directory. The shared-drive location remains the default, but it
# can be overridden with the GRUPPO3_WORKDIR environment variable so the
# notebook can also run where that network share is not reachable.
# The relative 'Data/...' paths used when loading the CSV files are resolved
# against this directory.
WORK_DIR = os.environ.get('GRUPPO3_WORKDIR', '//srv0001/Risorse/Public/Gruppo3-Python')
os.chdir(WORK_DIR)  # raises if the directory does not exist or is not accessible
cwd = os.getcwd()
print(cwd)

#%% PREPARE FUNCTION
def prepare_data(data):
    """Build the numeric feature matrix used to fit and score the models.

    The columns REF_DATE, SEDOLCHK, ICB_INDUSTRY_NUM, ICB_SECTOR_NUM and
    IS_FINANCIAL are removed, and every remaining column is cast to float64.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the five columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the remaining columns as float64; ``data``
        itself is left untouched.

    Raises
    ------
    KeyError
        If any of the five columns to remove is absent from ``data``.
    """
    non_feature_columns = [
        "REF_DATE",
        "SEDOLCHK",
        "ICB_INDUSTRY_NUM",
        "ICB_SECTOR_NUM",
        "IS_FINANCIAL",
    ]
    remaining = data.drop(columns=non_feature_columns)
    features = remaining.astype("float64")
    return features


In [None]:
#%% IMPORT DATA
# Read the two CSV inputs (paths are relative to the current working directory).
# clean.csv is semicolon-separated; companynames.csv is comma-separated and is
# reduced to one row per SEDOLCHK.
clean_data = pd.read_csv(r'Data/clean.csv', sep=';')
companynames = pd.read_csv(r'Data/companynames.csv', sep=',').drop_duplicates(subset="SEDOLCHK")
print(clean_data.head(10))

# Work on a copy so clean_data keeps the values exactly as loaded.
filtered_data = clean_data.copy()
# Rows whose ICB_INDUSTRY_NUM equals 8000 get IS_FINANCIAL = 1 ...
industry_is_8000 = filtered_data["ICB_INDUSTRY_NUM"] == 8000
filtered_data.loc[industry_is_8000, 'IS_FINANCIAL'] = 1
# ... and every value still missing, in any column, becomes 0.
filtered_data = filtered_data.fillna(0)

In [None]:
#%% K-FOLD VALIDATION
# Randomly assign every row of filtered_data to one of K folds and hold one
# fold out as the test set.
# NOTE(review): this produces a single train/test split. The previous loop
# rebuilt data_train/data_test for each of the K folds, but every pass
# overwrote the last, so only the final fold (K - 1) was ever used. A real
# K-fold evaluation would need to refit and score the models once per fold.
SEED = 42                                        # fixed seed so a fresh run reproduces the same split
valid = filtered_data.copy().to_numpy()          # ndarray view of the data, kept for backward compatibility
N = valid.shape[0]
K = 100                                          # number of folds
preds_kfold = np.empty(N)                        # allocated but not filled in the cells shown here
rng = np.random.default_rng(SEED)
folds = rng.integers(0, K, size=N)               # fold label in [0, K) for each row

idx = K - 1                                      # held-out fold (the one the former loop ended on)
is_test = folds == idx

# Slice the DataFrame directly: column names and dtypes come from
# filtered_data instead of a hand-maintained list that had to match the CSV.
# reset_index gives both frames a fresh 0..n-1 index, as the former
# pd.DataFrame(...) construction did.
data_train = filtered_data[~is_test].reset_index(drop=True)
data_test = filtered_data[is_test].reset_index(drop=True)


In [None]:
#%% PREPARE FEATURES
features = prepare_data(data_train)
print(features)

# Convert any object-dtype column of the train and test frames to a numeric
# dtype, leaving the SEDOLCHK column as it is. Values that cannot be parsed
# become NaN (errors='coerce').
for frame in (data_train, data_test):
    for col in frame.columns:
        if frame[col].dtype == object and col != 'SEDOLCHK':
            frame[col] = pd.to_numeric(frame[col], errors='coerce')


In [None]:
#%% SINGLE TREE
# Fit one unrestricted-depth decision tree on the training features.
# random_state pins the tree's internal randomness so reruns give the same tree.
model = tree.DecisionTreeClassifier(max_depth=None, random_state=0)
model.fit(features, data_train["IS_FINANCIAL"])

# Mean accuracy on the training rows, then on the held-out rows. Both are
# printed: a bare expression in the middle of a cell is evaluated but never shown.
print(model.score(prepare_data(data_train), data_train["IS_FINANCIAL"]))
print(model.score(prepare_data(data_test), data_test["IS_FINANCIAL"]))

plt.figure(figsize=[30.0, 30.0])
# feature_names is passed as a plain list; some scikit-learn releases reject a pandas Index here.
tree.plot_tree(model, feature_names=list(features.columns))

In [None]:
#%% GET THE OUTLIERS SEDOLCHK & names
# Test rows where the single tree's prediction disagrees with IS_FINANCIAL.
predicted = model.predict(prepare_data(data_test))
g_truth = data_test["IS_FINANCIAL"].to_numpy(copy=True)
misclassified = predicted != g_truth
# Relies on data_test carrying a default 0..n-1 index, so that the row mask
# selects the same rows the positional lookup did.
outliers_sedol = data_test.loc[misclassified, ["SEDOLCHK"]].reset_index(drop=True)

# Attach the company-name columns; the default inner join keeps only the
# SEDOLCHK values that appear in companynames.
outliers = pd.merge(outliers_sedol, companynames, on='SEDOLCHK')

print(outliers)

In [None]:
#%% RANDOM FOREST
# 500 bootstrapped trees; max_features=None lets every split consider all features.
# random_state makes the fitted forest reproducible across runs, and n_jobs=-1
# builds the trees on all available cores.
forest = ensemble.RandomForestClassifier(
    n_estimators=500,
    bootstrap=True,
    max_features=None,
    random_state=0,
    n_jobs=-1,
)
forest.fit(features, data_train["IS_FINANCIAL"])

# Mean accuracy on the training rows, then on the held-out rows. Both are
# printed so the second one is shown however the cell is run.
print(forest.score(prepare_data(data_train), data_train["IS_FINANCIAL"]))
print(forest.score(prepare_data(data_test), data_test["IS_FINANCIAL"]))

In [None]:
#%% GET THE OUTLIERS SEDOLCHK
# Test rows where the random forest's prediction disagrees with IS_FINANCIAL.
predicted_f = forest.predict(prepare_data(data_test))
g_truth = data_test["IS_FINANCIAL"].to_numpy(copy=True)
misclassified_f = predicted_f != g_truth
# Relies on data_test carrying a default 0..n-1 index, so that the row mask
# selects the same rows the positional lookup did.
outliers_sedol_f = data_test.loc[misclassified_f, ["SEDOLCHK"]].reset_index(drop=True)

# Attach the company-name columns; the default inner join keeps only the
# SEDOLCHK values that appear in companynames.
outliers_f = pd.merge(outliers_sedol_f, companynames, on='SEDOLCHK')

print(outliers_f)