# Predicting Greatstone Rating for Mutual Funds 

In [None]:
## Import library
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter('ignore')

import os
pd.set_option('display.max_columns',100)
pd.set_option('display.max_rows',200)

from sklearn.metrics import make_scorer,accuracy_score,average_precision_score

from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.linear_model import *
from sklearn.tree import *
import lightgbm as lgbm
from lightgbm.sklearn import LGBMClassifier
import catboost
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

# explicitly require this experimental feature
from sklearn.experimental import enable_hist_gradient_boosting  
# now you can import normally from ensemble
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.ensemble import VotingClassifier

In [None]:
display(os.getcwd())
os.chdir('/Users/ankitpatel/Downloads/external/')

In [None]:
os.listdir()

In [None]:
## read data ##
rt_3yr=pd.read_csv('return_3year.csv')
rt_5yr=pd.read_csv('return_5year.csv')
rt_10yr=pd.read_csv('return_10year.csv')
fund_allo=pd.read_csv('fund_allocations.csv')
fund_conf=pd.read_csv('fund_config.csv')
fund_specs=pd.read_csv('fund_specs.csv')
other_specs=pd.read_csv('other_specs.csv')
bond_ratings=pd.read_csv('bond_ratings.csv')
fund_ratios=pd.read_csv('fund_ratios.csv')
sample_sub=pd.read_csv('sample_submission.csv')

In [None]:
display(rt_3yr.shape)
display(rt_5yr.shape)
display(rt_10yr.shape)
display(fund_allo.shape)
display(fund_conf.shape)
display(fund_specs.shape)
display(other_specs.shape)
display(bond_ratings.shape)
display(fund_ratios.shape)
display(sample_sub.shape)

## Data Frame :: rt_3yr

In [None]:
## Analyze data one by one ##
rt_3yr.info()

In [None]:
## convert 3yrs_treynor_ratio_fund to float ##
rt_3yr['3yrs_treynor_ratio_fund']=rt_3yr['3yrs_treynor_ratio_fund'].str.replace(",","")

In [None]:
rt_3yr['3yrs_treynor_ratio_fund'].unique()

In [None]:
rt_3yr.isna().sum()

In [None]:
rt_3yr['3yrs_treynor_ratio_fund']=rt_3yr['3yrs_treynor_ratio_fund'].astype(float)

In [None]:
## impute nulls ##

cols=['3yrs_treynor_ratio_fund','3_years_alpha_fund','3years_category_std','3yrs_sharpe_ratio_fund','3yrs_treynor_ratio_category',\
     '3_years_return_mean_annual_fund','fund_beta_3years','3years_fund_r_squared','3years_fund_std','category_beta_3years',\
     'fund_return_3years','3_years_alpha_category','3_years_return_mean_annual_category','3yrs_sharpe_ratio_category',\
     '3years_category_r_squared','3_years_return_category']

from sklearn.impute import SimpleImputer
mp = SimpleImputer(missing_values=np.nan, strategy='median')
mp.fit(rt_3yr[cols])

In [None]:
rt_3yr_imputed=pd.DataFrame(mp.fit_transform(rt_3yr[cols]),columns=cols)

In [None]:
rt_3yr

In [None]:
rt_3yr_imputed

In [None]:
## drop the columns
rt_3yr.drop(cols,axis=1,inplace=True)

In [None]:
rt_3yr_imputed=pd.concat([rt_3yr,rt_3yr_imputed],axis=1)

In [None]:
rt_3yr_imputed.head(10)

In [None]:
display(rt_3yr_imputed.isna().sum())
display(rt_3yr_imputed.info())

## Data Frame :: rt_5yr

In [None]:
## Analyze data one by one ##
rt_5yr.info()

In [None]:
rt_5yr.head()

In [None]:
## convert 5yrs_treynor_ratio_fund to float ##
rt_5yr['5yrs_treynor_ratio_fund']=rt_5yr['5yrs_treynor_ratio_fund'].str.replace(",","")
rt_5yr['5yrs_treynor_ratio_fund']=rt_5yr['5yrs_treynor_ratio_fund'].astype(float)

In [None]:
rt_5yr.columns

In [None]:
## impute nulls ##

cols=['category_r_squared_5years', '5yrs_sharpe_ratio_fund',
       '5_years_alpha_fund', '5years_fund_r_squared', '5years_fund_std',
       '5yrs_sharpe_ratio_category', '5_years_beta_fund',
       '5yrs_treynor_ratio_fund', '5_years_return_mean_annual_fund',
       '5_years_return_mean_annual_category', '5yrs_treynor_ratio_category',
       '5_years_return_fund', '5_years_alpha_category',
       '5_years_beta_category', '5years_category_std',
       '5_years_return_category']

from sklearn.impute import SimpleImputer
mp = SimpleImputer(missing_values=np.nan, strategy='median')
mp.fit(rt_5yr[cols])

In [None]:
rt_5yr_imputed=pd.DataFrame(mp.fit_transform(rt_5yr[cols]),columns=cols)

In [None]:
## drop the columns
rt_5yr.drop(cols,axis=1,inplace=True)

In [None]:
rt_5yr_imputed=pd.concat([rt_5yr,rt_5yr_imputed],axis=1)

In [None]:
display(rt_5yr.head(10))
display(rt_5yr_imputed.head(10))
display(rt_5yr_imputed.isna().sum())
display(rt_5yr_imputed.info())

## DataFrame :: rt_10yr

In [None]:
## Analyze data one by one ##
rt_10yr.info()

In [None]:
rt_10yr.head(10)

In [None]:
## convert 10yrs_treynor_ratio_fund to float ##
rt_10yr['10yrs_treynor_ratio_fund']=rt_10yr['10yrs_treynor_ratio_fund'].str.replace(",","")
rt_10yr['10yrs_treynor_ratio_fund']=rt_10yr['10yrs_treynor_ratio_fund'].astype(float)

In [None]:
rt_10yr.columns

In [None]:
## impute nulls ##

cols=['10years_category_r_squared', '10yrs_sharpe_ratio_fund',
       '10_years_alpha_fund', '10years_fund_r_squared', '10years_fund_std',
       '10yrs_sharpe_ratio_category', '10_years_beta_fund',
       '10yrs_treynor_ratio_fund',
       '10_years_return_mean_annual_category', '10yrs_treynor_ratio_category',
       '10_years_return_fund', '10_years_alpha_category',
       '10_years_beta_category', '10years_category_std',
       '10_years_return_mean_annual_fund', '10_years_return_category']

from sklearn.impute import SimpleImputer
mp = SimpleImputer(missing_values=np.nan, strategy='median')
mp.fit(rt_10yr[cols])

In [None]:
rt_10yr_imputed=pd.DataFrame(mp.fit_transform(rt_10yr[cols]),columns=cols)

In [None]:
## drop the columns
rt_10yr.drop(cols,axis=1,inplace=True)

In [None]:
rt_10yr_imputed=pd.concat([rt_10yr,rt_10yr_imputed],axis=1)

In [None]:
display(rt_10yr.head(10))
display(rt_10yr_imputed.head(10))
display(rt_10yr_imputed.isna().sum())
display(rt_10yr_imputed.info())

## DataFrame :: fund_allo

In [None]:
fund_allo.info()

In [None]:
fund_allo.head()

In [None]:
# Show rows that contain at least one null. Indexing with the boolean DataFrame
# (fund_allo[fund_allo.isna()]) would only mask values and return an all-NaN frame.
fund_allo[fund_allo.isna().any(axis=1)].head(20)

In [None]:
fund_specs['tag'].nunique()

In [None]:
## No Need to impute as ID's are also null ##

## DataFrame :: fund_conf

In [None]:
fund_conf.info()

In [None]:
fund_conf.head()

In [None]:
## No need to impute ##

## DataFrame :: bond_ratings

In [None]:
bond_ratings.info(verbose=True,null_counts=True)
#bond_ratings.info(verbose=True)

In [None]:
bond_ratings.head()

In [None]:
bond_ratings['maturity_bond'].unique()

In [None]:
bond_ratings[bond_ratings['bb_rating'].isna()]

In [None]:
## Drop the records with Tag available and all other records as null ##
bond_ratings.index=bond_ratings['tag']

In [None]:
bond_ratings.drop('tag',axis=1,inplace=True)

In [None]:
bond_ratings.dropna(axis=0,inplace=True,how="all")

In [None]:
bond_ratings.shape

In [None]:
bond_ratings.isna().sum()

In [None]:
## check the distribution of maturity_bond & duration_bond
sns.scatterplot(x='maturity_bond',y='duration_bond',data=bond_ratings)

In [None]:
## Using KNN imputer ##
from sklearn.impute import KNNImputer

In [None]:
bond_ratings.columns

In [None]:
cols=['bb_rating', 'us_govt_bond_rating', 'below_b_rating', 'others_rating',
       'maturity_bond', 'b_rating', 'a_rating', 'aaa_rating', 'aa_rating',
       'bbb_rating', 'duration_bond']
knnimputer = KNNImputer(n_neighbors=2, weights="uniform")
knnimputer.fit(bond_ratings[cols])

In [None]:
bond_ratings_imputed=pd.DataFrame(knnimputer.fit_transform(bond_ratings[cols]),columns=cols)

In [None]:
bond_ratings_imputed

In [None]:
bond_ratings['tag']=bond_ratings.index

In [None]:
bond_ratings.reset_index(inplace=True,drop=True)

In [None]:
bond_ratings

In [None]:
## drop the columns
bond_ratings.drop(cols,axis=1,inplace=True)

In [None]:
bond_ratings_imputed=pd.concat([bond_ratings,bond_ratings_imputed],axis=1)

In [None]:
display(bond_ratings_imputed.head(10))
display(bond_ratings_imputed.isna().sum())

## DataFrame :: fund_ratios

In [None]:
fund_ratios.info()

In [None]:
fund_ratios['tag'].nunique()

In [None]:
fund_ratios.head(100)

In [None]:
fund_ratios['mmc']=fund_ratios['mmc'].str.replace(',','')

In [None]:
fund_ratios['mmc']=fund_ratios['mmc'].astype(float)

In [None]:
fund_ratios['ps_ratio']=fund_ratios['ps_ratio'].str.replace(',','')
fund_ratios['ps_ratio']=fund_ratios['ps_ratio'].astype(float)
fund_ratios['pc_ratio']=fund_ratios['pc_ratio'].str.replace(',','')
fund_ratios['pc_ratio']=fund_ratios['pc_ratio'].astype(float)
fund_ratios['pe_ratio']=fund_ratios['pe_ratio'].str.replace(',','')
fund_ratios['pe_ratio']=fund_ratios['pe_ratio'].astype(float)

In [None]:
fund_ratios[fund_ratios['pb_ratio'].isna()]

## DataFrame :: fund_specs

In [None]:
fund_specs.info()

In [None]:
fund_specs[fund_specs['tag']==202562].head()

In [None]:
fund_specs['fund_size'].unique()

In [None]:
## List of categorical variable ##
var=['investment_class','fund_size']
encoder = OrdinalEncoder()
imputer = KNNImputer()

def encode(data):
    '''Ordinal-encode the non-null entries of a Series, in place.

    Null entries are left untouched so they can be imputed afterwards.

    Parameters
    ----------
    data : pandas.Series
        Column to encode. It is modified in place and also returned.

    Returns
    -------
    pandas.Series
        The same Series object, with every non-null value replaced by the
        code produced by the module-level ``encoder``.
    '''
    # Retain only the non-null values; nulls must not reach the encoder.
    nonulls = np.array(data.dropna())
    # Nothing to encode (empty or all-null column): fitting the encoder on an
    # empty array would raise, so hand the Series back unchanged.
    if nonulls.size == 0:
        return data
    # The encoder expects a 2-D (n_samples, 1) array.
    impute_reshape = nonulls.reshape(-1,1)
    # Encode data (the encoder is refit on every call, i.e. per column).
    impute_ordinal = encoder.fit_transform(impute_reshape)
    # Assign the codes back to the non-null positions; ravel() always yields a
    # 1-D array, even when only a single value was encoded.
    data.loc[data.notnull()] = impute_ordinal.ravel()
    return data

# Encode each categorical column and write the result back explicitly.
# Relying on encode() mutating the Series returned by fund_specs[columns] is
# chained assignment, which pandas does not guarantee to propagate to the
# parent DataFrame.
for columns in var:
    fund_specs[columns] = encode(fund_specs[columns])

In [None]:
fund_specs

In [None]:
cols=['investment_class','total_assets','yield','fund_size','return_ytd']
# KNN-impute all five columns together so the numeric columns help locate
# neighbours for the encoded categoricals.
encode_data = pd.DataFrame(imputer.fit_transform(fund_specs[cols]),columns = cols)
# Round ONLY the ordinal-encoded categoricals: an imputed value is an average of
# neighbour codes and must be snapped back to a valid code. The remaining columns
# are continuous and keep their full precision.
categorical_cols=['investment_class','fund_size']
encode_data[categorical_cols] = encode_data[categorical_cols].round()

In [None]:
encode_data

In [None]:
## drop columns from main table 
fund_specs.drop(['investment_class','total_assets','yield','fund_size','return_ytd'],axis=1,inplace=True)

In [None]:
fund_specs_imputed=pd.concat([fund_specs,encode_data],axis=1)

In [None]:
fund_specs_imputed

In [None]:
fund_specs_imputed.isna().sum()

## DataFrame :: other_specs

In [None]:
other_specs.info()

In [None]:
## convert object to float ##
other_specs['pc_ratio']=other_specs['pc_ratio'].str.replace(',','')
other_specs['pe_ratio']=other_specs['pe_ratio'].str.replace(',','')
other_specs['mmc']=other_specs['mmc'].str.replace(',','')
other_specs['ps_ratio']=other_specs['ps_ratio'].str.replace(',','')

In [None]:
other_specs['pc_ratio']=other_specs['pc_ratio'].astype(float)
other_specs['pe_ratio']=other_specs['pe_ratio'].astype(float)
other_specs['mmc']=other_specs['mmc'].astype(float)
other_specs['ps_ratio']=other_specs['ps_ratio'].astype(float)

In [None]:
other_specs['tag'].nunique()

In [None]:
other_specs.head()

In [None]:
## categorical variables 
## Assumptions : years_down: number of years the mutual fund was down
other_specs['years_down'].unique()

In [None]:
other_specs['years_down'].isna().sum()

In [None]:
## Assumptions : years_up: number of years the mutual fund was up
other_specs['years_up'].unique()

In [None]:
other_specs['years_up'].isna().sum()

In [None]:
other_specs['portfolio_convertable'].unique()

In [None]:
### Replace all NaN in years_down and years_up with 0
other_specs['years_up']=other_specs['years_up'].replace(np.nan,0)

In [None]:
other_specs['years_down']=other_specs['years_down'].replace(np.nan,0)

In [None]:
other_specs.columns

In [None]:
## Impute all the other columns ##
cols=['2014_category_return', '2012_return_category',
       '2018_return_category', 'category_return_1year',
       'cash_percent_of_portfolio', 'pc_ratio', '2011_return_category',
       'ytd_return_fund', '2014_return_fund',
       'category_return_1month', '2013_return_fund', 'fund_return_3months',
       'ytd_return_category', 'pb_ratio', '2017_category_return',
       '1_year_return_fund', 'pe_ratio', '2015_return_fund',
       'portfolio_convertable', '3_months_return_category', 'portfolio_others',
       '2016_return_fund', 'mmc', 'stock_percent_of_portfolio',
       '2016_return_category', 'ps_ratio', '2011_return_fund',
       '2010_return_fund', 'fund_return_3years', '2012_fund_return',
       '2018_return_fund', '2017_return_fund',
        'category_return_2015',
       '1_month_fund_return', 'bond_percentage_of_porfolio',
       'portfolio_preferred', '2010_return_category', '2013_category_return']
imputer = KNNImputer()
# Impute the missing values. The result is deliberately NOT rounded: the columns
# in `cols` appear to be returns, ratios and portfolio percentages (not category
# codes), so rounding to whole numbers would discard most of their information.
other_specs_imputed = pd.DataFrame(imputer.fit_transform(other_specs[cols]),columns = cols)

In [None]:
other_specs_imputed

In [None]:
## drop columns from main table 
other_specs.drop(cols,axis=1,inplace=True)

In [None]:
other_specs

In [None]:
other_specs_imputed=pd.concat([other_specs,other_specs_imputed],axis=1)

In [None]:
other_specs_imputed.info()

## Merging of the data

In [None]:
## Final List of tables after imputation ##
#1)rt_3yr_imputed
#2)rt_5yr_imputed
#3)rt_10yr_imputed
#4)fund_allo
#5)fund_conf
#6)bond_ratings_imputed
#7)fund_ratios
#8)fund_specs_imputed
#9)other_specs_imputed

In [None]:
display(other_specs_imputed.columns)
display(fund_specs_imputed.columns)

In [None]:
## Merging other_specs with funds_specs
## Join fund_specs and other_specs on tag ##
master_df = fund_specs_imputed.merge(other_specs_imputed,left_on='tag', right_on='tag', how='left',suffixes=['_x','_y'])
pd.set_option('display.max_columns',1000)
display(master_df)

In [None]:
## Drop the common column ##
master_df.drop('greatstone_rating_y',axis=1,inplace=True)

In [None]:
master_df.info()

In [None]:
# Rename column
master_df.rename(columns={'greatstone_rating_x':'greatstone_rating'},inplace=True)

In [None]:
fund_ratios.columns

In [None]:
## Join master_df and fund_ratios on tag ##
master_df = master_df.merge(fund_ratios,left_on='tag', right_on='tag', how='left',suffixes=['_x','_y'])
pd.set_option('display.max_columns',1000)
display(master_df)
display(master_df.shape)

In [None]:
## drop duplicate columns 
master_df.drop(['pc_ratio_y','pb_ratio_y','pe_ratio_y','ps_ratio_y','mmc_y'],axis=1,inplace=True)

In [None]:
master_df.info()

In [None]:
# rename the columns #
master_df.rename(columns={'pc_ratio_x':'pc_ratio','pb_ratio_x':'pb_ratio','pe_ratio_x':'pe_ratio','ps_ratio_x':'ps_ratio'},\
                inplace=True)

In [None]:
fund_allo.columns

In [None]:
## Join master_df and fund_allot_df on tag ##
master_df = master_df.merge(fund_allo,left_on='tag', right_on='id', how='left',suffixes=['_x','_y'])
pd.set_option('display.max_columns',1000)
display(master_df)
display(master_df.shape)

In [None]:
## Drop ID column 
master_df.drop('id',axis=1,inplace=True)

In [None]:
master_df.info()

In [None]:
## rename mmc_x to mmc
master_df.rename(columns={'mmc_x':'mmc'},inplace=True)

In [None]:
fund_conf.columns

In [None]:
## Join master_df and fund_config_df on fund_id ##
master_df = master_df.merge(fund_conf,left_on='fund_id', right_on='fund_id', how='left',suffixes=['_x','_y'])
pd.set_option('display.max_columns',1000)
display(master_df)
display(master_df.shape)

In [None]:
master_df.info()

In [None]:
bond_ratings_imputed.columns

In [None]:
## Join master_df and bond_rating_df on tag ##
master_df = master_df.merge(bond_ratings_imputed,left_on='tag', right_on='tag', how='left',suffixes=['_x','_y'])
pd.set_option('display.max_columns',1000)
display(master_df)
display(master_df.shape)

In [None]:
rt_3yr_imputed.columns

In [None]:
## Join master_df and return_3_df on tag ##
master_df = master_df.merge(rt_3yr_imputed,left_on='tag', right_on='tag', how='left',suffixes=['_x','_y'])
pd.set_option('display.max_columns',1000)
display(master_df)
display(master_df.shape)

In [None]:
master_df.info()

In [None]:
rt_5yr_imputed.columns

In [None]:
## Join master_df and return_5_df on tag ##
master_df = master_df.merge(rt_5yr_imputed,left_on='tag', right_on='tag', how='left',suffixes=['_x','_y'])
pd.set_option('display.max_columns',1000)
display(master_df)
display(master_df.shape)

In [None]:
master_df.info(verbose=True)

In [None]:
rt_10yr_imputed.columns

In [None]:
## Join master_df and return_10_df on fund_id ##
master_df = master_df.merge(rt_10yr_imputed,left_on='fund_id', right_on='fund_id', how='left',suffixes=['_x','_y'])
pd.set_option('display.max_columns',1000)
display(master_df)
display(master_df.shape)

In [None]:
list(master_df)

## Check on the columns 

In [None]:
master_df.isna().sum()

In [None]:
master_df[master_df['portfolio_financial_services'].isna()]

In [None]:
### Columns : 'portfolio_communication_allocation',
 #'portfolio_financial_services',
 #'portfolio_industrials_allocation',
 #'portfolio_tech_allocation',
 #'portfolio_materials_basic_allocation',
 #'portfolio_energy_allocation',
 #'portfolio_consumer_defence_allocation',
 #'portfolio_healthcare_allocation',
 #'portfolio_property_allocation',
 #'portfolio_utils_allocation',
 #'portfolio_cyclical_consumer_allocation'
 #'portfolio_communication_allocation'
#contain nulls because their master file has rows with a null tag

In [None]:
master_df['portfolio_financial_services'].isna().sum()

In [None]:
## Replacing the na's in the above columns with 0 
# Fill all sector-allocation columns in one assignment. Calling
# master_df[col].replace(..., inplace=True) acts on a column selection, which
# pandas does not guarantee to write back to master_df.
alloc_cols=['portfolio_financial_services',
            'portfolio_industrials_allocation',
            'portfolio_tech_allocation',
            'portfolio_materials_basic_allocation',
            'portfolio_energy_allocation',
            'portfolio_consumer_defence_allocation',
            'portfolio_healthcare_allocation',
            'portfolio_property_allocation',
            'portfolio_utils_allocation',
            'portfolio_cyclical_consumer_allocation',
            'portfolio_communication_allocation']
master_df[alloc_cols]=master_df[alloc_cols].fillna(0)

In [None]:
master_df.to_csv('Master_hack.csv')

In [None]:
master_df.head()

In [None]:
## Rename the column with investmentclass and fundsize
display(master_df['investment_class'].unique())
#0:Blend
#1:Growth
#2:Value

display(master_df['fund_size'].unique())
#0:Large
#1:Medium
#2:Small

display(master_df['category'].nunique())
display(master_df['parent_company'].nunique())
display(master_df['fund_name'].nunique())

In [None]:
master_df.info()

In [None]:
master_df['investment_class']=master_df['investment_class'].astype('object')
master_df['fund_size']=master_df['fund_size'].astype('object')

In [None]:
cols=['investment_class','fund_size']

In [None]:
dummies=pd.get_dummies(master_df[cols],prefix_sep='_')

In [None]:
## append the dummies column to the master_df
master_df=pd.concat([master_df,dummies],axis=1)

In [None]:
master_df.info(verbose=True)

In [None]:
## dropping 'investment_class','fund_size'
master_df.drop(['investment_class','fund_size'],axis=1,inplace=True)

In [None]:
## Import submission data and separating out the submission data from master_Df
sample_sub.head()

In [None]:
master_df_train_test=master_df[master_df['greatstone_rating'].notna()]

In [None]:
master_df_train_test.shape

In [None]:
master_df_train_test['greatstone_rating'].isna().sum()

In [None]:
master_df_sub=master_df[master_df['greatstone_rating'].isna()]

In [None]:
master_df_sub.shape

In [None]:
## Validating the submission file with master_df_sub
sample_sub=sample_sub.merge(master_df_sub,left_on='fund_id',right_on='fund_id',how='left')

In [None]:
sample_sub.shape

In [None]:
sample_sub.head()

In [None]:
## Dropping duplicate column#
sample_sub.drop('greatstone_rating_y',axis=1,inplace=True)

In [None]:
sample_sub.to_csv('sample_sub.csv')
master_df_train_test.to_csv('master_df_train_test.csv')

In [None]:
os.getcwd()

In [None]:
os.listdir()

## Data Exploration 

In [None]:
master_df_train_test=pd.read_csv('master_df_train_test.csv')

In [None]:
master_df_train_test.drop('Unnamed: 0',axis=1,inplace=True)

In [None]:
master_df_train_test.info(verbose=True)

In [None]:
master_df_train_test['greatstone_rating']=master_df_train_test['greatstone_rating'].astype('int')

In [None]:
list(master_df_train_test.columns)

## Build a benchmark model :with default hyperparameters Random forest 

In [None]:
master_df_train_test.head(10)

In [None]:
cols=['yield',
 'return_ytd',
 'years_up',
 'years_down',
 '2014_category_return',
 '2012_return_category',
 '2018_return_category',
 'category_return_1year',
 'cash_percent_of_portfolio',
 'pc_ratio',
 '2011_return_category',
 'ytd_return_fund',
 '2014_return_fund',
 'category_return_1month',
 '2013_return_fund',
 'fund_return_3months',
 'ytd_return_category',
 'pb_ratio',
 '2017_category_return',
 '1_year_return_fund',
 'pe_ratio',
 '2015_return_fund',
 'portfolio_convertable',
 '3_months_return_category',
 'portfolio_others',
 '2016_return_fund',
 'mmc',
 'stock_percent_of_portfolio',
 '2016_return_category',
 'ps_ratio',
 '2011_return_fund',
 '2010_return_fund',
 'fund_return_3years_x',
 '2012_fund_return',
 '2018_return_fund',
 '2017_return_fund',
 'category_return_2015',
 '1_month_fund_return',
 'bond_percentage_of_porfolio',
 'portfolio_preferred',
 '2010_return_category',
 '2013_category_return',
 'fund_ratio_net_annual_expense',
 'portfolio_communication_allocation',
 'portfolio_financial_services',
 'portfolio_industrials_allocation',
 'portfolio_tech_allocation',
 'portfolio_materials_basic_allocation',
 'portfolio_energy_allocation',
 'portfolio_consumer_defence_allocation',
 'portfolio_healthcare_allocation',
 'portfolio_property_allocation',
 'portfolio_utils_allocation',
 'portfolio_cyclical_consumer_allocation',
 'bb_rating',
 'us_govt_bond_rating',
 'below_b_rating',
 'others_rating',
 'maturity_bond',
 'b_rating',
 'a_rating',
 'aaa_rating',
 'aa_rating',
 'bbb_rating',
 'duration_bond',
 '3yrs_treynor_ratio_fund',
 '3_years_alpha_fund',
 '3years_category_std',
 '3yrs_sharpe_ratio_fund',
 '3yrs_treynor_ratio_category',
 '3_years_return_mean_annual_fund',
 'fund_beta_3years',
 '3years_fund_r_squared',
 '3years_fund_std',
 'category_beta_3years',
 'fund_return_3years_y',
 '3_years_alpha_category',
 '3_years_return_mean_annual_category',
 '3yrs_sharpe_ratio_category',
 '3years_category_r_squared',
 '3_years_return_category',
 'category_r_squared_5years',
 '5yrs_sharpe_ratio_fund',
 '5_years_alpha_fund',
 '5years_fund_r_squared',
 '5years_fund_std',
 '5yrs_sharpe_ratio_category',
 '5_years_beta_fund',
 '5yrs_treynor_ratio_fund',
 '5_years_return_mean_annual_fund',
 '5_years_return_mean_annual_category',
 '5yrs_treynor_ratio_category',
 '5_years_return_fund',
 '5_years_alpha_category',
 '5_years_beta_category',
 '5years_category_std',
 '5_years_return_category',
 '10years_category_r_squared',
 '10yrs_sharpe_ratio_fund',
 '10_years_alpha_fund',
 '10years_fund_r_squared',
 '10years_fund_std',
 '10yrs_sharpe_ratio_category',
 '10_years_beta_fund',
 '10yrs_treynor_ratio_fund',
 '10_years_return_mean_annual_category',
 '10yrs_treynor_ratio_category',
 '10_years_return_fund',
 '10_years_alpha_category',
 '10_years_beta_category',
 '10years_category_std',
 '10_years_return_mean_annual_fund',
 '10_years_return_category',
 'investment_class_0.0',
 'investment_class_1.0',
 'investment_class_2.0',
 'fund_size_0.0',
 'fund_size_1.0',
 'fund_size_2.0',
 'total_assets',
 'category_ratio_net_annual_expense']

In [None]:
master_df_train_test.info(verbose=True)

In [None]:
master_df_train_test['greatstone_rating'].unique()

In [None]:
X=master_df_train_test[cols]
y=master_df_train_test['greatstone_rating']

In [None]:
display(X.shape)
display(y.shape)

In [None]:
pd.set_option('display.max_columns',150)
X.head(2)

In [None]:
## Scale the whole data ##
scaler = StandardScaler()
X= scaler.fit_transform(X)

In [None]:
## split into train and test 
# Split the RAW features rather than X: X has already been standardized using
# every row, so splitting it would leak test-set statistics into the hold-out
# evaluation and then standardize the data a second time. The same random_state
# and row count give the same row partition as splitting X.
X_train_raw,X_test_raw,y_train,y_test=train_test_split(master_df_train_test[cols],y,test_size=0.2,random_state=123)
# Feature Scaling: fit on the training rows only, then apply to the test rows.
# A separate name keeps the full-data `scaler` from being overwritten.
holdout_scaler = StandardScaler()
X_train = holdout_scaler.fit_transform(X_train_raw)
X_test = holdout_scaler.transform(X_test_raw)

In [None]:
rf=RandomForestClassifier(n_estimators=100)

In [None]:
display(X_train.shape)
display(X_test.shape)
display(y_train.shape)
display(y_test.shape)

In [None]:
rf.fit(X_train,y_train)

In [None]:
# Report hold-out accuracy while `rf` has only seen X_train. After the refit
# below, the X_test rows are part of the training data, so any score computed
# on X_test from here on is optimistic.
print(rf.score(X_test,y_test))
# Refit on every labelled row; this is the model used for the submission.
rf.fit(X,y)

In [None]:
feat_imp=rf.feature_importances_
fi=pd.DataFrame({'Feature':cols,'imp':feat_imp})

In [None]:
fi

In [None]:
fi.sort_values(by=['imp'],ascending=False)

In [None]:
## predict on Y_test
pred=rf.predict(X_test)

In [None]:
## Predict on sample_sub
# rf was trained on standardized features (X = scaler.fit_transform(X)), so the
# submission rows need the same standardization before predict(). Fitting a
# StandardScaler on the raw training features reproduces that transform.
sub_scaler=StandardScaler().fit(master_df_train_test[cols])
pred_rf=rf.predict(sub_scaler.transform(sample_sub[cols]))

In [None]:
pred

In [None]:
pred_rf

In [None]:
print(rf.score(X_test,y_test))

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

## predict on submission data 

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')

In [None]:
sample_sub.columns

In [None]:
sample_sub.drop(['Unnamed: 0'],axis=1,inplace=True)

In [None]:
# rf was trained on standardized features, so standardize the submission rows
# with a scaler fit on the raw training features before predicting.
sub_scaler=StandardScaler().fit(master_df_train_test[cols])
pred_rf=rf.predict(sub_scaler.transform(sample_sub[cols]))

In [None]:
pred_rf

In [None]:
##Random forest
#sample_sub['greatstone_rating']=pred_rf
##Extra tree classifier
#sample_sub['greatstone_rating']=pred_etc
## gbc
#sample_sub['greatstone_rating']=pred_gbc
## gbc with tuning 
#sample_sub['greatstone_rating']=pred_gbc_t
## random forest w/o tuning 
sample_sub['greatstone_rating']=pred_rf

In [None]:
sample_sub.head()

In [None]:
##iter 1
#sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v1.csv',index=False)
##iter 2
#sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v2.csv',index=False)
##iter 3
#sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v3.csv',index=False)
##iter 4
#sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v4.csv',index=False)
##iter 5
#sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v5.csv',index=False)
##iter 8
#sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v8.csv',index=False)
##iter 9
#sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v9.csv',index=False)
##iter 10
#sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v10.csv',index=False)
##iter 11
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v11.csv',index=False)

## Random Forest with tuning 

In [None]:
params={'n_estimators':[100,200,300,400],
        'max_depth':[4,6,8,10],
        'min_samples_split':[0.1,0.2,0.3,0.4,0.5],
        'min_samples_leaf':[0.1,0.2,0.3,0.4,0.5],
        'max_features':['auto','sqrt','log2']}
#score=make_scorer(score_func=average_precision_score)

rf=RandomForestClassifier(oob_score=True,n_jobs=-1,random_state=123)
rf_t=GridSearchCV(rf,param_grid=params,cv=5,n_jobs=-1,verbose=1)

In [None]:
rf_t.fit(X,y)

In [None]:
rf_t.best_params_

In [None]:
rf_t_best=RandomForestClassifier(n_estimators=400,max_depth=4,min_samples_split=0.1,
                                 min_samples_leaf=0.1,max_features='auto',
                                 oob_score=True,n_jobs=-1,random_state=123)
rf_t_best.fit(X,y)

In [None]:
feat_imp=rf_t_best.feature_importances_
fi=pd.DataFrame({'Feature':cols,'imp':feat_imp})
display(fi.sort_values(by='imp',ascending=False).head(150))

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')

In [None]:
# rf_t_best was trained on standardized features, so standardize the submission
# rows with a scaler fit on the raw training features before predicting.
sub_scaler=StandardScaler().fit(master_df_train_test[cols])
rf_t_pred=rf_t_best.predict(sub_scaler.transform(sample_sub[cols]))

In [None]:
rf_t_pred

In [None]:
sample_sub.drop(['greatstone_rating_x','Unnamed: 0'],axis=1,inplace=True)
sample_sub['greatstone_rating']=rf_t_pred

In [None]:
sample_sub.head()

In [None]:
##iter 7
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v7.csv',index=False)

## ExtraTreeClassifier 

In [None]:
etc=ExtraTreesClassifier(n_estimators=100,random_state=123)

In [None]:
etc.fit(X,y)

In [None]:
# etc was trained on standardized features, so standardize the submission rows
# with a scaler fit on the raw training features before predicting.
sub_scaler=StandardScaler().fit(master_df_train_test[cols])
pred_etc=etc.predict(sub_scaler.transform(sample_sub[cols]))

In [None]:
pred_etc

## Gradient Boosting Classifier

### Hyperparameter tuning 

In [None]:
params={'n_estimators':[100,200,500,1000],
        'learning_rate':[0.05,0.1,0.2,0.3,0.4,0.5],
        'min_samples_split':[0.2,0.4,0.6,0.8],
        'min_samples_leaf':[0.2,0.4,0.6,0.8],
        'max_depth':[5,10,15,20],
        'max_features':['auto','log2','sqrt']}
gbc=GradientBoostingClassifier(n_iter_no_change=10,validation_fraction=0.2,random_state=123)

In [None]:
rs=RandomizedSearchCV(gbc,param_distributions=params,n_iter=100,cv=5,n_jobs=-1,verbose=1,random_state=123)
rs.fit(X,y)

In [None]:
rs.best_params_

In [None]:
## fit with best params ##
rs_gbc=GradientBoostingClassifier(n_estimators=500,min_samples_split=0.4,\
                                  min_samples_leaf=0.2,max_features='auto',\
                                  max_depth=20,learning_rate=0.5,\
                                  random_state=123)
rs_gbc.fit(X,y)

In [None]:
#### Fitted without hyperparameter tuning ####
gbc.fit(X,y)

In [None]:
#pred_gbc=gbc.predict(sample_sub[cols])
## tuning
# rs_gbc was trained on standardized features, so standardize the submission
# rows with a scaler fit on the raw training features before predicting.
sub_scaler=StandardScaler().fit(master_df_train_test[cols])
pred_gbc_t=rs_gbc.predict(sub_scaler.transform(sample_sub[cols]))

In [None]:
pred_gbc_t

## OnevsrestClassifier

In [None]:
#rf=RandomForestClassifier(n_estimators=100)
lgbm_t=LGBMClassifier(boosting_type='dart',learning_rate=0.05,n_estimators=2500
                      ,reg_lambda=0.2,min_child_samples=30,
                      max_depth=-1,random_state=1111)
ovr=OneVsRestClassifier(estimator=lgbm_t,n_jobs=-1)
ovr.fit(X,y)

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')
# ovr was trained on standardized features, so standardize the submission rows
# with a scaler fit on the raw training features before predicting.
sub_scaler=StandardScaler().fit(master_df_train_test[cols])
ovr_pred=ovr.predict(sub_scaler.transform(sample_sub[cols]))

In [None]:
ovr_pred

In [None]:
sample_sub.drop(['Unnamed: 0'],axis=1,inplace=True)
sample_sub['greatstone_rating']=ovr_pred

In [None]:
sample_sub.head()

In [None]:
##iter 6
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v6.csv',index=False)

## LinearSVC

In [None]:
lsvc=LinearSVC(penalty='l2',multi_class='ovr',fit_intercept=False,random_state=123)

lsvc.fit(X,y)

In [None]:
lsvc.coef_

In [None]:
# lsvc was trained on standardized features, so standardize the submission rows
# with a scaler fit on the raw training features before predicting.
sub_scaler=StandardScaler().fit(master_df_train_test[cols])
lsvc_pred=lsvc.predict(sub_scaler.transform(sample_sub[cols]))

In [None]:
lsvc_pred

## RidgeClassifier

In [None]:
rc=RidgeClassifier()
rc.fit(X,y)

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')
# rc was trained on standardized features, so standardize the submission rows
# with a scaler fit on the raw training features before predicting.
sub_scaler=StandardScaler().fit(master_df_train_test[cols])
rc_pred=rc.predict(sub_scaler.transform(sample_sub[cols]))

In [None]:
rc_pred

In [None]:
sample_sub.drop(['greatstone_rating_x','Unnamed: 0'],axis=1,inplace=True)
sample_sub['greatstone_rating']=rc_pred

In [None]:
sample_sub.head()

In [None]:
##iter 8
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v8.csv',index=False)

In [None]:
### Transforming the variables ##
pd.set_option('display.max_columns',150)
master_df_train_test.head()

In [None]:
master_df_train_test.shape[0]

In [None]:
# Number of rows whose 2018 category return is strictly positive. The original
# indexed with the loop variable `i`, which is not defined until the loop in a
# later cell runs.
(master_df_train_test['2018_return_category']>0).sum()

In [None]:
## get all the columns with value>0
pos_cols=[]
for i in cols:
    if master_df_train_test[master_df_train_test[i]>0][i].count()==master_df_train_test.shape[0]:
        pos_cols.append(i)

In [None]:
display(len(cols))
display(len(pos_cols))##5

In [None]:
pos_cols

In [None]:
display(sns.distplot(master_df_train_test['category_ratio_net_annual_expense'],kde=True))
display(sns.distplot(sample_sub['category_ratio_net_annual_expense'],kde=True))

In [None]:
from scipy.stats import boxcox

In [None]:
## transform ##
# Box-Cox transform of the two features, with the lambda estimated on the master frame.
# boxcox() returns (transformed_values, fitted_lambda) when no lambda is supplied.
total_assets_t, total_assets_lmbda = boxcox(master_df_train_test['total_assets'])
category_ratio_net_annual_expense_t, category_ratio_lmbda = boxcox(
    master_df_train_test['category_ratio_net_annual_expense'])

# Apply the SAME lambdas to the scoring frame. Estimating a separate lambda on
# sample_sub would map identical raw values to different transformed values
# than the ones the models are trained on.
total_assets_sam_t = boxcox(sample_sub['total_assets'], lmbda=total_assets_lmbda)
category_ratio_net_annual_expense_sam_t = boxcox(
    sample_sub['category_ratio_net_annual_expense'], lmbda=category_ratio_lmbda)

In [None]:
total_assets_t[0]

In [None]:
sns.distplot(category_ratio_net_annual_expense_t,kde=True)

In [None]:
#master_df_train_test.shape
sample_sub=pd.read_csv('sample_sub.csv')
sample_sub.drop('Unnamed: 0',axis=1,inplace=True)

In [None]:
sample_sub.shape

In [None]:
## appending the tranformed variables ##
master_df_train_test.insert(loc=129,column='total_assets_t',value=total_assets_t)
master_df_train_test.insert(loc=130,column='category_ratio_net_annual_expense_t',value=category_ratio_net_annual_expense_t)
sample_sub.insert(loc=128,column='total_assets_t',value=total_assets_sam_t)
sample_sub.insert(loc=129,column='category_ratio_net_annual_expense_t',value=category_ratio_net_annual_expense_sam_t)

In [None]:
sample_sub.head()

In [None]:
sample_sub.to_csv('sample_sub.csv')

## Decision tree classifier

In [None]:
dt=DecisionTreeClassifier(random_state=123)
dt.fit(X,y)

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')
dt_pred=dt.predict(sample_sub[cols])

In [None]:
dt_pred

In [None]:
sample_sub.drop(['Unnamed: 0'],axis=1,inplace=True)
sample_sub['greatstone_rating']=dt_pred

In [None]:
sample_sub.head()

In [None]:
##iter 12
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v12.csv',index=False)

## Xgboost

In [None]:
xgb=XGBClassifier()
xgb.fit(X,y)

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')
xgb_pred=xgb.predict(sample_sub[cols])

In [None]:
xgb_pred

In [None]:
sample_sub.drop(['Unnamed: 0'],axis=1,inplace=True)
sample_sub['greatstone_rating']=xgb_pred

In [None]:
sample_sub.head()

In [None]:
##iter 13
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v13.csv',index=False)

## xgboost with tuning 

In [None]:
column=['yield',
 'return_ytd',
 'years_up',
 'years_down',
 '2014_category_return',
 '2012_return_category',
 '2018_return_category',
 'category_return_1year',
 'cash_percent_of_portfolio',
 'pc_ratio',
 '2011_return_category',
 'ytd_return_fund',
 '2014_return_fund',
 'category_return_1month',
 '2013_return_fund',
 'fund_return_3months',
 'ytd_return_category',
 'pb_ratio',
 '2017_category_return',
 '1_year_return_fund',
 'pe_ratio',
 '2015_return_fund',
 'portfolio_convertable',
 '3_months_return_category',
 'portfolio_others',
 '2016_return_fund',
 'mmc',
 'stock_percent_of_portfolio',
 '2016_return_category',
 'ps_ratio',
 '2011_return_fund',
 '2010_return_fund',
 'fund_return_3years_x',
 '2012_fund_return',
 '2018_return_fund',
 '2017_return_fund',
 'category_return_2015',
 '1_month_fund_return',
 'bond_percentage_of_porfolio',
 'portfolio_preferred',
 '2010_return_category',
 '2013_category_return',
 'fund_ratio_net_annual_expense',
 'portfolio_communication_allocation',
 'portfolio_financial_services',
 'portfolio_industrials_allocation',
 'portfolio_tech_allocation',
 'portfolio_materials_basic_allocation',
 'portfolio_energy_allocation',
 'portfolio_consumer_defence_allocation',
 'portfolio_healthcare_allocation',
 'portfolio_property_allocation',
 'portfolio_utils_allocation',
 'portfolio_cyclical_consumer_allocation',
 'bb_rating',
 'us_govt_bond_rating',
 'below_b_rating',
 'others_rating',
 'maturity_bond',
 'b_rating',
 'a_rating',
 'aaa_rating',
 'aa_rating',
 'bbb_rating',
 'duration_bond',
 '3yrs_treynor_ratio_fund',
 '3_years_alpha_fund',
 '3years_category_std',
 '3yrs_sharpe_ratio_fund',
 '3yrs_treynor_ratio_category',
 '3_years_return_mean_annual_fund',
 'fund_beta_3years',
 '3years_fund_r_squared',
 '3years_fund_std',
 'category_beta_3years',
 'fund_return_3years_y',
 '3_years_alpha_category',
 '3_years_return_mean_annual_category',
 '3yrs_sharpe_ratio_category',
 '3years_category_r_squared',
 '3_years_return_category',
 'category_r_squared_5years',
 '5yrs_sharpe_ratio_fund',
 '5_years_alpha_fund',
 '5years_fund_r_squared',
 '5years_fund_std',
 '5yrs_sharpe_ratio_category',
 '5_years_beta_fund',
 '5yrs_treynor_ratio_fund',
 '5_years_return_mean_annual_fund',
 '5_years_return_mean_annual_category',
 '5yrs_treynor_ratio_category',
 '5_years_return_fund',
 '5_years_alpha_category',
 '5_years_beta_category',
 '5years_category_std',
 '5_years_return_category',
 '10years_category_r_squared',
 '10yrs_sharpe_ratio_fund',
 '10_years_alpha_fund',
 '10years_fund_r_squared',
 '10years_fund_std',
 '10yrs_sharpe_ratio_category',
 '10_years_beta_fund',
 '10yrs_treynor_ratio_fund',
 '10_years_return_mean_annual_category',
 '10yrs_treynor_ratio_category',
 '10_years_return_fund',
 '10_years_alpha_category',
 '10_years_beta_category',
 '10years_category_std',
 '10_years_return_mean_annual_fund',
 '10_years_return_category',
 'investment_class_0.0',
 'investment_class_1.0',
 'investment_class_2.0',
 'fund_size_0.0',
 'fund_size_1.0',
 'fund_size_2.0',
 'total_assets',
 'category_ratio_net_annual_expense','greatstone_rating']
train=master_df_train_test[column]

In [None]:
train.columns

In [None]:
# Hand-set XGBoost configuration: 1000 trees of depth 5 with row and column
# subsampling at 0.8, softmax objective over 6 classes.
xgb1_settings = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    nthread=4,
    seed=27,
    num_class=6,
)
xgb1 = XGBClassifier(**xgb1_settings)
xgb1.fit(X, y)

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')
xgb1_pred=xgb1.predict(sample_sub[cols])

In [None]:
xgb1_pred

In [None]:
sample_sub.drop(['Unnamed: 0'],axis=1,inplace=True)
sample_sub['greatstone_rating']=xgb1_pred

In [None]:
sample_sub.head()

In [None]:
##iter 14
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v14.csv',index=False)

In [None]:
### Tuning all params ##
params={'max_depth':[3,6,9,12],
        'gamma':[0,1],
        'subsample':[0.2,0.4,0.6,0.8],
        'colsample_bytree':[0.2,0.4,0.6,0.8],
        'tree_method':['auto','hist'],
        'min_child_weight':[0,1,2]
}
xgb2=XGBClassifier(learning_rate=0.1,n_estimators=1000,objective='multi:softmax',random_state=123,booster='gbtree',seed=1)
xgb2_t=RandomizedSearchCV(xgb2,param_distributions=params,n_iter=50,cv=5,n_jobs=-1,verbose=1,random_state=123)

In [None]:
xgb2_t.fit(X,y)

In [None]:
xgb2_t.best_params_

In [None]:
# Use the winner of the randomized search. Refitting a default-parameter
# XGBClassifier() here would throw the tuning result away and reproduce the
# untuned model from the plain "Xgboost" section.
# xgb2_t was built with the default refit=True, so best_estimator_ has already
# been refit on the full data passed to xgb2_t.fit(X, y).
xgb2 = xgb2_t.best_estimator_
xgb2

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')
xgb2_pred=xgb2.predict(sample_sub[cols])

In [None]:
xgb2_pred

In [None]:
sample_sub.drop(['Unnamed: 0'],axis=1,inplace=True)
sample_sub['greatstone_rating']=xgb2_pred

In [None]:
##iter 15
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v15.csv',index=False)

## Lightgbm classifier

In [None]:
## changing n_estimators from 100 to 1000 to 2000 keeping learning_rate constant
# 'multiclass' replaces the previous objective='LGBMClassifier', which is the
# wrapper's class name and not a LightGBM objective. The target is treated as
# multi-class elsewhere in this notebook (the XGBoost cell sets num_class=6).
# NOTE(review): binding the model to `lgbm` hides the `import lightgbm as lgbm`
# module alias from the top of the notebook.
lgbm = LGBMClassifier(objective='multiclass', random_state=123, n_estimators=1500)
lgbm.fit(X, y)

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')
lgbm_pred=lgbm.predict(sample_sub[cols])

In [None]:
lgbm_pred

In [None]:
sample_sub.drop(['Unnamed: 0'],axis=1,inplace=True)
sample_sub['greatstone_rating']=lgbm_pred

In [None]:
##iter 16 : 1000
#sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v16.csv',index=False)
##iter 17 : 2000
#sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v17.csv',index=False)
##iter 18 : 1500
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v18.csv',index=False)

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,y,test_size=0.1,random_state=123)

In [None]:
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'logloss', 
            "eval_set" : [(X_test,Y_test)],
            'eval_names': ['valid'],
            #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': 100,
            'categorical_feature': 'auto'}

In [None]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
# Search space for the LightGBM randomized search: scipy distributions are
# sampled, plain lists are drawn from uniformly.
param_test = {
    'num_leaves': sp_randint(6, 50),
    'min_child_samples': sp_randint(100, 500),
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    'subsample': sp_uniform(loc=0.2, scale=0.8),
    'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
}

In [None]:
#n_estimators is set to a "large value". The actual number of trees build will depend on early stopping and 5000 define only the absolute maximum
clf =LGBMClassifier(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=5000)
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test, 
    n_iter=100,
    cv=3,
    refit=True,
    random_state=314,
    verbose=True)

In [None]:
gs.fit(X_train, Y_train, **fit_params)

In [None]:
gs.best_estimator_

In [None]:
## use the best params from the above search to fit the model on overall data ##
params={'colsample_bytree': 0.9234, 'min_child_samples': 399, 'min_child_weight': 0.1,
        'num_leaves': 13, 'reg_alpha': 2, 'reg_lambda': 5, 'subsample': 0.855}

In [None]:
clf.get_params()

In [None]:
# LightGBM with the hyper-parameters recorded from the randomized search,
# fitted on the full data.
# 'multiclass' replaces the previous objective='LGBMClassifier', which is the
# wrapper's class name and not a LightGBM objective. The target is treated as
# multi-class elsewhere in this notebook (the XGBoost cell sets num_class=6).
lgbm_t = LGBMClassifier(
    learning_rate=0.04,
    n_estimators=5000,
    colsample_bytree=0.9234,
    min_child_samples=399,
    min_child_weight=0.1,
    num_leaves=13,
    reg_alpha=2,
    reg_lambda=5,
    subsample=0.855,
    random_state=123,
    objective='multiclass',
)
lgbm_t.fit(X, y)

In [None]:
fi=lgbm_t.feature_importances_
df_fi=pd.DataFrame({'columns':X.columns,'Imp':fi})

In [None]:
df_fi.sort_values(by='Imp',ascending=False)

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')
lgbm_t_pred=lgbm_t.predict(sample_sub[cols])

In [None]:
type(lgbm_t_pred)

In [None]:
sample_sub.drop(['Unnamed: 0'],axis=1,inplace=True)
sample_sub['greatstone_rating']=lgbm_t_pred

In [None]:
##iter 20 : tuned learning rate and n estimators
#sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v20.csv',index=False)
##iter 21 : tuned other params while keeping nestimator and learning rate fix
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v21.csv',index=False)

In [None]:
## Lightgbm with tuning 
params={'learning_rate':[0.05,0.08,0.1]}
lgbm=LGBMClassifier(objective='LGBMClassifier',random_state=123)
gs=GridSearchCV(lgbm,param_grid=params,n_jobs=-1,cv=5,verbose=1)

In [None]:
gs.fit(X,y)

In [None]:
gs.best_params_

In [None]:
# learning rate tuning
# DART-boosted LightGBM fitted on the full data.
# 'multiclass' replaces the previous objective='LGBMClassifier', which is the
# wrapper's class name and not a LightGBM objective. The target is treated as
# multi-class elsewhere in this notebook (the XGBoost cell sets num_class=6).
lgbm_t = LGBMClassifier(
    boosting_type='dart',
    learning_rate=0.05,
    n_estimators=2500,
    objective='multiclass',
    reg_lambda=0.2,
    min_child_samples=30,
    max_depth=-1,
    random_state=1111,
)
lgbm_t.fit(X, y)

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')
lgbm_t_pred=lgbm_t.predict(sample_sub[cols])

In [None]:
lgbm_t_pred

In [None]:
sample_sub.drop(['Unnamed: 0'],axis=1,inplace=True)
sample_sub['greatstone_rating']=lgbm_t_pred

In [None]:
##iter 19 : tuned learning rate and n estimators
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v19.csv',index=False)

## Catboost 

In [None]:
import catboost

In [None]:
from catboost import CatBoostClassifier, Pool

In [None]:
cbc=CatBoostClassifier(iterations=4000,random_state=123,learning_rate=0.1,depth=6,l2_leaf_reg=4.0)

In [None]:
cbc.fit(X,y,verbose=10)

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')
cbc_pred=cbc.predict(sample_sub[cols],prediction_type='Class')

In [None]:
cbc_pred.reshape(5000,)

In [None]:
cbc_pred=cbc_pred.astype('int').reshape(5000,)

In [None]:
cbc_pred

In [None]:
sample_sub.drop(['Unnamed: 0'],axis=1,inplace=True)
sample_sub['greatstone_rating']=cbc_pred

In [None]:
sample_sub

In [None]:
##iter 28
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v28.csv',index=False)

### AdaBoost

In [None]:
ada=AdaBoostClassifier(algorithm='SAMME',n_estimators=100, learning_rate=0.1,random_state=123)
ada.fit(X,y)

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')
ada_pred=ada.predict(sample_sub[cols])

In [None]:
ada_pred

In [None]:
sample_sub.drop(['Unnamed: 0'],axis=1,inplace=True)
sample_sub['greatstone_rating']=ada_pred

In [None]:
##iter 29
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v29.csv',index=False)

## SVC with hyper parameter tuning 

In [None]:
## Scale the whole data ##
# Standardise the features; the same fitted scaler is reused for the scoring
# frame in a later cell.
scaler = StandardScaler()
X_scaled= scaler.fit_transform(X)

from scipy.stats import reciprocal, uniform
# Search space: gamma is log-uniform on [0.001, 0.1]; uniform(1, 10) samples C
# from [1, 11] (scipy's loc/scale convention); degree only matters for 'poly'.
param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10),"kernel":['rbf','poly','sigmoid'],"degree":[3,4,5,6]}

svc=SVC(gamma='scale',random_state=123)
# random_state pins the sampled candidates so the search is reproducible,
# matching the other randomized searches in this notebook.
rs = RandomizedSearchCV(svc, param_distributions, n_iter=10, verbose=2, cv=3, random_state=123)
rs.fit(X_scaled,y)

In [None]:
rs.best_estimator_

In [None]:
rs.best_score_

In [None]:
rs.best_estimator_.fit(X_scaled,y)

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')
sample_sub_scaler=scaler.transform(sample_sub[cols])
svc_pred=rs.best_estimator_.predict(sample_sub_scaler)

In [None]:
svc_pred

In [None]:
sample_sub.drop(['Unnamed: 0'],axis=1,inplace=True)
sample_sub['greatstone_rating']=svc_pred

In [None]:
sample_sub['greatstone_rating'].unique()

In [None]:
##iter 30
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v30.csv',index=False)

## hist GBC 

In [None]:
# Histogram-based gradient boosting with a hand-picked configuration.
hgbc = HistGradientBoostingClassifier(
    loss='categorical_crossentropy',
    max_iter=1000,
    learning_rate=0.08,
    max_bins=200,
    random_state=123,
)
# The value returned by fit() is kept under `hgb`.
hgb = hgbc.fit(X, y)

In [None]:
params={'loss':['categorical_crossentropy','auto'],
        'learning_rate':[0.05,0.1,0.15],
        'max_iter':[100,500,1000,1200],
        'max_depth':[3,6,9],
        'min_samples_leaf':[20,40,60],
        'l2_regularization':[0,0.2,0.4]}
hgbc=HistGradientBoostingClassifier(random_state=123,validation_fraction=0.1,n_iter_no_change=20)
rs=RandomizedSearchCV(hgbc,param_distributions=params,n_iter=100,cv=3,verbose=10,random_state=123)
rs.fit(X,y)

In [None]:
rs.best_params_

In [None]:
hgbc_t=HistGradientBoostingClassifier(random_state=123,min_samples_leaf=40,max_iter=1000,max_depth=9,
                                    loss='auto',learning_rate=0.15,l2_regularization=0.2)
hgbc_t.fit(X,y)

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')
hgbc_t_pred=hgbc_t.predict(sample_sub[cols])

In [None]:
hgbc_t_pred

In [None]:
sample_sub.drop(['Unnamed: 0'],axis=1,inplace=True)
sample_sub['greatstone_rating']=hgbc_t_pred

In [None]:
##iter 31
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v31.csv',index=False)

## Voting Classifier 

In [None]:
#rf=RandomForestClassifier(n_estimators=100,random_state=123)
# DART LightGBM member of the ensemble.
# 'multiclass' replaces the previous objective='LGBMClassifier', which is the
# wrapper's class name and not a LightGBM objective. The target is treated as
# multi-class elsewhere in this notebook (the XGBoost cell sets num_class=6).
lgbm_t = LGBMClassifier(
    boosting_type='dart',
    learning_rate=0.05,
    n_estimators=2500,
    objective='multiclass',
    reg_lambda=0.2,
    min_child_samples=30,
    max_depth=-1,
    random_state=1111,
)
# NOTE(review): binding the model to `xgb` hides the `import xgboost as xgb` module alias.
xgb = XGBClassifier(learning_rate=0.1, n_estimators=4000, reg_lambda=0.6, random_state=123)
#cb=CatBoostClassifier(iterations=4000,learning_rate=0.009,l2_leaf_reg=4,random_state=123)
# Soft voting averages the members' predicted class probabilities.
# The LightGBM member is `lgbm_t`, configured just above. The previous code
# passed `lgbm`, a leftover from an earlier cell, so `lgbm_t` was never used.
vc = VotingClassifier([('lgbm', lgbm_t), ('xgb', xgb)], voting='soft')
vc.fit(X, y)

In [None]:
sample_sub=pd.read_csv('sample_sub.csv')
vc_pred=vc.predict(sample_sub[cols])

In [None]:
vc_pred

In [None]:
sample_sub.drop(['Unnamed: 0'],axis=1,inplace=True)
sample_sub['greatstone_rating']=vc_pred

In [None]:
##iter 32
sample_sub[['fund_id','greatstone_rating']].to_csv('sample_submission_v32.csv',index=False)