In [7]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

## Read Data
# Directory that holds the Orange small data set files.
base_path = './'

# Feature table, read with pandas' tab-separated reader.
train = pd.read_table(f'{base_path}/orange_small_train.data')
# Cells whose whole value is a single backslash become a forward slash.
train = train.replace('\\', '/')

# Up-selling labels: the file has no header row, so its only column is named 0.
# NOTE: despite its name, `test` holds the target labels for `train`.
test = pd.read_csv(f'{base_path}/orange_small_train_upselling.labels', header=None)

# Sanity check: (rows, columns) of the features and of the labels.
print(train.shape, test.shape)


# ## Data Cleaning


# Count the float and object (string) columns of the raw table.
train_dtypes = train.dtypes
float_dtypes = train_dtypes.index[(train_dtypes == 'float64').values].tolist()
object_dtypes = train_dtypes.index[(train_dtypes == 'object').values].tolist()
print(len(float_dtypes), len(object_dtypes))

# Keep a feature only when at least 70% of its values are available,
# i.e. drop it when more than 30% of the rows are missing.
na_vals = train.isna().sum() > (0.30 * len(train))
removed_features = na_vals.index[na_vals.values].values
train = train[na_vals.index[(~na_vals).values].values]
print('Removed features: ', removed_features)

# Recompute the dtype split on the surviving columns; these two lists drive
# the imputation and category handling that follow.
train_dtypes = train.dtypes
print(train_dtypes)
num_var = train_dtypes.index[(train_dtypes == 'float64').values].tolist()
cat_vars = train_dtypes.index[(train_dtypes == 'object').values].tolist()
print(len(num_var), len(cat_vars))


# Numerical variables: fill the gaps of each column with that column's mean.
for col in num_var:
    column = train[col]
    train[col] = column.fillna(column.mean())

# Categorical variables: convert to the category dtype and put every missing
# value into a dedicated 'MISSED' level.
for col in cat_vars:
    as_category = train[col].astype('category')
    train[col] = as_category.cat.add_categories('MISSED').fillna('MISSED')


#for col in cat_vars:
#    print(col," categories :", train[col].nunique())
#    train[col].value_counts().plot()
#    plt.show()



# Report the number of distinct values of every categorical feature.
for col in cat_vars:
    n_categories = train[col].nunique()
    print(col, " categories :", n_categories)


def analyse_cats(cat):
    """Print how many categories of *cat* fall into each frequency band.

    The occurrence count of every category is taken from
    ``cat.value_counts()`` and bucketed into the bands (0, 50], (50, 100],
    (100, 1000] and "more than 1000". One line is printed per band in the
    form ``<label> : <number of categories>``. The label is the band's upper
    bound, or ``>1000`` for the open-ended band, so the last two lines can be
    told apart. Categories with a count of 0 belong to no band.

    Args:
        cat: An object exposing ``value_counts()``, e.g. a pandas Series.

    Returns:
        None. The result is only printed.
    """
    val_cnts = cat.value_counts()
    levs = [0, 50, 100, 1000]
    # Walk over consecutive (lower, upper] bounds instead of indexing by position.
    for lower, upper in zip(levs, levs[1:]):
        print(upper, ":", ((val_cnts > lower) & (val_cnts <= upper)).sum())
    # Open-ended band: label it explicitly rather than reusing the loop index.
    print(f">{levs[-1]}", ":", (val_cnts > levs[-1]).sum())

# Break down the category frequencies of every feature with 100+ distinct values.
for col in cat_vars:
    n_categories = train[col].nunique()
    if n_categories < 100:
        continue
    print(col, " categories :", n_categories)
    analyse_cats(train[col])


# Many categories occur at most 50, 100 or 1000 times, so for every feature
# with 100+ distinct values the categories are merged into four frequency
# levels: 'l' (1-50 occurrences), 'm' (51-100), 'h' (101-1000), 'vh' (> 1000).
for col in cat_vars:
    n_categories = train[col].nunique()
    if n_categories < 100:
        continue
    print(col, " categories :", n_categories)
    col_val_cnt = train[col].value_counts()
    levs = [0, 50, 100, 1000, np.inf]
    lev_names = ['l', 'm', 'h', 'vh']
    # Original category -> frequency level it is merged into.
    mapping = {}
    for lev_name, lower, upper in zip(lev_names, levs, levs[1:]):
        in_band = (col_val_cnt <= upper) & (col_val_cnt > lower)
        print(upper, ":", in_band.sum())
        for cat in col_val_cnt[in_band].index.values.tolist():
            mapping[cat] = lev_name
    train[col] = train[col].map(mapping)

print('After reducing the number of categories:')
# Distinct values per categorical feature, in the order of cat_vars.
category_counts = [train[col].nunique() for col in cat_vars]
for col, n_categories in zip(cat_vars, category_counts):
    print(col, " categories :", n_categories)
# Sum of (distinct values - 1) over all categorical features.
cnt = sum(category_counts) - len(cat_vars)
print(cnt)



# Recode the labels from {-1, 1} to {0, 1}; any value outside the mapping
# becomes NaN.
test[0] = test[0].map({1: 1, -1: 0})

# One-hot encode the categorical variables.
train = pd.get_dummies(train)


# ### Train and testing

from sklearn.metrics import f1_score,accuracy_score
from imblearn.over_sampling import SMOTE
# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Features and labels of the full, still imbalanced data set.
X, Y = train, test

# Class balance before any resampling.
sns.countplot(Y[0])
plt.show()

# Split BEFORE oversampling. SMOTE interpolates between neighbouring minority
# samples, so resampling the whole data set first would place synthetic points
# derived from validation/test rows into the training data (and vice versa)
# and make the evaluation optimistic. The validation and test sets therefore
# keep the original class distribution.
train_features, test_features, train_labels, test_labels = train_test_split(
    X, Y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    train_features, train_labels, test_size=0.2, random_state=42)

# Oversample only the data that models are fitted on:
#   * X_train / y_train             -> model selection (scored on X_valid)
#   * train_features / train_labels -> final model (scored on test_features)
sm = SMOTE(random_state = 2)
X_train, y_train = sm.fit_resample(X_train, y_train)
train_features, train_labels = sm.fit_resample(train_features, train_labels)

# Class balance of the resampled training data.
sns.countplot(train_labels[0])
plt.show()

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Train Features Shape:', X_train.shape)
print('Train Labels Shape:', y_train.shape)
print('Validation Features Shape:', X_valid.shape)
print('Validation Labels Shape:', y_valid.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

from sklearn.metrics import confusion_matrix,classification_report
def evaluate_model(rf, X, y):
    """Print a classification report for *rf* on the data set (*X*, *y*).

    The output of ``rf.predict(X)`` is turned into 0/1 class labels by
    thresholding at 0.5 (values strictly above 0.5 become 1), then compared
    with *y* through ``classification_report``.

    Args:
        rf: Fitted model exposing ``predict``.
        X: Feature matrix passed to ``rf.predict``.
        y: True labels.

    Returns:
        None. The report is only printed.
    """
    scores = rf.predict(X)
    y_pred = (scores > 0.5).astype(int)
    report = classification_report(y, y_pred)
    print(report)

# Import the model we are using
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier

# Baseline forest of 50 regression trees fitted on the training data; its
# predictions for the 0/1 target are thresholded inside evaluate_model.
rf = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)

# Report on the validation data first, then on the training data itself.
for title, features, labels in (
    ('Validation Data', X_valid, y_valid),
    ('Training Data', X_train, y_train),
):
    print(title)
    evaluate_model(rf, features, labels)

# Pair every feature with its importance, most important first.
f_i = sorted(
    zip(train.columns, rf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

# Column 0: feature name, column 1: importance, 'imp': cumulative importance.
feat_imp = pd.DataFrame(f_i)
feat_imp['imp'] = feat_imp[1].cumsum()
# Keep the leading features while the cumulative importance stays within 0.95.
feat_imp = feat_imp.loc[feat_imp['imp'] <= 0.95]
#feat_imp['imp'].plot()
#plt.show()


choosen_feat = feat_imp[0].tolist()

# Now that the important features are chosen, tune the final model on them.

# ## Choose parameter using cross validation and grid search

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

n_estimators = [40]
max_depth = [10, 30, 40]
min_samples_split = [2, 5, 10]
min_samples_leaf = [2, 5, 10]

hyperF = dict(n_estimators=n_estimators, max_depth=max_depth,
              min_samples_split=min_samples_split,
              min_samples_leaf=min_samples_leaf)

# A classifier is required here: scoring='f1' compares discrete class labels,
# and the continuous output of a regressor makes the metric fail, which leaves
# every candidate without a usable score.
forest = RandomForestClassifier(random_state=1)
gridF = GridSearchCV(forest, hyperF, cv=5, verbose=3, n_jobs=-1, scoring='f1')
# np.ravel flattens the single-column label frame into the 1-D target that
# scikit-learn expects.
bestF = gridF.fit(X_train[choosen_feat], np.ravel(y_train))

# rank_test_score == 1 marks the best candidate, so sorting it in descending
# order would report the worst one; best_params_ is the winner.
best_params = gridF.best_params_
print('Best Params: ', best_params)


# Evaluate the tuned model on the validation data, then on the training data.
print('Validation Data')
evaluate_model(bestF, X_valid[choosen_feat], y_valid)
print('Training Data')
evaluate_model(bestF, X_train[choosen_feat], y_train)

# Estimator refitted by GridSearchCV with the best parameters.
best_rf_clf = bestF.best_estimator_


# # Final model on the full training split and its performance on the test split


# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report

# Reuse the estimator selected by the grid search (the same object, not a
# copy) and refit it on the complete training split; fit() returns the
# estimator itself.
rf = best_rf_clf.fit(train_features[choosen_feat], train_labels)


print('Final Results on Testing data')
evaluate_model(rf, test_features[choosen_feat], test_labels)

(50000, 230) (50000, 1)
191 38
Removed features:  ['Var1' 'Var2' 'Var3' 'Var4' 'Var5' 'Var8' 'Var9' 'Var10' 'Var11' 'Var12'
 'Var14' 'Var15' 'Var16' 'Var17' 'Var18' 'Var19' 'Var20' 'Var23' 'Var26'
 'Var27' 'Var29' 'Var30' 'Var31' 'Var32' 'Var33' 'Var34' 'Var36' 'Var37'
 'Var39' 'Var40' 'Var41' 'Var42' 'Var43' 'Var45' 'Var46' 'Var47' 'Var48'
 'Var49' 'Var50' 'Var51' 'Var52' 'Var53' 'Var54' 'Var55' 'Var56' 'Var58'
 'Var59' 'Var60' 'Var61' 'Var62' 'Var63' 'Var64' 'Var66' 'Var67' 'Var68'
 'Var69' 'Var70' 'Var71' 'Var72' 'Var75' 'Var77' 'Var79' 'Var80' 'Var82'
 'Var84' 'Var86' 'Var87' 'Var88' 'Var89' 'Var90' 'Var91' 'Var92' 'Var93'
 'Var94' 'Var95' 'Var96' 'Var97' 'Var98' 'Var99' 'Var100' 'Var101'
 'Var102' 'Var103' 'Var104' 'Var105' 'Var106' 'Var107' 'Var108' 'Var110'
 'Var111' 'Var114' 'Var115' 'Var116' 'Var117' 'Var118' 'Var120' 'Var121'
 'Var122' 'Var124' 'Var127' 'Var128' 'Var129' 'Var130' 'Var131' 'Var135'
 'Var136' 'Var137' 'Var138' 'Var139' 'Var141' 'Var142' 'Var145' 'Var146'
 'Var1