# Importing Necessary libraries

In [4]:
import warnings 
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import fbeta_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# load data

In [6]:
train_path=r'C:\\Users\\akhil\\Downloads\\ML projects\\carvan_train.csv'
test_path=r'C:\\Users\\akhil\\Downloads\\ML projects\\carvan_test.csv'

cd_train=pd.read_csv(train_path)
cd_test=pd.read_csv(test_path)

In [7]:
target='V86'

In [8]:
x_train=cd_train.drop(target,axis=1)
y_train=cd_train[target]

In [10]:
# Identifying categorical features from the data dictionary
categorical_features=[col for col in x_train.columns if 'L0'in col or 'L2' in col]

In [11]:
x_train=pd.get_dummies(x_train,columns=categorical_features,drop_first=True)
x_test=pd.get_dummies(cd_test,columns=categorical_features,drop_first=True)

In [None]:
x_train,x_test=x_train.align(x_test,join='inner',axis=1)

# Balancing the imbalanced data

In [12]:
smote=SMOTE(random_state=42)
x_train_smote,y_train_smote=smote.fit_resample(x_train,y_train)

In [13]:
scaler=StandardScaler()
x_train_smote=scaler.fit_transform(x_train_smote)
x_test_scaled=scaler.transform(x_test)

# selecting best model

In [14]:
rf=RandomForestClassifier(random_state=42)

# Hyper parameter tuning

In [16]:
param_grid={
    'n_estimators':[100,200,300],
    'max_depth':[5,10,15,None],
    'min_samples_split':[2,5,10],
    'min_samples_leaf': [1,2,4],
    'class_weight':['balanced',None]
}

In [18]:
random_search=RandomizedSearchCV(rf,param_distributions=param_grid,scoring='roc_auc',n_iter=50,cv=5,verbose=2,random_state=42,n_jobs=1)
random_search.fit(x_train_smote,y_train_smote)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END class_weight=None, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   1.1s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   1.1s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   1.2s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   1.1s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   1.1s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.9s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   1.0s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=4, min_samples_split=

In [19]:
best_model=random_search.best_estimator_

In [20]:
train_probs=best_model.predict_proba(x_train_smote)[:,1]
cutoffs=np.linspace(0.001,0.999,999)
fbetas=[fbeta_score(y_train_smote,(train_probs>cutoff).astype(int),beta=2) for cutoff in cutoffs]
optimal_cutoff=cutoffs[np.argmax(fbetas)]

In [21]:
test_probs=best_model.predict_proba(x_test_scaled)[:,1]
predictions=(test_probs>optimal_cutoff).astype(int)

In [22]:
submission=pd.DataFrame({'V86':predictions})
submission.to_csv('sample_submission_carvan.csv',index=False)

In [32]:
print('optimal Cutoff:',optimal_cutoff)
print('Best F-beta Score on Train:',max(fbetas))

optimal Cutoff: 0.339
Best F-beta Score on Train: 0.9888574332171893
