In [None]:
# Place the Kaggle API token under ~/.kaggle (presumably where the `kaggle` CLI reads it — confirm against the Kaggle API docs).
!mkdir -p ~/.kaggle
# Moves (does not copy) kaggle.json out of the working directory, so the file must be re-uploaded before re-running this cell.
!mv kaggle.json ~/.kaggle
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the data for the `porto-seguro-safe-driver-prediction` competition with the Kaggle CLI.
!kaggle competitions download -c porto-seguro-safe-driver-prediction

In [None]:
!unzip '/content/porto-seguro-safe-driver-prediction.zip'

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows',None)

In [None]:
# Read the two competition CSVs from the working directory: first train, then test.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Report (rows, columns) for each raw split.
for caption, frame in (('Shape of Training Dataset:', train),
                       ('Shape of Test Dataset:', test)):
    print(caption, frame.shape)


combining train and test

In [None]:
# Stack train on top of test with a fresh 0..n-1 index so both splits get identical preprocessing.
# Later cells tell the splits apart by whether 'target' is null.
df0 = [train, test]
df = pd.concat(df0, ignore_index=True)

In [None]:
# Sanity check: dimensions of the stacked train+test frame.
combined_shape = df.shape
print('Shape of Combined dataframe: ', combined_shape)

Target values

In [None]:
# Summarise the label column of the combined frame: distinct values, then per-value counts.
target_column = df['target']
distinct_targets = target_column.unique()
target_counts = target_column.value_counts()
print('List of values in target feature: ', distinct_targets)
print('Count of values in target feature: \n', target_counts)

Plotting Target feature

In [None]:
# Bar chart of the class counts in the training labels, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
class_counts = train['target'].value_counts()
class_counts.plot.bar(ax=ax, color=['blue', 'orange'], rot=0)  # rot=0 keeps tick labels horizontal
ax.set_title('Distribution of Target')
ax.set_xlabel('Target Value')
ax.set_ylabel('Count')
plt.show()

The dataset is highly imbalanced, so we can either balance it using sampling or rely on strong feature engineering. I prefer the second option.

Missing values are encoded as -1 in the dataframe, so we need to replace them with NaN.

In [None]:
# Turn every -1 in the frame into NaN so pandas' missing-value tooling can see it.
df = df.replace(to_replace=-1, value=np.nan)

checking for null values

In [None]:
# Per-column count of missing entries (displayed as the cell output).
df.isna().sum()


Two features have close to or more than 50% missing values, so we can remove them.

In [None]:
# Discard the two columns judged too sparse to impute.
sparse_columns = ['ps_car_03_cat', 'ps_car_05_cat']
df = df.drop(columns=sparse_columns)


For the remaining missing values, we can impute the mean for continuous features and the mode for categorical features. Binary features do not have missing values, so we don't have to impute them.

In [None]:
# Columns that still contain NaN and how each group is filled by the imputation step:
# categorical columns receive their mode, continuous columns their mean.
categorical_features_to_impute = [
    'ps_ind_02_cat',
    'ps_ind_04_cat',
    'ps_ind_05_cat',
    'ps_car_01_cat',
    'ps_car_02_cat',
    'ps_car_07_cat',
    'ps_car_09_cat',
]
continous_features_to_impute = [
    'ps_reg_03',
    'ps_car_11',
    'ps_car_12',
    'ps_car_14',
]


In [None]:
# Build one {column: fill value} mapping and apply it in a single fillna call.
# The previous `df[col].fillna(..., inplace=True)` form is chained in-place assignment:
# it emits a FutureWarning on pandas 2.x and silently does nothing under Copy-on-Write,
# which would leave the NaNs in place.
fill_values = {
    feature: df[feature].mean()  # continuous columns -> column mean
    for feature in continous_features_to_impute
}
fill_values.update({
    feature: df[feature].mode()[0]  # categorical columns -> most frequent value
    for feature in categorical_features_to_impute
})
df = df.fillna(value=fill_values)



In [None]:
# Dimensions of the combined frame at this point in the notebook.
shape_now = df.shape
print('Shape of Combined dataframe: ', shape_now)

In [None]:
# Re-count missing entries per column (displayed as the cell output).
df.isna().sum()

Now there are no missing values, but since the dataset is very big, we cannot train on all of these features, so we need to do the feature engineering carefully.

**Feature Engineering**

plot importance of features

In [None]:

# Split the combined frame back into labelled rows (train) and unlabelled rows (test),
# using the presence of a 'target' value as the discriminator.
train_data = df[df['target'].notnull()]
test_data = df[df['target'].isnull()]

X_train = train_data.drop(columns=['target'])
y_train = train_data['target']
X_test = test_data.drop(columns=['target'])

# Rank features with a class-weighted random forest (RandomForestClassifier comes from
# sklearn.ensemble and must be imported with the other sklearn names).
# n_jobs=-1 only parallelises tree building; with random_state fixed the result is unchanged.
# NOTE(review): 'id' is still one of the columns in X_train, so it is ranked (and later
# used) as a feature. Later cells read 'id' back out of the selected feature list, so
# dropping it needs a coordinated change across those cells.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf_clf.fit(X_train, y_train)

feature_importance = rf_clf.feature_importances_

# One row per feature, most important first.
feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

plt.figure(figsize=(10, 8))
# The frame is already sorted, so its 'Feature' column is the bar order.
sns.barplot(
    x='Importance',
    y='Feature',
    data=feature_importance_df,
    order=feature_importance_df['Feature'],
)
plt.title('Feature Importance - Random Forest')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.show()



In [None]:
# Feature names in the row order of feature_importance_df.
training_list = list(feature_importance_df['Feature'])

In [None]:
# Pairwise correlations between the ranked features, drawn as an annotated heatmap.
df_selected = df.loc[:, training_list]
correlation_matrix = df_selected.corr()

fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Selected Features')
plt.show()

We can see that the features whose names contain 'calc' have almost zero correlation with the other features, so we can drop them.

In [None]:
# Keep only the importance rows whose feature name does not contain 'calc'.
mentions_calc = feature_importance_df['Feature'].str.contains('calc')
training_subset = feature_importance_df.loc[~mentions_calc]

In [None]:
# Names of the features that survived the 'calc' filter, in the same order.
training_list2 = list(training_subset['Feature'])

Let's train the model

In [None]:
# Build the model inputs from the preprocessed labelled rows (`train_data`), not the raw
# `train` frame. Raw `train` still encodes missing values as -1 and was never imputed,
# whereas the rows scored for the submission come from the imputed combined frame, so
# training on it would make the fit and inference distributions disagree.
X = train_data[training_list2].copy()
# 'target' became float when train and test were stacked; restore integer class labels.
y = train_data['target'].astype(int)

# Hold-out split; stratified so the rare positive class keeps the same share in both parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
def gini_score(y_true, y_prob):
    """Return the normalized Gini coefficient, computed as ``2 * ROC AUC - 1``.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    y_prob : array-like
        Predicted scores/probabilities for the positive class.
    """
    return 2 * roc_auc_score(y_true, y_prob) - 1


def _fit_and_score(model, X_fit, y_fit, X_val, y_val):
    """Fit ``model`` on the fit split and return ``(train_gini, valid_gini)``."""
    model.fit(X_fit, y_fit)
    train_gini = gini_score(y_fit, model.predict_proba(X_fit)[:, 1])
    valid_gini = gini_score(y_val, model.predict_proba(X_val)[:, 1])
    return train_gini, valid_gini


xgb_clf = xgb.XGBClassifier()
lgbm_clf = LGBMClassifier()

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

xgb_train_scores = []
xgb_valid_scores = []
lgbm_train_scores = []
lgbm_valid_scores = []

for train_index, valid_index in cv.split(X, y):
    X_train_fold, X_valid_fold = X.iloc[train_index], X.iloc[valid_index]
    y_train_fold, y_valid_fold = y.iloc[train_index], y.iloc[valid_index]

    # Same fit/score routine for both learners; XGBoost first, then LightGBM.
    xgb_train_score, xgb_valid_score = _fit_and_score(
        xgb_clf, X_train_fold, y_train_fold, X_valid_fold, y_valid_fold
    )
    xgb_train_scores.append(xgb_train_score)
    xgb_valid_scores.append(xgb_valid_score)

    lgbm_train_score, lgbm_valid_score = _fit_and_score(
        lgbm_clf, X_train_fold, y_train_fold, X_valid_fold, y_valid_fold
    )
    lgbm_train_scores.append(lgbm_train_score)
    lgbm_valid_scores.append(lgbm_valid_score)

mean_xgb_train_score = np.mean(xgb_train_scores)
mean_xgb_valid_score = np.mean(xgb_valid_scores)
mean_lgbm_train_score = np.mean(lgbm_train_scores)
mean_lgbm_valid_score = np.mean(lgbm_valid_scores)

print("Mean XGBoost training gini score:", mean_xgb_train_score)
print("Mean XGBoost validation gini score:", mean_xgb_valid_score)
print("Mean LightGBM training gini score:", mean_lgbm_train_score)
print("Mean LightGBM validation gini score:", mean_lgbm_valid_score)

# After the loop each estimator has only seen the last fold's training part (4/5 of the
# rows). Refit on every labelled row so the models used for the submission are trained
# on the full training set; the CV scores above remain the performance estimate.
xgb_clf.fit(X, y)
lgbm_clf.fit(X, y)

Predict unseen data and make the submission file

In [None]:
# Score the unlabelled rows with both fitted models, restricted to the selected features.
unseen_data = X_test[training_list2]
xgb_unseen_pred_proba = xgb_clf.predict_proba(unseen_data)[:, 1]
lgbm_unseen_pred_proba = lgbm_clf.predict_proba(unseen_data)[:, 1]

# Keep the per-model probabilities next to their unweighted average ('target').
predictions_df = pd.DataFrame({
    'id': unseen_data['id'],
    'target_xgb': xgb_unseen_pred_proba,
    'target_lgbm': lgbm_unseen_pred_proba
}).assign(
    target=lambda frame: (frame['target_xgb'] + frame['target_lgbm']) / 2
)

# Write only the two submission columns, without the index.
predictions_df.loc[:, ['id', 'target']].to_csv('predictions.csv', index=False)