In [None]:
# Install the Kaggle API token: create the config directory (no error if it
# already exists), move kaggle.json from the working directory into it, and
# restrict the file to owner read/write only (mode 600).
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Fetch the data for the porto-seguro-safe-driver-prediction competition
# using the Kaggle command-line client.
!kaggle competitions download -c porto-seguro-safe-driver-prediction

In [None]:
# Extract the competition archive into the working directory, which is where
# the later pd.read_csv('train.csv') / pd.read_csv('test.csv') calls look.
# The path is relative (not tied to a specific machine layout) and -o
# overwrites existing files without prompting, so re-running the notebook
# does not stall on an interactive "replace?" question.
!unzip -o porto-seguro-safe-driver-prediction.zip

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_rows',None)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score

In [None]:
# Load the training and test tables from the CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Report (rows, columns) for both raw tables.
for label, frame in (
    ('Shape of Training Dataset:', train),
    ('Shape of Test Dataset:', test),
):
    print(label, frame.shape)


Target values

In [None]:
# Show which labels occur in the target column and how often each one appears.
target_col = train['target']
print('List of values in target feature: ', target_col.unique())
print('Count of values in target feature: \n', target_col.value_counts())

Plotting Target feature

In [None]:
# Bar chart of the class counts in the raw training data.
fig, ax = plt.subplots(figsize=(8, 6))
train['target'].value_counts().plot(kind='bar', color=['blue', 'orange'], ax=ax)
ax.set_title('Distribution of Target')
ax.set_xlabel('Target Value')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=0)  # keep the class labels horizontal
plt.show()

Dataset is highly imbalanced

We should balance it using SMOTE and shuffle the dataframe

In [None]:
# Split the training table into predictors and label.
x = train.drop(columns=['target'])
y = train['target']

# Oversample the minority class until both classes have the same number of
# rows (sampling_strategy=1.0), with a fixed seed for reproducibility.
# NOTE(review): `x` still contains the `id` column and the -1 missing-value
# codes at this point, and the resampling happens before the train/validation
# split further down -- confirm this is intended, since synthetic rows can end
# up in the hold-out set and `id` takes part in the neighbour search.
sm = SMOTE(random_state=12, sampling_strategy=1.0)
x1, y1 = sm.fit_resample(x, y)

# Re-assemble predictors and label into a single frame.
oversampled_df = pd.concat(
    [pd.DataFrame(x1, columns=x.columns), pd.Series(y1, name='target')],
    axis=1,
)

# Shuffle the rows. The shuffle is seeded: without a seed the row order changes
# on every run, and the later seeded 15000-row subsample (which picks rows by
# position) would then select different rows each time.
oversampled_df = oversampled_df.sample(frac=1, random_state=12).reset_index(drop=True)



In [None]:
# Class counts after oversampling.
oversampled_df.loc[:, 'target'].value_counts()

Class imbalance is solved

Dealing with 5 lakh (500,000) rows requires substantial computational resources, so I will take only 15,000 samples.

In [None]:
# Work on a fixed-size random subset of the oversampled data to keep training
# time manageable; the seed makes the draw repeatable for a given row order.
sample_size = 15_000
sample_df = oversampled_df.sample(random_state=42, n=sample_size)

In [None]:
# Bar chart of the class counts in the 15000-row subsample.
sample_counts = sample_df['target'].value_counts()
ax = sample_counts.plot.bar(color=['blue', 'orange'], rot=0, figsize=(8, 6))
ax.set(title='Distribution of Target', xlabel='Target Value', ylabel='Count')
plt.show()

Missing values are encoded as -1 in the dataframe. We need to replace them with "NaN".

In [None]:
# Work on an independent copy so the subsample itself stays untouched.
df = sample_df.copy(deep=True)

In [None]:
# Turn the -1 placeholder into NaN, in place, in both the training sample and
# the test table.
for frame in (df, test):
    frame.replace(-1, np.nan, inplace=True)


checking for null values

In [None]:
# Number of missing entries per column in the training sample.
df.isna().sum()


In [None]:
# Remove these two categorical columns from the training sample (in place).
df.drop(['ps_car_03_cat', 'ps_car_05_cat'], axis=1, inplace=True)


In [None]:
# Remove the same two categorical columns from the test table (in place).
test.drop(labels=['ps_car_03_cat', 'ps_car_05_cat'], axis='columns', inplace=True)

In [None]:
# Missing entries per column in the training sample after dropping columns.
df.isna().sum(axis=0)

In [None]:
# Missing entries per column in the test table after dropping columns.
test.isna().sum(axis=0)

In [None]:
# Categorical columns to be filled with their most frequent value.
categorical_features_to_impute = [
    'ps_ind_02_cat',
    'ps_ind_04_cat',
    'ps_ind_05_cat',
    'ps_car_01_cat',
    'ps_car_02_cat',
    'ps_car_07_cat',
    'ps_car_09_cat',
]
# Continuous columns to be filled with their mean.
continous_features_to_impute = [
    'ps_reg_03',
    'ps_car_14',
]


In [None]:
# Continuous columns to mean-impute in the test table (one more than for the
# training sample: 'ps_car_11').
continous_features_to_impute_test = [
    'ps_reg_03',
    'ps_car_14',
    'ps_car_11',
]

In [None]:
# Fill missing values in the training sample.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on an intermediate
# Series, emits a FutureWarning on pandas 2.x and leaves `df` unchanged when
# Copy-on-Write is enabled.

# Continuous features: replace NaN with the column mean.
for feature in continous_features_to_impute:
    df[feature] = df[feature].fillna(df[feature].mean())

# Categorical features: replace NaN with the most frequent value
# (first entry of `mode()` when there are ties).
for feature in categorical_features_to_impute:
    mode_val = df[feature].mode()[0]
    df[feature] = df[feature].fillna(mode_val)



In [None]:
# Fill missing values in the test table using the test table's own statistics.
# The result is assigned back to the column instead of calling
# `test[col].fillna(..., inplace=True)`: that form operates on an intermediate
# Series, emits a FutureWarning on pandas 2.x and leaves `test` unchanged when
# Copy-on-Write is enabled.

# Continuous features: replace NaN with the column mean.
for feature in continous_features_to_impute_test:
    test[feature] = test[feature].fillna(test[feature].mean())

# Categorical features: replace NaN with the most frequent value
# (first entry of `mode()` when there are ties).
for feature in categorical_features_to_impute:
    mode_val = test[feature].mode()[0]
    test[feature] = test[feature].fillna(mode_val)

In [None]:
# Missing entries per column in the training sample after imputation.
df.isna().sum()

In [None]:
# Missing entries per column in the test table after imputation.
test.isna().sum()

make copies of train and test data

In [None]:
# Independent working copies; the imputed `df` and `test` stay untouched.
df0, test0 = df.copy(), test.copy()

In [None]:
# Remove the `id` column from the training copy (in place).
df0.drop('id', axis=1, inplace=True)

In [None]:
# Remove the `id` column from the test copy (in place); `test` keeps it.
test0.drop('id', axis=1, inplace=True)

In [None]:
# Report (rows, columns) of both working copies.
for label, frame in (('df0 shape:', df0), ('test0 shape:', test0)):
    print(label, frame.shape)

In [None]:
# Separate the predictor matrix from the label column.
X = df0.drop('target', axis=1)
y = df0['target']

In [None]:
# Report the shapes of the label vector and the predictor matrix.
for label, obj in (('y shape:', y), ('X shape:', X)):
    print(label, obj.shape)

In [None]:
# 80/20 hold-out split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    train_size=0.8,
    stratify=y,
    random_state=0,
)

In [None]:
# Candidate classifiers, keyed by display name. This dict was previously never
# defined in the notebook, so the training loop below failed with a NameError
# on a fresh kernel. The candidates are the classifier classes imported at the
# top of the notebook; seeds are fixed where the estimator is stochastic.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=0),
    'Random Forest': RandomForestClassifier(random_state=0),
    'Gradient Boosting': GradientBoostingClassifier(random_state=0),
}

# Define the parameter grid for each model
# (models without an entry here are fitted once with their constructor settings).
param_grids = {
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 5, 10]
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.5]
    },
    'Decision Tree': {
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10]
    }
}

# Train each model with GridSearchCV
# 5-fold cross-validation scored by ROC AUC; n_jobs=-1 runs the candidate fits
# in parallel and does not change which parameters are selected.
trained_models_gs = {}
for name, model in models.items():
    if name in param_grids:
        grid_search = GridSearchCV(model, param_grids[name], scoring='roc_auc', cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        # best_estimator_ is refitted on all of X_train with the best parameters.
        trained_models_gs[name] = grid_search.best_estimator_
    else:
        model.fit(X_train, y_train)
        trained_models_gs[name] = model

# 3. Evaluation
# Score every fitted model on the hold-out split using the predicted
# probability of the positive class.

# 4. Gini Score Calculation
auc_scores_gs = {}
for name, model in trained_models_gs.items():
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc_scores_gs[name] = roc_auc_score(y_test, y_pred_proba)

# Calculate Gini score (Gini = 2 * AUC - 1)
gini_scores_gs = {name: 2 * auc - 1 for name, auc in auc_scores_gs.items()}

# 5. Model Selection
# Keep the model with the highest hold-out Gini score.
best_model_name_gs = max(gini_scores_gs, key=gini_scores_gs.get)
best_model_gs = trained_models_gs[best_model_name_gs]

print("Best Model with GridSearchCV:", best_model_name_gs)
print("Gini Score of Best Model with GridSearchCV:", gini_scores_gs[best_model_name_gs])


In [None]:
# Row identifiers for the submission; `test` still has the `id` column that
# was dropped from `test0`.
sample_id = test['id']

# Make predictions on the test set with the selected model.
# Previously this used `model`, the leftover loop variable from the evaluation
# loop (whichever model happened to be iterated last), and `.predict`, which
# returns hard class labels. The notebook ranks models by AUC/Gini on
# `predict_proba`, so the submission uses the selected model's positive-class
# probability as well.
predictions = best_model_gs.predict_proba(test0)[:, 1]

# Create a dataframe with 'id' and 'target' columns
predictions_df = pd.DataFrame({'id': sample_id, 'target': predictions})

# Save predictions to a CSV file
predictions_df.to_csv('predictions.csv', index=False)