# Downloading dataset from Kaggle

In [None]:
# Move the Kaggle API token (kaggle.json, expected in the working directory) into ~/.kaggle.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition archive with the Kaggle CLI.
!kaggle competitions download -c santander-customer-transaction-prediction

In [None]:
!unzip '/content/santander-customer-transaction-prediction.zip'

# Importing modules

In [None]:
import pandas as pd
pd.set_option('display.max_rows',None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
!pip install catboost
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score


# Loading data to dataframe

In [None]:
# Read the competition's train and test CSVs from the working directory.
train, test0 = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))


In [None]:
# Work on copies so the frames loaded from disk stay untouched.
df, test = train.copy(), test0.copy()

# Exploring the data

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Print pandas' summary of the training frame.
df.info()

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# dtype of every column.
df.dtypes

**No object datatype and only continuous features**

# Checking for null values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

**No null values**

Plotting target variable

In [None]:
# Histogram (with a KDE overlay) of the target column.
fig, ax = plt.subplots()
sns.histplot(df['target'], kde=True, color='blue', ax=ax)
ax.set_title('Target')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

The data is highly imbalanced.

In [None]:
# Names of all float64 / int64 columns, in frame order
# (iterating a DataFrame yields its column labels).
numeric = list(df.select_dtypes(include=['float64', 'int64']))

**Plotting a heatmap of the continuous variables**

In [None]:
# Pairwise correlations between all numeric columns.
corr_matrix = df[numeric].corr()

plt.figure(figsize=(30, 30))
# Per-cell annotations are turned off: one text label per pair of numeric columns
# is unreadable at this figure size and makes rendering very slow. The colour map
# alone is enough to spot strongly correlated pairs.
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', linewidths=.5)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Summary statistics of the training frame.
df.describe()

Plotting distribution graphs of features

In [None]:
# One histogram (with KDE) per numeric column, laid out on a 41 x 5 subplot grid.
plt.figure(figsize=(100,100))
for position, column in enumerate(numeric[0:202], start=1):
  ax = plt.subplot(41, 5, position)
  sns.histplot(df[column], kde=True, color='blue', ax=ax)
  ax.set_title(f'Distribution of {column}')
  ax.set_xlabel(f'{column}')
  ax.set_ylabel('Frequency')
  ax.grid(True)
plt.show()

Almost all features follow a normal distribution, but some of them are skewed.

In [None]:

# One box plot per numeric column on the same 41 x 5 grid, to eyeball outliers.
plt.figure(figsize=(100,100))
for position, column in enumerate(numeric[0:202], start=1):
  ax = plt.subplot(41, 5, position)
  sns.boxplot(data=df[column], ax=ax)
  ax.set_xlabel(column)
  ax.set_ylabel('Value')
plt.show()


We can see that there are outliers, but removing them is risky because our dataset is highly imbalanced.

# Finding feature importance

First, let's drop the `ID_code` column.

In [None]:
# Remove the ID_code column from both the training and the test frame.
df0 = df.drop('ID_code', axis=1)
test3 = test.drop('ID_code', axis=1)


In [None]:
# Separate the label column from the feature matrix.
y = df0['target']
X = df0.drop('target', axis=1)

In [None]:
# Preview the feature matrix.
X.head()

# Scaling

In [None]:
def scaling(df):
  """Min-max scale every column of `df` and return the result as a new DataFrame.

  A fresh MinMaxScaler is fitted on `df` itself on every call. Column names are
  kept; the returned frame is built from the scaled array, so it carries a
  default integer index rather than `df`'s index.
  """
  scaled_values = MinMaxScaler().fit_transform(df)
  return pd.DataFrame(scaled_values, columns=df.columns)

In [None]:
# Fit the scaler once, on the training features, and apply that same transform to
# the test features. Fitting a second scaler on the test set would map the two
# sets with different per-column min/max values, so a model trained on one would
# see shifted inputs on the other.
feature_scaler = MinMaxScaler().fit(X)
scaled_df = pd.DataFrame(feature_scaler.transform(X), columns=X.columns)
scaled_test = pd.DataFrame(feature_scaler.transform(test3), columns=test3.columns)

# Training with a Random Forest classifier

In [None]:

# Stratified 80/20 hold-out split of the scaled training data.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Fit a class-weighted random forest on the training split.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
rf_clf.fit(X_train, y_train)

# Importance score reported by the fitted forest, one per training column.
feature_importance = rf_clf.feature_importances_

# (feature, importance) table, most important feature first.
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

# Bar order for the plot: features by descending importance.
bar_order = feature_importance_df.sort_values('Importance', ascending=False)['Feature']

fig, ax = plt.subplots(figsize=(20, 20))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, order=bar_order, ax=ax)
ax.set_title('Feature Importance - Random Forest')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Features')
plt.show()

In [None]:
# Hold-out ROC-AUC, computed from the positive-class probabilities
# (column 1 of predict_proba).
holdout_proba = rf_clf.predict_proba(X_test)
y_pred_proba = holdout_proba[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

print("ROC-AUC Score:", roc_auc)

Let's remove some features and check the ROC-AUC again.

In [None]:
# Keep only the features whose forest importance reaches the cut-off.
# Train the model with different cut-off values to find the one giving the highest ROC.
picked_features = [
    feature
    for feature, importance_score in zip(X_train.columns, feature_importance)
    if importance_score >= 0.0040
]


In [None]:
# Same forest configuration as before, refitted on the selected features only.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
rf_clf.fit(X_train[picked_features], y_train)


In [None]:
# Hold-out ROC-AUC of the forest trained on the selected features,
# from the positive-class probabilities (column 1 of predict_proba).
holdout_proba = rf_clf.predict_proba(X_test[picked_features])
y_pred_proba = holdout_proba[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

print("ROC-AUC Score:", roc_auc)

# Training the model

Let's use the following models:
* Logistic Regression
* Naive Bayes
* Decision Tree
* XGBoost
* CatBoost
* LightGBM

In [None]:
# Candidate models. Seeds are fixed on the stochastic estimators so the scores are
# reproducible (matching the random_state=42 used elsewhere in this notebook), and
# CatBoost's per-iteration logging is silenced so it does not flood the output.
logistic_reg = LogisticRegression(max_iter=1000)
naive_bayes = GaussianNB()
decision_tree = DecisionTreeClassifier(random_state=42)
xgboost_clf = XGBClassifier(random_state=42)
catboost_clf = CatBoostClassifier(random_seed=42, verbose=0)
lightgbm_clf = LGBMClassifier(random_state=42)

# Define parameter grids for GridSearchCV
param_grids = {
    "Logistic Regression": {"C": [0.1, 1.0, 10.0]},
    "Naive Bayes": {},  # No hyperparameters for Naive Bayes
    "Decision Tree": {"max_depth": [None, 5, 10, 20]},
    "XGBoost": {"n_estimators": [50, 100, 200], "max_depth": [3, 5, 7]},
    "CatBoost": {"iterations": [50, 100, 200], "depth": [4, 6, 8]},
    "LightGBM": {"n_estimators": [50, 100, 200], "max_depth": [3, 5, 7]}
}

classifiers = {
    "Logistic Regression": logistic_reg,
    "Naive Bayes": naive_bayes,
    "Decision Tree": decision_tree,
    "XGBoost": xgboost_clf,
    "CatBoost": catboost_clf,
    "LightGBM": lightgbm_clf
}

roc_scores = {}     # model name -> ROC-AUC on the hold-out split
grid_searches = {}  # model name -> fitted GridSearchCV

# Train and evaluate each classifier with GridSearchCV
for name, clf in classifiers.items():
    print(f"Training {name}...")
    # Every model goes through GridSearchCV; an empty grid simply cross-validates
    # the default configuration and refits it on the full training split.
    search = GridSearchCV(clf, param_grids.get(name, {}), cv=5, scoring='roc_auc')
    search.fit(X_train[picked_features], y_train)
    grid_searches[name] = search

    # Score the refitted best estimator on the hold-out split using the
    # positive-class probability (column 1 of predict_proba).
    y_pred_proba = search.best_estimator_.predict_proba(X_test[picked_features])[:, 1]
    roc_scores[name] = roc_auc_score(y_test, y_pred_proba)
print(roc_scores)

# Downstream cells read `grid_search.best_estimator_`. Bind `grid_search` to the
# search with the highest hold-out ROC-AUC instead of leaving it pointing at
# whichever model happened to be trained last.
best_model_name = max(roc_scores, key=roc_scores.get)
grid_search = grid_searches[best_model_name]
print(f"Best model on the hold-out split: {best_model_name}")

CatBoost gave the best ROC-AUC, so we will use it to predict the target for the unseen test data.

In [None]:
# Predict with the estimator held by `grid_search`, i.e. whichever fitted search
# the training cell left bound to that name.
final_model = grid_search.best_estimator_

# The models were compared on ROC-AUC, which ranks rows by score, so submit the
# positive-class probability (column 1 of predict_proba) rather than the hard
# 0/1 labels returned by predict().
test_predictions = final_model.predict_proba(scaled_test[picked_features])[:, 1]

# Name the prediction column `target` (lowercase), matching the label column
# of the training data.
submission_df = pd.DataFrame({'ID_code': test0['ID_code'], 'target': test_predictions})

submission_df.to_csv('submission.csv', index=False)



We could improve the ROC-AUC further by tuning the parameters, but since the dataset is large and training takes too long, I am stopping here.