<a href="https://colab.research.google.com/github/alishba-cmd/RhombixTechnologies_Tasks/blob/main/InternshipTask3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install imbalanced-learn
!pip install imbalanced-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Load the datasets
application = pd.read_csv('/content/application_record.csv')
credit = pd.read_csv('/content/credit_record.csv')

# Create target variable: 1 (Bad) if any status >= 2, else 0 (Good)
credit['STATUS'] = credit['STATUS'].replace({'C': 0, 'X': 0, '0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5})
credit['is_bad'] = credit.groupby('ID')['STATUS'].transform(lambda x: 1 if any(x >= 2) else 0)
credit = credit[['ID', 'is_bad']].drop_duplicates()

# Merge datasets
data = application.merge(credit, on='ID', how='inner')

# Define features and target
X = data.drop(['ID', 'is_bad'], axis=1)
y = data['is_bad']

# Print initial class distribution
print("Initial Class Distribution:")
print(y.value_counts(normalize=True))

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Model pipeline with Logistic Regression
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Handle class imbalance with SMOTE
smote = SMOTE(sampling_strategy=0.2, random_state=42)  # 1:5 minority:majority ratio
X_train_resampled, y_train_resampled = smote.fit_resample(preprocessor.fit_transform(X_train), y_train)

# Print class distribution after SMOTE
print("\nClass Distribution After SMOTE:")
print(pd.Series(y_train_resampled).value_counts(normalize=True))

# Hyperparameter optimization with GridSearchCV
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l2'],  # Avoid L1 to prevent zero coefficients
    'classifier__solver': ['liblinear']
}
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Tune decision threshold to maximize F1-score
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
y_pred = (y_pred_proba >= optimal_threshold).astype(int)

# Print results
print("\nBest Parameters:", grid_search.best_params_)
print("Optimal Threshold:", optimal_threshold)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

# Feature importance (coefficients)
feature_names = numerical_cols + best_model.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_cols).tolist()
coef = best_model.named_steps['classifier'].coef_[0]
print("\nTop 5 Features by Importance:")
for name, imp in sorted(zip(feature_names, coef), key=lambda x: abs(x[1]), reverse=True)[:5]:
    print(f"{name}: {imp:.4f}")

Initial Class Distribution:
is_bad
0    0.983103
1    0.016897
Name: proportion, dtype: float64

Class Distribution After SMOTE:
is_bad
0    0.833343
1    0.166657
Name: proportion, dtype: float64

Best Parameters: {'classifier__C': 0.1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Optimal Threshold: 0.03451901147821503

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      7169
           1       0.12      0.14      0.13       123

    accuracy                           0.97      7292
   macro avg       0.55      0.56      0.55      7292
weighted avg       0.97      0.97      0.97      7292

ROC-AUC Score: 0.558505058477841

Top 5 Features by Importance:
NAME_INCOME_TYPE_Pensioner: 0.9290
NAME_INCOME_TYPE_State servant: -0.8415
FLAG_OWN_REALTY_Y: -0.6346
NAME_FAMILY_STATUS_Married: -0.6311
CODE_GENDER_F: -0.6267
