# In [None]:  (Jupyter cell marker left over from the notebook export)
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (roc_auc_score, confusion_matrix, roc_curve,
                            precision_recall_curve, average_precision_score)
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import shap
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Try the 'seaborn-v0_8' matplotlib style first and fall back to the
# 'seaborn' name when that style is not available.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    # plt.style.use raises OSError for an unknown style name; any other
    # exception (e.g. KeyboardInterrupt) should propagate, not be swallowed.
    plt.style.use('seaborn')
sns.set_theme(style="whitegrid", palette="husl")

# Load the raw training table from a hard-coded absolute path.
data = pd.read_csv('/content/train.csv')

def clean_data(df):
    """Return a cleaned copy of the raw customer table.

    Steps, in order:
      1. Replace placeholder values ('NA', '_', 'unknown', ...) with NaN.
      2. Strip every character except digits, '.' and '-' from the known
         numeric columns and convert them to numbers (unparseable -> NaN).
      3. Convert ``Credit_History_Age`` text such as "22 Years and 1 Months"
         into ``Credit_History_Age_Months`` and drop the text column.
      4. Normalise the ``Payment_Behaviour`` and ``Credit_Mix`` labels.
      5. Drop columns that have fewer than 70% non-missing values.
      6. Fill remaining gaps: 'missing' for text columns; for every other
         column the median, plus a ``<col>_missing`` 0/1 indicator column.

    Args:
        df: Raw DataFrame. The caller's frame is not modified.

    Returns:
        A new DataFrame with the transformations above applied.
    """
    import re  # local import: only the credit-history parser needs it

    missing_values = ['NA', 'N/A', 'NaN', np.nan, '', ' ', '_', '-', 'unknown', 'Unknown']
    df = df.replace(missing_values, np.nan)

    numeric_cols = ['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
                    'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date',
                    'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Num_Credit_Inquiries',
                    'Outstanding_Debt', 'Credit_Utilization_Ratio', 'Total_EMI_per_month',
                    'Amount_invested_monthly', 'Monthly_Balance']

    for col in numeric_cols:
        if col in df.columns:
            # NaN becomes the string 'nan', is stripped to '' and coerced back to NaN.
            df[col] = df[col].astype(str).str.replace(r'[^0-9.-]', '', regex=True)
            df[col] = pd.to_numeric(df[col], errors='coerce')

    years_pattern = re.compile(r'(\d+)\s*years?')
    months_pattern = re.compile(r'(\d+)\s*months?')

    def convert_history(history):
        """Convert one credit-history value to months (NaN when unparseable)."""
        if pd.isna(history):
            return np.nan
        if isinstance(history, (int, float)):
            # Already numeric: passed through unchanged.
            return history
        text = str(history).lower()
        years = years_pattern.search(text)
        months = months_pattern.search(text)
        if years is None and months is None:
            return np.nan
        # Either part may be absent ("5 Years", "7 Months", "1 Year and 2 Months").
        total = 0
        if years is not None:
            total += int(years.group(1)) * 12
        if months is not None:
            total += int(months.group(1))
        return total

    if 'Credit_History_Age' in df.columns:
        df['Credit_History_Age_Months'] = df['Credit_History_Age'].apply(convert_history)
        df = df.drop('Credit_History_Age', axis=1)

    if 'Payment_Behaviour' in df.columns:
        # regex=False: these are literal substitutions; the default for
        # str.replace differs between pandas versions.
        behaviour = df['Payment_Behaviour'].str.replace('!@9#%8', 'Unknown', regex=False)
        behaviour = behaviour.str.replace('__', '_', regex=False)
        behaviour = behaviour.str.replace(' ', '_', regex=False)
        df['Payment_Behaviour'] = behaviour.str.lower()

    if 'Credit_Mix' in df.columns:
        # NOTE(review): 'Standard' is folded into 'Good' here, which makes the
        # 'standard' entry of create_target's credit-mix map unreachable.
        # Behaviour kept as-is; confirm this merge is intended.
        df['Credit_Mix'] = df['Credit_Mix'].replace(['_', 'Standard', 'standard'], 'Good')
        df['Credit_Mix'] = df['Credit_Mix'].str.lower()

    # Keep columns with at least 70% non-missing values. Rounding up gives the
    # same cut-off as comparing integer counts against the fractional value.
    min_non_missing = int(np.ceil(len(df) * 0.7))
    # .copy() so the column assignments below write to an independent frame.
    df = df.dropna(thresh=min_non_missing, axis=1).copy()

    for col in list(df.columns):  # snapshot: indicator columns are added below
        is_missing = df[col].isna()
        if not is_missing.any():
            continue
        col_dtype = df[col].dtype
        if col_dtype == object or isinstance(col_dtype, pd.StringDtype):
            df[col] = df[col].fillna('missing')
        else:
            df[f'{col}_missing'] = is_missing.astype(int)
            df[col] = df[col].fillna(df[col].median())

    return df

# Run the cleaning step on the loaded table and keep the returned frame.
data = clean_data(data)

def create_target(df):
    """Add a heuristic credit-risk label to ``df`` in place and return it.

    Three columns are written:
      * ``Payment_Score`` - sum of points from ``Payment_Behaviour``,
        ``Credit_Mix`` and ``Num_of_Delayed_Payment`` (each used only when
        the column exists; unmapped categories count as -1).
      * ``target`` - 1 when the score is strictly above its 40th percentile,
        otherwise 0.
      * ``target_explanation`` - a fixed sentence per target value.
    """
    behaviour_points = {
        'low_spent_small_value_payments': 2,
        'low_spent_medium_value_payments': 1,
        'low_spent_large_value_payments': 0,
        'high_spent_small_value_payments': 0,
        'high_spent_medium_value_payments': -1,
        'high_spent_large_value_payments': -2,
        'missing': -1,
        'unknown': -1
    }
    mix_points = {
        'good': 2,
        'standard': 1,
        'bad': -2,
        'missing': -1
    }

    df['Payment_Score'] = 0

    if 'Payment_Behaviour' in df.columns:
        behaviour_score = df['Payment_Behaviour'].map(behaviour_points).fillna(-1)
        df['Payment_Score'] += behaviour_score

    if 'Credit_Mix' in df.columns:
        mix_score = df['Credit_Mix'].map(mix_points).fillna(-1)
        df['Payment_Score'] += mix_score

    if 'Num_of_Delayed_Payment' in df.columns:
        delays = df['Num_of_Delayed_Payment']
        # First matching band wins; anything above 7 (or NaN) scores -2.
        df['Payment_Score'] += np.select(
            [delays == 0, delays <= 3, delays <= 7],
            [2, 1, -1],
            default=-2)

    cutoff = df['Payment_Score'].quantile(0.4)
    is_good = df['Payment_Score'] > cutoff
    df['target'] = is_good.astype(int)

    df['target_explanation'] = np.where(
        is_good,
        "Good credit risk based on payment behavior and credit mix",
        "Poor credit risk based on payment behavior and credit mix")

    return df

# Add the Payment_Score, target and target_explanation columns.
data = create_target(data)

def create_features(df):
    """Add derived ratio and bucket features to ``df`` in place and return it.

    Each feature is created only when its source columns are present:
      * ``Debt_to_Income`` - log1p(Outstanding_Debt) / log1p(Annual_Income).
      * ``Utilization_Ratio`` - Credit_Utilization_Ratio / 100, clipped to [0, 1].
      * ``Delay_Ratio`` - (Num_of_Delayed_Payment + 1) / (Num_of_Loan + 3).
      * ``EMI_to_Income`` - log1p(Total_EMI_per_month) / log1p(Monthly_Inhand_Salary).
      * ``Age_Group`` - Age bucketed into six labelled bins over (0, 100].
      * ``Credit_History_Length`` - history in months bucketed over [0, 600].
      * ``Credit_Products_Count`` - Num_Credit_Card + Num_of_Loan.

    Values outside a bucket range become NaN in the bucket column.
    """
    if 'Annual_Income' in df.columns and 'Outstanding_Debt' in df.columns:
        # The 1e-6 keeps the denominator non-zero when income is 0.
        df['Debt_to_Income'] = np.log1p(df['Outstanding_Debt']) / (np.log1p(df['Annual_Income']) + 1e-6)

    if 'Credit_Utilization_Ratio' in df.columns:
        df['Utilization_Ratio'] = np.clip(df['Credit_Utilization_Ratio'] / 100, 0, 1)

    if 'Num_of_Delayed_Payment' in df.columns and 'Num_of_Loan' in df.columns:
        df['Delay_Ratio'] = (df['Num_of_Delayed_Payment'] + 1) / (df['Num_of_Loan'] + 3)

    if 'Total_EMI_per_month' in df.columns and 'Monthly_Inhand_Salary' in df.columns:
        df['EMI_to_Income'] = np.log1p(df['Total_EMI_per_month']) / (np.log1p(df['Monthly_Inhand_Salary']) + 1e-6)

    if 'Age' in df.columns:
        df['Age_Group'] = pd.cut(df['Age'], bins=[0, 25, 35, 45, 55, 65, 100],
                                 labels=['18-25', '26-35', '36-45', '46-55', '56-65', '65+'])

    if 'Credit_History_Age_Months' in df.columns:
        # include_lowest=True: a history of exactly 0 months belongs in '<1yr'.
        # With the default left-open first bin it would become NaN.
        df['Credit_History_Length'] = pd.cut(df['Credit_History_Age_Months'],
                                             bins=[0, 12, 36, 60, 120, 240, 600],
                                             labels=['<1yr', '1-3yrs', '3-5yrs',
                                                     '5-10yrs', '10-20yrs', '20+yrs'],
                                             include_lowest=True)

    if 'Num_Credit_Card' in df.columns and 'Num_of_Loan' in df.columns:
        df['Credit_Products_Count'] = df['Num_Credit_Card'] + df['Num_of_Loan']

    return df

data = create_features(data)

from sklearn.model_selection import train_test_split

# Numeric model inputs.
# NOTE(review): Delay_Ratio is derived from Num_of_Delayed_Payment, which
# create_target also uses to build the label - check for target leakage.
numeric_features = ['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
                    'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date',
                    'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Outstanding_Debt',
                    'Credit_Utilization_Ratio', 'Credit_History_Age_Months', 'Total_EMI_per_month',
                    'Amount_invested_monthly', 'Monthly_Balance', 'Debt_to_Income',
                    'Utilization_Ratio', 'Delay_Ratio', 'EMI_to_Income']

categorical_features = ['Age_Group', 'Credit_History_Length', 'Payment_Behaviour', 'Credit_Mix']

features = numeric_features + [f for f in categorical_features if f in data.columns]
target = 'target'

# Rows with a missing feature or label are discarded. .copy() makes the
# result an independent frame so the in-place encoding below is unambiguous
# (no chained-assignment warning).
data = data.dropna(subset=features + [target]).copy()

# Integer-encode the categorical columns and keep the fitted encoders.
label_encoders = {}
for col in categorical_features:
    if col in data.columns:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col].astype(str))
        label_encoders[col] = le

X = data[features]
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# NOTE(review): only the numeric columns are scaled and passed to the models;
# the encoded categorical columns stay in X_train / X_test but are never used
# for fitting. Confirm that this is intended.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_features])
X_test_scaled = scaler.transform(X_test[numeric_features])

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    # CalibratedClassifierCV calibrates on the SVC's decision_function, so the
    # SVC's own probability=True (an extra internal cross-validated Platt
    # fit on every training run) is pure overhead and is left switched off.
    'SVM': CalibratedClassifierCV(SVC(kernel='rbf', class_weight='balanced', random_state=42))
}

voting_clf = VotingClassifier(
    estimators=[(name, model) for name, model in models.items()],
    voting='soft')

# Fit each base model on its own so it can be evaluated individually.
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    print(f"{name} trained")

voting_clf.fit(X_train_scaled, y_train)
print("Voting classifier trained")

with open('credit_score_models.pkl', 'wb') as f:
    pickle.dump({'models': models, 'voting': voting_clf}, f)

def evaluate_model(model, X, y, model_name="Model"):
    """Plot diagnostics for a fitted binary classifier and return its ROC AUC.

    Shows, in order: a confusion-matrix heatmap, the ROC curve and the
    precision-recall curve. The AUC is also printed.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X: Feature matrix to evaluate on.
        y: True labels for ``X``.
        model_name: Label used in titles, legends and the printed line.

    Returns:
        The ROC AUC computed from the second ``predict_proba`` column.
    """
    # Axis labels assume class 0 = 'Bad' and class 1 = 'Good' - TODO confirm.
    class_names = ['Bad', 'Good']
    predicted_labels = model.predict(X)
    positive_scores = model.predict_proba(X)[:, 1]

    # --- Confusion matrix -------------------------------------------------
    matrix = confusion_matrix(y, predicted_labels)
    plt.figure(figsize=(6, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(f'{model_name} Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    # --- ROC curve ----------------------------------------------------------
    auc = roc_auc_score(y, positive_scores)
    print(f"{model_name} AUC: {auc:.3f}")

    false_pos_rate, true_pos_rate, _ = roc_curve(y, positive_scores)
    plt.figure(figsize=(8, 6))
    plt.plot(false_pos_rate, true_pos_rate, label=f'{model_name} (AUC = {auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{model_name} ROC Curve')
    plt.legend()
    plt.show()

    # --- Precision-recall curve --------------------------------------------
    precision_vals, recall_vals, _ = precision_recall_curve(y, positive_scores)
    avg_precision = average_precision_score(y, positive_scores)
    plt.figure(figsize=(8, 6))
    plt.plot(recall_vals, precision_vals, label=f'{model_name} (AP = {avg_precision:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'{model_name} Precision-Recall Curve')
    plt.legend()
    plt.show()

    return auc

print("Individual Model Performance:")
model_performance = {}
for name, model in models.items():
    print(f"\n{name} Evaluation:")
    auc = evaluate_model(model, X_test_scaled, y_test, name)
    model_performance[name] = auc

print("\nVoting Classifier Evaluation:")
voting_auc = evaluate_model(voting_clf, X_test_scaled, y_test, "Voting Classifier")
model_performance['Voting'] = voting_auc

# The models were fitted on the scaled numeric_features matrix, so their
# coefficients / importances line up with numeric_features (not `features`,
# which also lists the categorical columns).
model_feature_names = np.array(numeric_features)

plt.figure(figsize=(12, 8))
lr_coef = models['Logistic Regression'].coef_[0]
# Rank by magnitude, but plot the signed coefficient values.
sorted_idx = np.argsort(np.abs(lr_coef))[::-1]
plt.barh(model_feature_names[sorted_idx][:15], lr_coef[sorted_idx][:15])
plt.title('Top 15 Logistic Regression Feature Importance (Absolute Coefficients)')
plt.xlabel('Coefficient Value')
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 8))
rf_importance = models['Random Forest'].feature_importances_
sorted_idx = np.argsort(rf_importance)[::-1]
plt.barh(model_feature_names[sorted_idx][:15], rf_importance[sorted_idx][:15])
plt.title('Top 15 Random Forest Feature Importance')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.show()

# Explain the forest on the same representation it was trained on (scaled
# numeric matrix); raw X_test[features] has extra columns and a different scale.
explainer = shap.TreeExplainer(models['Random Forest'])
shap_values = explainer.shap_values(X_test_scaled)
# Older SHAP returns a list with one array per class; newer releases return a
# single (samples, features, classes) array. Select the positive class either way.
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
elif np.ndim(shap_values) == 3:
    positive_class_shap = shap_values[:, :, 1]
else:
    positive_class_shap = shap_values
shap.summary_plot(positive_class_shap, X_test_scaled,
                  feature_names=numeric_features, plot_type="bar")

def create_scorecard(model, features, pdo=50, score=600, odds=20):
    """Build a per-feature points table from a fitted model.

    Args:
        model: Fitted estimator. ``coef_[0]`` / ``intercept_[0]`` are used
            when ``coef_`` exists; otherwise ``feature_importances_`` is used
            with an intercept of 0 (a warning is printed).
        features: Feature names, one per coefficient and in the same order
            as the columns the model was fitted on.
        pdo: Points to double the odds; sets ``factor = pdo / ln(2)``.
        score: Reference score used for the offset.
        odds: Reference odds used for the offset.

    Returns:
        ``(scorecard, offset)``. ``scorecard`` is a DataFrame with columns
        Feature, Coefficient, Points and Rank (1 = largest absolute
        coefficient); ``offset`` is ``score - factor * ln(odds)``.

    Raises:
        ValueError: If ``features`` and the coefficient vector differ in length.

    NOTE(review): Points are ``-coef * factor / (ln(2) * (intercept + sum(coef)))``.
    This divides by ln(2) a second time and normalises by the coefficient
    total; the formula is preserved as written - confirm it is intended.
    """
    if hasattr(model, 'coef_'):
        coef = np.asarray(model.coef_[0], dtype=float)
        intercept = float(model.intercept_[0])
    else:
        print("Warning: Model doesn't have coefficients. Using Random Forest feature importances instead.")
        coef = np.asarray(model.feature_importances_, dtype=float)
        intercept = 0.0

    features = list(features)
    if len(features) != len(coef):
        # Fail early with an actionable message instead of an opaque
        # DataFrame-construction error.
        raise ValueError(
            f"Got {len(features)} feature names for {len(coef)} model coefficients; "
            "pass the feature list the model was fitted on.")

    factor = pdo / np.log(2)
    offset = score - factor * np.log(odds)

    total_effect = intercept + coef.sum()
    # A float sum is practically never exactly 0; use a tolerance so the guard
    # also catches near-zero totals that would blow the points up.
    if np.isclose(total_effect, 0.0):
        total_effect = 1.0

    scorecard = pd.DataFrame({'Feature': features, 'Coefficient': coef})
    scorecard['Points'] = -scorecard['Coefficient'] * (factor / (np.log(2) * total_effect))
    scorecard['Rank'] = scorecard['Coefficient'].abs().rank(ascending=False)

    return scorecard, offset

# The logistic regression was fitted on the scaled numeric_features matrix, so
# it has one coefficient per numeric feature. Passing `features` (which also
# lists the categorical columns) would give a length mismatch.
scorecard, base_score = create_scorecard(models['Logistic Regression'], numeric_features)
print("\nScorecard:")
print(scorecard.sort_values('Points', ascending=False).head(10))
print(f"Base Score: {base_score:.1f}")

scorecard.to_csv('enhanced_credit_scorecard.csv', index=False)

def calculate_score(row, scorecard, base_score, min_score=300, max_score=850):
    """Score one record with a points table.

    Starts from ``base_score`` and adds ``Points * value`` for every scorecard
    feature that is present in ``row``; features missing from ``row`` are
    skipped. The total is clamped to ``[min_score, max_score]``.

    Args:
        row: Mapping-like record (e.g. a pandas Series) of feature values.
        scorecard: DataFrame with 'Feature' and 'Points' columns.
        base_score: Starting score before feature contributions.
        min_score: Lower clamp.
        max_score: Upper clamp.
    """
    total = base_score
    for entry in scorecard.to_dict('records'):
        name = entry['Feature']
        if name not in row:
            continue
        total += entry['Points'] * row[name]

    capped = min(max_score, total)
    return max(min_score, capped)

# The scorecard points come from coefficients learned on MinMax-scaled inputs,
# so score the same scaled representation instead of the raw column values
# (raw magnitudes such as incomes would swamp the base score).
scaled_numeric = pd.DataFrame(scaler.transform(data[numeric_features]),
                              columns=numeric_features, index=data.index)
data['Score'] = scaled_numeric.apply(
    lambda row: calculate_score(row, scorecard, base_score), axis=1)

def get_rating(score):
    """Map a numeric credit score to its rating-band label.

    Bands are checked from highest to lowest; the first band whose lower
    bound the score reaches wins. Anything below 580 is 'Poor (<580)'.
    """
    bands = (
        (800, 'Excellent (800+)'),
        (740, 'Very Good (740-799)'),
        (670, 'Good (670-739)'),
        (580, 'Fair (580-669)'),
    )
    for lower_bound, label in bands:
        if score >= lower_bound:
            return label
    return 'Poor (<580)'

data['Credit_Rating'] = data['Score'].apply(get_rating)

# Rating bands from worst to best; used to order both rating plots.
rating_order = ['Poor (<580)', 'Fair (580-669)', 'Good (670-739)',
                'Very Good (740-799)', 'Excellent (800+)']

plt.figure(figsize=(12, 8))
sns.violinplot(x='Credit_Rating', y='Score', data=data,
               order=rating_order, palette='RdYlGn')
plt.title('Credit Score Distribution by Rating Category')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 6))
# reindex (not .loc) so a rating band with no customers shows as 0% instead
# of raising KeyError.
rating_dist = (data['Credit_Rating']
               .value_counts(normalize=True)
               .reindex(rating_order, fill_value=0))
ax = rating_dist.plot(kind='bar', color=sns.color_palette('RdYlGn', len(rating_order)))
plt.title('Credit Rating Distribution')
plt.xlabel('Credit Rating')
plt.ylabel('Percentage')
# Write each bar's share as a percentage just above the bar.
for p in ax.patches:
    ax.annotate(f'{p.get_height()*100:.1f}%',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 10),
                textcoords='offset points')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

plt.figure(figsize=(14, 12))
corr = data[numeric_features + ['Score']].corr()
# Mask the upper triangle (diagonal included) so each pair appears once.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, vmin=-1, vmax=1)
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.show()

plt.figure(figsize=(10, 6))
# Sort once and reuse the same order for the bars and their value labels.
performance_sorted = pd.Series(model_performance).sort_values()
performance_sorted.plot(kind='barh', color='skyblue')
plt.title('Model Performance Comparison (AUC)')
plt.xlabel('AUC Score')
plt.xlim(0.5, 1.0)
for i, v in enumerate(performance_sorted):
    plt.text(v + 0.01, i, f"{v:.3f}", color='black', va='center')
plt.tight_layout()
plt.show()

data.to_csv('enhanced_scored_customers.csv', index=False)

print("Enhanced credit scoring pipeline completed successfully!")

# Recorded cell output: FileNotFoundError: [Errno 2] No such file or directory: '/content/train.csv'