In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings

# Suppress every warning globally for the remainder of the run.
warnings.filterwarnings(action="ignore")

# Read the training set, the test set and the submission template (in that
# order) from the current working directory.
train_data, test_data, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Quick look at the raw data: print the first five rows of each frame.
print("Initial Training Data:\n", train_data.head(n=5))
print("Initial Test Data:\n", test_data.head(n=5))

# Feature engineering (currently disabled): add LoanToIncomeRatio and FinancialRiskScore
# train_data['LoanToIncomeRatio'] = train_data['LoanAmount'] / train_data['Income']
# train_data['FinancialRiskScore'] = train_data['CreditScore'] - train_data['LoanToIncomeRatio'] + (train_data['MonthsEmployed'] / 12)
# test_data['LoanToIncomeRatio'] = test_data['LoanAmount'] / test_data['Income']
# test_data['FinancialRiskScore'] = test_data['CreditScore'] - test_data['LoanToIncomeRatio'] + (test_data['MonthsEmployed'] / 12)

# Remove the 'LoanID' and 'LoanTerm' columns from both data sets so that
# neither ends up in the feature matrix.
train_data = train_data.drop(columns=['LoanID', 'LoanTerm'])
test_data = test_data.drop(columns=['LoanID', 'LoanTerm'])

# Separate the target column ('Default') from the remaining feature columns.
y = train_data['Default']
X = train_data.drop(columns='Default')

# Encode categorical variables.
# The training frame defines the feature space: one-hot encode it and drop the
# first level of every categorical column (that level becomes the implicit
# reference, represented by all-zero indicators).
X_encoded = pd.get_dummies(X, drop_first=True)

# Encode the test frame WITHOUT drop_first. Dropping the first level
# independently on each frame is unsafe: when the test set lacks the training
# reference level (or contains a level that sorts before it), a *different*
# level is dropped there, and after column alignment the affected rows are
# silently encoded as the wrong category.
test_encoded = pd.get_dummies(test_data)

# Training columns that are absent from the encoded test frame; kept so the
# mismatch can still be inspected.
missing_cols = set(X_encoded.columns) - set(test_encoded.columns)

# Ensure train and test data have the same columns, in the same order:
#   * columns absent from the test frame are added and filled with 0,
#   * columns the training frame does not have (including the indicators of
#     the training reference levels) are discarded.
test_encoded = test_encoded.reindex(columns=X_encoded.columns, fill_value=0)

# Hold out 20% of the labelled rows as a validation set; the split is
# stratified on the target and seeded for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    stratify=y,
    random_state=96,
)

# Standardise the features. The scaler is fitted on the training part only
# and then applied unchanged to the training, validation and test matrices.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled_full = (
    scaler.transform(features) for features in (X_train, X_val, test_encoded)
)

# Fit a gradient-boosting classifier (library defaults, fixed seed) on the
# scaled training data.
model = GradientBoostingClassifier(random_state=96).fit(X_train_scaled, y_train)

# Score the fitted model on the held-out validation rows and print the
# accuracy followed by the per-class report.
y_val_pred = model.predict(X_val_scaled)
print("Validation Accuracy:", accuracy_score(y_true=y_val, y_pred=y_val_pred))
print(
    "\nValidation Classification Report:",
    classification_report(y_true=y_val, y_pred=y_val_pred),
    sep="\n",
)

# Predict a class label for every row of the scaled test matrix.
test_predictions = model.predict(X_test_scaled_full)

# Put the predictions into the 'Default' column of a copy of the submission
# template and write it to disk without the index column.
submission = sample_submission.assign(Default=test_predictions)
submission.to_csv('submission.csv', index=False)
print("Submission file created at 'submission.csv'")

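# 88.758  (NOTE(review): presumably the score obtained with this pipeline, in percent — confirm metric and data set)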