In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier


In [3]:
# Read the development sample from the working directory into `df`.
# NOTE(review): the same CSV is read again under other names (`dev_sample`,
# `dev_df`) further down this file — confirm whether `df` is still needed.
df = pd.read_csv('development_sample.csv')

In [23]:
%pip install ace_tools

Collecting ace_tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace_tools
Successfully installed ace_tools-0.0
Note: you may need to restart the kernel to use updated packages.


In [20]:
# Reload the uploaded files
dev_sample = pd.read_csv('development_sample.csv')
test_sample = pd.read_csv('testing_sample.csv')
var_description = pd.read_excel('Variables_description.xlsx')

# Work on copies so the frames as loaded (dev_sample / test_sample) stay untouched
dev_df = dev_sample.copy()
test_df = test_sample.copy()

# Convert application_date to datetime. errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
# NOTE(review): no explicit format= is passed, so parsing relies on pandas'
# inference — consider pinning the format once the raw layout is confirmed.
dev_df['application_date'] = pd.to_datetime(dev_df['application_date'], errors='coerce')
test_df['application_date'] = pd.to_datetime(test_df['application_date'], errors='coerce')

# Drop rows with null target (only in development data, as it's our training set).
# The explicit .copy() makes the filtered frame an independent object, so the
# column assignments below (and in later cells) do not trigger
# SettingWithCopyWarning or write into a temporary.
dev_df = dev_df.loc[dev_df['target'].notna()].copy()

# Identify categorical columns
categorical_cols = ['Application_status', 'Var3', 'Var13']
for col in categorical_cols:
    # Infer ONE categorical dtype from the values of both frames, so that the
    # category set — and therefore the integer codes — is identical in the
    # development and test data even when a level occurs in only one of them.
    shared_dtype = (
        pd.concat([dev_df[col], test_df[col]], ignore_index=True)
        .astype('category')
        .dtype
    )
    dev_df[col] = dev_df[col].astype(shared_dtype)
    test_df[col] = test_df[col].astype(shared_dtype)

In [21]:
# Drop non-predictive identifiers (and the raw timestamp once its parts are extracted)
drop_cols = ['ID', 'customer_id', 'application_date']


def _add_date_parts(frame):
    """Return a new frame with calendar features taken from ``application_date``.

    Appends ``application_year``, ``application_month`` and
    ``application_dayofweek`` (in that order) after the existing columns.
    ``frame['application_date']`` must already be datetime-typed, because the
    ``.dt`` accessor is used. The input frame itself is not modified.
    """
    stamp = frame['application_date'].dt
    return frame.assign(
        application_year=stamp.year,
        application_month=stamp.month,
        application_dayofweek=stamp.dayofweek,
    )


# Extract date features, then drop the columns listed above.
# A helper is used instead of `for df in [...]` so that the notebook-level name
# `df` is not rebound by a loop variable, and so that no column is written in
# place onto a frame that may be a filtered slice.
dev_df = _add_date_parts(dev_df).drop(columns=drop_cols)
test_df = _add_date_parts(test_df).drop(columns=drop_cols)

# Summarize missing values: per-column null counts in the development frame,
# largest first, keeping only columns that have at least one null
missing_summary = dev_df.isnull().sum().sort_values(ascending=False)
missing_summary = missing_summary[missing_summary > 0]

In [26]:
# (rows, columns) of each frame, one frame per line
print(
    f"Development data shape: {dev_df.shape}",
    f"Test data shape: {test_df.shape}",
    sep="\n",
)

# Every column name in the development frame
print("Columns in development data:", list(dev_df.columns))

# Leading five rows of the development frame
print(dev_df.head(n=5))

# Count of each target value, with missing values counted as their own row
print("Target distribution:", dev_df['target'].value_counts(dropna=False), sep="\n")

Development data shape: (36718, 35)
Test data shape: (5000, 35)
Columns in development data: ['target', 'Application_status', 'Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8', 'Var9', 'Var10', 'Var11', 'Var12', 'Var13', 'Var14', 'Var15', 'Var16', 'Var17', 'Var18', 'Var19', 'Var20', 'Var21', 'Var22', 'Var23', 'Var24', 'Var25', 'Var26', 'Var27', 'Var28', 'Var29', 'Var30', 'application_year', 'application_month', 'application_dayofweek']
   target Application_status  Var1  Var2 Var3   Var4  Var5  Var6     Var7  \
0     0.0           Approved     1   2.0    1   7800    99     1   108.73   
1     0.0           Approved     1   1.0    2  11100    78     1   195.99   
2     0.0           Approved     2   3.0    1   2400    15     1   248.34   
3     0.0           Approved     3   1.0    2  11800    30     6  3538.68   
5     0.0           Approved     1   2.0    1   7200    36     1   267.19   

      Var8  ...  Var24    Var25     Var26  Var27 Var28  Var29  Var30  \
0      NaN  

In [31]:
import pandas as pd

# Read both samples and the variable description sheet from the working directory.
# NOTE(review): this rebinds dev_df / test_df to the frames exactly as read from
# disk, so anything done to those names by cells above is discarded from here
# on — confirm that is intended.
dev_df = pd.read_csv('development_sample.csv')
test_df = pd.read_csv('testing_sample.csv')
var_desc = pd.read_excel('Variables_description.xlsx')

# (rows, columns) of each frame, one frame per line
print(
    f"Development data shape: {dev_df.shape}",
    f"Test data shape: {test_df.shape}",
    sep="\n",
)

# Every column name in the development frame
print("Columns in development data:", dev_df.columns.to_list())

# Leading five rows of the development frame
print(dev_df.iloc[:5])

# Count of each target value, with missing values counted as their own row
print("Target distribution:", dev_df['target'].value_counts(dropna=False), sep="\n")


Development data shape: (50000, 35)
Test data shape: (5000, 35)
Columns in development data: ['ID', 'customer_id', 'application_date', 'target', 'Application_status', 'Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8', 'Var9', 'Var10', 'Var11', 'Var12', 'Var13', 'Var14', 'Var15', 'Var16', 'Var17', 'Var18', 'Var19', 'Var20', 'Var21', 'Var22', 'Var23', 'Var24', 'Var25', 'Var26', 'Var27', 'Var28', 'Var29', 'Var30']
         ID  customer_id   application_date  target Application_status  Var1  \
0  11034977     32537148  01Feb2010 0:00:00     0.0           Approved     1   
1  11034978     32761663  01Feb2010 0:00:00     0.0           Approved     1   
2  11034979     32701063  01Feb2010 0:00:00     0.0           Approved     2   
3  11034980     32386786  01Feb2010 0:00:00     0.0           Approved     3   
4  11034981     32692110  02Feb2010 0:00:00     NaN           Rejected     1   

   Var2 Var3   Var4  Var5  ...  Var21  Var22  Var23  Var24    Var25     Var26  \
0   2.0   