In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# For data preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# For modeling
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Models
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [None]:

# --- Data loading ---
# Read the train/test splits and the sample submission from ./data.
print("Loading data...")
train = pd.read_csv('./data/Train.csv')
test = pd.read_csv('./data/Test.csv')
sample_submission = pd.read_csv('./data/SampleSubmission.csv')

# Raw dimensions of both splits
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Tag each row with its origin so the two splits can be stacked, transformed
# together, and separated again afterwards.
train['is_train'], test['is_train'] = 1, 0
# The test rows carry a -1 placeholder in 'target'.
test['target'] = -1
combined = pd.concat((train, test), ignore_index=True)

# Per-column count of missing entries, largest first
missing_counts = combined.isnull().sum()
print("\nMissing values before imputation:")
print(missing_counts.sort_values(ascending=False))

# --- Feature engineering ---
print("\nPerforming feature engineering...")

# Any column whose name contains 'Date' is parsed as a timestamp (values that
# cannot be parsed become NaT) and expanded into calendar features.
date_cols = [name for name in combined.columns if 'Date' in name]
for col in date_cols:
    combined[col] = pd.to_datetime(combined[col], errors='coerce')
    stamp = combined[col].dt
    weekday = stamp.dayofweek

    # Calendar components
    combined[f'{col}_year'] = stamp.year
    combined[f'{col}_month'] = stamp.month
    combined[f'{col}_day'] = stamp.day
    combined[f'{col}_dayofweek'] = weekday
    combined[f'{col}_quarter'] = stamp.quarter

    # 1 when the weekday index is 5 or 6 (Saturday/Sunday in pandas'
    # Monday=0 numbering), otherwise 0
    combined[f'{col}_is_weekend'] = weekday.isin([5, 6]).astype(int)

# Day-count features derived from pairs of date columns
policy_start = combined['Policy Start Date']
# Length of the policy: end date minus start date
combined['policy_duration'] = (combined['Policy End Date'] - policy_start).dt.days
# Gap between the first transaction and the policy start
combined['days_since_first_transaction'] = (policy_start - combined['First Transaction Date']).dt.days

# Bucket Age into labelled bands; values outside (0, 100] fall in no band
# and are left missing by pd.cut.
age_bin_edges = [0, 25, 35, 45, 55, 65, 100]
age_bin_labels = ['<25', '25-35', '35-45', '45-55', '55-65', '65+']
combined['age_group'] = pd.cut(combined['Age'], bins=age_bin_edges, labels=age_bin_labels)

# --- Categorical cleanup ---
print("\nHandling categorical features...")

# Collapse the non-individual and unspecified Gender labels into two buckets;
# any label not listed here is left untouched by replace().
gender_mapping = {
    raw_label: bucket
    for bucket, raw_labels in (
        ('Other', ('Entity', 'Joint Gender')),
        ('Unknown', ('NOT STATED', 'NO GENDER', 'SEX')),
    )
    for raw_label in raw_labels
}
combined['Gender'] = combined['Gender'].replace(gender_mapping)

# Bucket the No_Pol count into labelled ranges (right-closed intervals, so a
# value of exactly 1 is 'Single' and anything above 10 is '10+').
no_pol_bin_edges = [0, 1, 3, 5, 10, float('inf')]
no_pol_bin_labels = ['Single', '2-3', '4-5', '6-10', '10+']
combined['No_Pol_group'] = pd.cut(combined['No_Pol'], bins=no_pol_bin_edges, labels=no_pol_bin_labels)

# Identify categorical columns.
# select_dtypes(include=['object', 'category']) already returns 'age_group'
# and 'No_Pol_group' because pd.cut produces category dtype. Appending them
# unconditionally would list each name twice, and a duplicated name is imputed
# and one-hot encoded twice by the ColumnTransformer that consumes this list.
excluded_cols = {'ID', 'target', *date_cols}
cat_cols = [
    name for name in combined.select_dtypes(include=['object', 'category']).columns
    if name not in excluded_cols
]
# Keep the engineered groups even if they are ever stored with another dtype,
# but never add a name that is already present.
for engineered_col in ('age_group', 'No_Pol_group'):
    if engineered_col not in cat_cols:
        cat_cols.append(engineered_col)

# Identify numerical columns: the raw/derived numeric features plus every
# column whose name carries one of the date-feature suffixes.
date_feature_tokens = ('_year', '_month', '_day', '_dayofweek', '_quarter', '_is_weekend')
num_cols = ['Age', 'No_Pol', 'policy_duration', 'days_since_first_transaction']
num_cols.extend(
    name for name in combined.columns
    if name not in num_cols and any(token in name for token in date_feature_tokens)
)

# --- Missing-value handling ---
# Statistics are taken over the combined (train + test) frame.
print("\nHandling missing values...")

# Numerical features: fill gaps with the column median
for col in num_cols:
    values = combined[col]
    if values.isnull().any():
        combined[col] = values.fillna(values.median())

# Categorical features: fill gaps with the most frequent value
for col in cat_cols:
    values = combined[col]
    if values.isnull().any():
        combined[col] = values.fillna(values.mode()[0])

# Report how many distinct values each categorical feature has
print("\nCategorical feature cardinality:")
for col in cat_cols:
    print(f"{col}: {combined[col].nunique()} unique values")

# Categorical features with more than 10 distinct values additionally get a
# numeric '<col>_freq' column holding each value's relative frequency.
# NOTE(review): these columns are not removed from cat_cols here, so they are
# still one-hot encoded later as well - confirm that is intended.
high_card_cols = [name for name in cat_cols if combined[name].nunique() > 10]
for col in high_card_cols:
    freq_col = f'{col}_freq'
    freq_map = combined[col].value_counts(normalize=True).to_dict()
    combined[freq_col] = combined[col].map(freq_map)
    num_cols.append(freq_col)

# --- Undo the train/test stacking ---
# Rows are routed by the is_train flag; the raw date columns are dropped in
# the same step because their information now lives in the derived features.
# The test frame also loses its placeholder 'target'.
train_rows = combined[combined['is_train'] == 1]
test_rows = combined[combined['is_train'] == 0]
train_processed = train_rows.drop(columns=['is_train'] + date_cols)
test_processed = test_rows.drop(columns=['is_train', 'target'] + date_cols)

# Model inputs and label ('ID' is an identifier, not a feature)
y = train_processed['target']
X = train_processed.drop(columns=['ID', 'target'])
X_test = test_processed.drop(columns=['ID'])

# Restrict both column lists to names that actually exist in X
available_features = set(X.columns)
cat_cols = [name for name in cat_cols if name in available_features]
num_cols = [name for name in num_cols if name in available_features]

print(f"\nFeatures: {X.shape[1]} columns")
print(f"Categorical: {len(cat_cols)} columns")
print(f"Numerical: {len(num_cols)} columns")

# Relative frequency of each target class in the training data
print("\nTarget distribution:")
print(y.value_counts(normalize=True))

# --- Preprocessing definition ---
# Numeric branch: median imputation followed by standardisation.
numeric_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation, then dense one-hot encoding
# that ignores categories not seen during fit.
categorical_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column list to its branch; columns in neither list pass through
# unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_branch, num_cols),
        ('cat', categorical_branch, cat_cols),
    ],
    remainder='passthrough',
)

# --- Hold-out split ---
# 80/20 split, stratified on the target so both parts keep its class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("\nTraining models...")

# Shuffled 5-fold stratified splitter (not referenced again in the visible code)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- LightGBM ---
# Pipeline order: preprocess -> SMOTE oversampling -> classifier.
print("Training LightGBM model...")
lgbm_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1,
)
lgbm_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LGBMClassifier(**lgbm_params)),
])

# Fit on the training part, then predict the held-out part
lgbm_pipeline.fit(X_train, y_train)
y_val_pred_lgbm = lgbm_pipeline.predict(X_val)

# Validation F1 plus the per-class report
val_f1_lgbm = f1_score(y_val, y_val_pred_lgbm)
print(f"LightGBM Validation F1 Score: {val_f1_lgbm:.4f}")
print(classification_report(y_val, y_val_pred_lgbm))

# --- XGBoost ---
# Same pipeline layout as the LightGBM model: preprocess -> SMOTE -> classifier.
print("\nTraining XGBoost model...")
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    random_state=42,
    n_jobs=-1,
)
xgb_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', XGBClassifier(**xgb_params)),
])

# Fit on the training part, then predict the held-out part
xgb_pipeline.fit(X_train, y_train)
y_val_pred_xgb = xgb_pipeline.predict(X_val)

# Validation F1 plus the per-class report
val_f1_xgb = f1_score(y_val, y_val_pred_xgb)
print(f"XGBoost Validation F1 Score: {val_f1_xgb:.4f}")
print(classification_report(y_val, y_val_pred_xgb))

# --- Soft-voting ensemble of the two boosted models ---
print("\nCreating ensemble model...")

# The voting classifier is fed arrays rather than a pipeline, so the
# preprocessing and the SMOTE resampling are applied by hand first.
X_train_prep = preprocessor.fit_transform(X_train)
X_val_prep = preprocessor.transform(X_val)
X_test_prep = preprocessor.transform(X_test)

# Oversample the training part only
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_prep, y_train)

# Combine the two classifier steps by averaging predicted probabilities
ensemble = VotingClassifier(
    estimators=[
        ('lgbm', lgbm_pipeline.named_steps['classifier']),
        ('xgb', xgb_pipeline.named_steps['classifier']),
    ],
    voting='soft',
)
ensemble.fit(X_train_smote, y_train_smote)

# Score the ensemble on the held-out part
y_val_pred_ensemble = ensemble.predict(X_val_prep)
val_f1_ensemble = f1_score(y_val, y_val_pred_ensemble)
print(f"Ensemble Validation F1 Score: {val_f1_ensemble:.4f}")
print(classification_report(y_val, y_val_pred_ensemble))

# --- Model selection ---
# Highest validation F1 wins; on a tie the earlier entry is kept.
val_scores = {
    'lgbm': val_f1_lgbm,
    'xgb': val_f1_xgb,
    'ensemble': val_f1_ensemble,
}
best_model_name = max(val_scores, key=val_scores.get)
print(f"\nBest model: {best_model_name}")

# The two pipelines preprocess internally and take the raw test features;
# the ensemble expects the already-preprocessed matrix.
pipeline_predictors = {
    'lgbm': (lgbm_pipeline, X_test),
    'xgb': (xgb_pipeline, X_test),
}
chosen_model, chosen_inputs = pipeline_predictors.get(best_model_name, (ensemble, X_test_prep))
test_preds = chosen_model.predict(chosen_inputs)

# --- Submission ---
# Pair every test ID with its predicted label and write it without the index.
submission = pd.DataFrame({'ID': test_processed['ID'], 'target': test_preds})
submission.to_csv('improved_submission.csv', index=False)
print("\nSubmission file created: improved_submission.csv")

# Feature importance for LGBM (if it's the best model)
if best_model_name == 'lgbm':
    print("\nTop 20 features by importance (LGBM):")
    lgbm_model = lgbm_pipeline.named_steps['classifier']
    # The classifier is trained on the preprocessor's output, where every
    # categorical column has been expanded into one column per category.
    # feature_importances_ therefore has one entry per transformed column, so
    # the names must come from the fitted transformer; num_cols + cat_cols has
    # the pre-encoding length and makes the DataFrame constructor raise.
    feature_names = lgbm_pipeline.named_steps['preprocessor'].get_feature_names_out()
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': lgbm_model.feature_importances_
    })
    # Keep only the 20 highest-ranked features
    feature_importance = feature_importance.sort_values('importance', ascending=False).head(20)
    print(feature_importance)