In [99]:
import pandas as pd

train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')

In [100]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11879 entries, 0 to 11878
Data columns (total 27 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Id                                        11879 non-null  int64  
 1   Sex                                       10449 non-null  object 
 2   Sex Code                                  10426 non-null  object 
 3   State                                     10482 non-null  object 
 4   State Code                                11879 non-null  float64
 5   Year                                      7407 non-null   float64
 6   Year Code                                 7352 non-null   float64
 7   Ten-Year Age Groups                       10379 non-null  object 
 8   Ten-Year Age Groups Code                  10423 non-null  object 
 9   % of Total Deaths                         11879 non-null  float64
 10  Population                        

In [101]:
print(f"Rows Before: {len(train_data)}")

Rows Before: 11879


In [102]:
train_data.dropna(subset=["Target"] , inplace=True)

In [103]:
print(f"Rows After: {len(train_data)}")

Rows After: 7459


In [104]:

drop_cols = [
    'Id',
    'Sex',
    'temp_sensor_readout',
    'qc_flag_batch_3',
    'legacy_index_offset',
    'adjusted_pop_trend',
    'confidence_spread_metric',
    'Crude Rate Lower 95% Confidence Interval',
    'Crude Rate Upper 95% Confidence Interval',
    'Crude Rate Standard Error',
    'Year_dt',
    'State',
    'Crude Rate',
    'Population', 
    'Year Code',
    'Ten-Year Age Groups',
    'Contributing_Cause',
    'Underlying_Cause'
]

# Drop from train (includes temporal_alignment_proxy)
train_data.drop(columns=drop_cols + ['temporal_alignment_proxy'], inplace=True)

# Drop from test (no temporal_alignment_proxy)
test_data.drop(columns=drop_cols, inplace=True)

print(f"Train: {train_data.shape}")
print(f"Test: {test_data.shape}")

Train: (7459, 8)
Test: (5092, 7)


In [105]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7459 entries, 2 to 11878
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Sex Code                  6579 non-null   object 
 1   State Code                7459 non-null   float64
 2   Year                      4661 non-null   float64
 3   Ten-Year Age Groups Code  6531 non-null   object 
 4   % of Total Deaths         7459 non-null   float64
 5   State_Age_Combo           7459 non-null   object 
 6   Manner_of_Death           7459 non-null   int64  
 7   Target                    7459 non-null   float64
dtypes: float64(4), int64(1), object(3)
memory usage: 524.5+ KB


In [117]:
from sklearn.preprocessing import LabelEncoder

categorical_columns = ['Sex Code', 'Ten-Year Age Groups Code', 'State_Age_Combo']
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    
    # Combine train and test for this column to learn ALL categories
    combined = pd.concat([train_data[col], test_data[col]], axis=0).astype(str)
    
    # Fit on combined data
    le.fit(combined)
    
    # Transform both train and test
    train_data[col] = le.transform(train_data[col].astype(str))
    test_data[col] = le.transform(test_data[col].astype(str))
    
    label_encoders[col] = le


In [118]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7459 entries, 2 to 11878
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Sex Code                  7459 non-null   int32  
 1   State Code                7459 non-null   float64
 2   Year                      7459 non-null   float64
 3   Ten-Year Age Groups Code  7459 non-null   int32  
 4   % of Total Deaths         7459 non-null   float64
 5   State_Age_Combo           7459 non-null   int32  
 6   Manner_of_Death           7459 non-null   int64  
 7   Target                    7459 non-null   float64
dtypes: float64(4), int32(3), int64(1)
memory usage: 437.1 KB


In [119]:
from sklearn.preprocessing import LabelEncoder


encoder_sex = LabelEncoder()
encoder_age = LabelEncoder()
encoder_combo = LabelEncoder()

# Encode Sex Code
train_data['Sex Code'] = encoder_sex.fit_transform(train_data['Sex Code'].astype(str))
test_data['Sex Code'] = encoder_sex.transform(test_data['Sex Code'].astype(str))

# Encode Ten-Year Age Groups Code
train_data['Ten-Year Age Groups Code'] = encoder_age.fit_transform(train_data['Ten-Year Age Groups Code'].astype(str))
test_data['Ten-Year Age Groups Code'] = encoder_age.transform(test_data['Ten-Year Age Groups Code'].astype(str))


In [141]:
import xgboost as xgb
X = train_data.drop('Target', axis=1)
y = train_data['Target']


xgb_model = xgb.XGBRegressor(
    n_estimators=150,           # Fewer trees
    max_depth=4,                # Much shallower trees (was 6)
    learning_rate=0.05,         # Slower learning (was 0.1)
    subsample=0.7,              # Less data per tree (was 0.8)
    colsample_bytree=0.7,       # Fewer features per tree (was 0.8)
    reg_alpha=1.0,              # Stronger L1 regularization (was 0.1)
    reg_lambda=10.0,            # Much stronger L2 regularization (was 1.0)
    gamma=1.0,                  # Minimum loss reduction (was 0)
    min_child_weight=10,        # More samples needed (was 3)
    random_state=42,
    n_jobs=-1,
    tree_method='hist'
)

# Train the model
print("\nTraining XGBoost model...")
xgb_model.fit(X, y, verbose=False)


Training XGBoost model...


In [142]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

train_predictions = xgb_model.predict(X)
train_rmse = np.sqrt(mean_squared_error(y, train_predictions))
train_r2 = r2_score(y, train_predictions)

print(f"\nTraining RMSE: {train_rmse:.4f}")
print(f"Training R²: {train_r2:.4f}")


Training RMSE: 11748.0128
Training R²: 0.3187


In [143]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(xgb_model, X, y, cv=5, 
                            scoring='neg_mean_squared_error', n_jobs=-1)
cv_rmse = np.sqrt(-cv_scores.mean())

print(f"Training RMSE: {train_rmse:.2f}")
print(f"Cross-Validation RMSE: {cv_rmse:.2f}")
print(f"Difference: {cv_rmse - train_rmse:.2f}")

Training RMSE: 11748.01
Cross-Validation RMSE: 12807.92
Difference: 1059.91
