In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import r2_score
import psutil
import time
import random 

## Data Preprocess

In [2]:
data = pd.read_csv('philippines_model.csv')

  data = pd.read_csv('philippines_model.csv')


In [3]:
columns_to_drop = ['emotional_valid_count', 'emotional_valid_sum', 'emotional_avg', 'physical_valid_count', 'physical_valid_sum', 'physical_avg','sexual_valid_count','sexual_valid_sum', 'sexual_avg', 'overall_valid_count', 'overall_valid_sum', 'unique_id', 'caseid']
data_cleaned = data.drop(columns=columns_to_drop)

In [5]:
# Delete one value dominated columns
threshold = 0.8

columns_to_drop = [col for col in data_cleaned.columns if data_cleaned[col].value_counts(normalize=True).max() >= threshold]

data_cleaned_1 = data_cleaned.drop(columns=columns_to_drop)

print(data_cleaned_1.shape)

(35961, 615)


In [6]:
# Delete highly correlated columns
y = 'overall_avg'
year = 'v007'

data_numeric = data_cleaned_1.drop(columns=[y]).select_dtypes(include=['int64', 'float64'])
corr_matrix = data_numeric.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))


threshold = 0.9
columns_to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

data_cleaned_2 = data_cleaned_1.drop(columns=columns_to_drop)
data_cleaned_2[year] = data_cleaned_1[year]
data_cleaned_2["overall_avg"] = data_cleaned_2[y]

print(data_cleaned_2.shape)

(35961, 407)


## XGBoost

In [10]:
random.seed(0)

X = data_cleaned_2.drop(columns=['overall_avg'])  # 输入特征
y = data_cleaned_2['overall_avg']  # 目标变量


# threshold for catgorical variables
unique_threshold = 18  

for col in X.select_dtypes(include=['int']).columns:
    if X[col].nunique() < unique_threshold:
        X[col] = X[col].astype('category')

for col in X.select_dtypes(include=['object']).columns:
    X[col] = X[col].astype('category')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


start_time = time.time()
process = psutil.Process()
start_mem = process.memory_info().rss / 1024 ** 2 


model = xgb.XGBRegressor(missing=np.nan, enable_categorical=True)

model.fit(X_train, y_train)

train_time = time.time() - start_time
end_mem = process.memory_info().rss / 1024 ** 2  # 转换为 MB


y_pred = model.predict(X_test)

# result
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')
print(f'Train time: {train_time:.2f} seconds')
print(f'Memory used during training: {end_mem - start_mem:.2f} MB')


Mean Squared Error: 0.011765095801433128
Root Mean Squared Error: 0.10846702633258241
Mean Absolute Error: 0.03846933972079262
R-squared: 0.8472037033441899
Train time: 2.31 seconds
Memory used during training: 129.77 MB


## LightGBM

In [12]:
random.seed(0)

X = data_cleaned_2.drop(columns=['overall_avg'])  # 输入特征
y = data_cleaned_2['overall_avg']  # 目标变量


unique_threshold = 18  

for col in X.select_dtypes(include=['int']).columns:
    if X[col].nunique() < unique_threshold:
        X[col] = X[col].astype('category')

for col in X.select_dtypes(include=['object']).columns:
    X[col] = X[col].astype('category')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_features = [
    i for i, col in enumerate(X_train.columns) if X_train[col].dtype.name == 'category']


process = psutil.Process()
start_mem = process.memory_info().rss / 1024 ** 2  

params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'max_bin': 400,
    'num_leaves': 30,
    'learning_rate': 0.1,
    'verbose': -1
}

train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)


start_time = time.time()

model = lgb.train(
    params,
    train_data,
    num_boost_round=100, 
    valid_sets=[valid_data],
    callbacks=[lgb.early_stopping(stopping_rounds=7)]  
)

train_time = time.time() - start_time
end_mem = process.memory_info().rss / 1024 ** 2 

y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# result
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')
print(f'Train time: {train_time:.2f} seconds')
print(f'Memory used during training: {end_mem - start_mem:.2f} MB')


Training until validation scores don't improve for 7 rounds
Early stopping, best iteration is:
[78]	valid_0's l2: 0.0102848
Mean Squared Error: 0.010284768197090196
Root Mean Squared Error: 0.10141384618034263
Mean Absolute Error: 0.03556501736053339
R-squared: 0.8664290950960714
Train time: 1.05 seconds
Memory used during training: 325.25 MB


## CatBoost

In [13]:
random.seed(0)

X = data_cleaned_2.drop(columns=['overall_avg'])  # 输入特征
y = data_cleaned_2['overall_avg']  # 目标变量

for col in X.select_dtypes(include=['category', 'object']).columns:
    X[col] = X[col].astype(str).fillna('NaN')

unique_threshold = 18 

for col in X.select_dtypes(include=['int']).columns:
    if X[col].nunique() < unique_threshold:
        X[col] = X[col].astype('category')

for col in X.select_dtypes(include=['object']).columns:
    X[col] = X[col].astype('category')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_features = X_train.select_dtypes(include=['category']).columns.tolist()

start_time = time.time()
process = psutil.Process()
start_mem = process.memory_info().rss / 1024 ** 2  # 转换为 MB

model = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=6, loss_function='RMSE', verbose=False)

model.fit(X_train, y_train, cat_features=categorical_features)


train_time = time.time() - start_time
y_pred = model.predict(X_test)

# result
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')
print(f'Train time: {train_time:.2f} seconds')
print(f'Memory used during training: {end_mem - start_mem:.2f} MB')


Mean Squared Error: 0.010017138115420798
Root Mean Squared Error: 0.10008565389415607
Mean Absolute Error: 0.03551231746109955
R-squared: 0.8699048751528554
Train time: 29.91 seconds
Memory used during training: 314.58 MB
