In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import joblib
import os
import warnings
import xgboost 
from sklearn.decomposition import PCA

def custom_warning_handler(message, category, filename, lineno, file=None, line=None):
    print("Model Created...")
warnings.showwarning = custom_warning_handler

In [2]:
# Custom RMSE function
def normalized_rmse(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse / np.std(y_true)

# Function to train the XGBoost model for a given emotion
def train_xgboost_for_emotion(df, emotion, save_results):
    # Define the features and target
    X = df.iloc[:, 4:-53]
    y = df[emotion]

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=207026618)

    # StandardScaler for X
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    
    # PCA
    pca = PCA()
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    explained_variance = np.cumsum(pca.explained_variance_ratio_)
    n_components = np.argmax(explained_variance >= 0.9999) + 1
    pca = PCA(n_components=n_components)
    X_train_pca_reduced = pca.fit_transform(X_train_scaled)
    X_test_pca_reduced = pca.transform(X_test_scaled)
    pca_loadings = pca.components_.T
    X_train_reduced = X_train_pca_reduced
    X_test_reduced = X_test_pca_reduced
    
    
    # Create bins for y_train for StratifiedKFold
    n_bins = int(np.sqrt(len(y_train)))
    y_train_binned = pd.cut(y_train, bins=n_bins, labels=False)

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=207026618)

    # Define the parameter grid
    param_grid = {
    'n_estimators': [100, 200, 300],  # Expanding n_estimators to higher values for better performance
    'max_depth': [3, 5, 7],  # Testing both simpler and more complex trees
    'learning_rate': [0.01, 0.05, 0.1],  # Trying faster learning rates
    'min_child_weight': [1, 3, 5],  # Exploring different levels of regularization
    'subsample': [0.6],  # Experimenting with more/less subsampling
    'colsample_bytree': [0.6],  # Testing different levels of feature subsampling
    'tree_method': 'hist',  # Keeping the efficient histogram method
    'device': 'cuda',  # Using GPU acceleration
    'random_state': 207026618  # Keeping random_state fixed for reproducibility
    }

    # Initialize variables to store best model and score
    best_params = None
    best_score = float('inf')
    fit=1
    # Perform grid search with cross-validation
    for n_estimators in param_grid['n_estimators']:
        for max_depth in param_grid['max_depth']:
            for learning_rate in param_grid['learning_rate']:
                for min_child_weight in param_grid['min_child_weight']:
                    for subsample in param_grid['subsample']:
                        for colsample_bytree in param_grid['colsample_bytree']:
                            cv_scores = []
                            for train_index, val_index in skf.split(X_train_reduced, y_train_binned):
                                X_train_cv, X_val_cv = X_train_reduced[train_index], X_train_reduced[val_index]
                                y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[val_index]
                                
                                model = XGBRegressor(
                                    n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    learning_rate=learning_rate,
                                    min_child_weight=min_child_weight,
                                    subsample=subsample,
                                    colsample_bytree=colsample_bytree,
                                    tree_method='hist',
                                    device='cuda',
                                    random_state=207026618
                                )
                                model.fit(X_train_cv, y_train_cv)
                                y_pred_cv = model.predict(X_val_cv)
                                rmse = np.sqrt(mean_squared_error(y_val_cv, y_pred_cv))
                                cv_scores.append(rmse)
                                print(fit)
                                fit=fit+1
                            mean_cv_score = np.mean(cv_scores)
                            if mean_cv_score < best_score:
                                best_score = mean_cv_score
                                best_params = {
                                    'n_estimators': n_estimators,
                                    'max_depth': max_depth,
                                    'learning_rate': learning_rate,
                                    'min_child_weight': min_child_weight,
                                    'subsample': subsample,
                                    'colsample_bytree': colsample_bytree,
                                    'tree_method': 'hist',
                                    'device':'cuda',
                                    'random_state': 207026618
                                }

    # Train the final model with the best parameters
    best_model = XGBRegressor(**best_params)
    best_model.fit(X_train_reduced, y_train)

    # Make predictions on the test set
    y_pred = best_model.predict(X_test_reduced)

    # Calculate RMSE and Normalized RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    norm_rmse = normalized_rmse(y_test, y_pred)

    # Save predictions and true values to CSV
    results_df = pd.DataFrame({'y_true': y_test, 'y_pred': y_pred})
    results_df.to_csv(f'{save_results}\\predictions\\{emotion}_predictions.csv', index=False)

    # Save RMSE and Normalized RMSE to CSV
    metrics_df = pd.DataFrame({'Y': [emotion], 'RMSE': [rmse], 'Normalized RMSE': [norm_rmse]})
    metrics_df.to_csv(f'{save_results}\\{emotion}_rmse_metrics.csv', index=False)

    # Save the model and scaler for future use
    joblib.dump(best_model, f'{save_results}\\joblib\\{emotion}_best_xgboost_model.joblib')
    joblib.dump(scaler, f'{save_results}\\joblib\\{emotion}_scaler.joblib')

# Main code to run the function for all emotions
def run_for_all_emotions(df, emotions, save_results):
    for emotion in emotions:
        print(f"Training XGBoost model for emotion: {emotion}")
        train_xgboost_for_emotion(df, emotion, save_results)

# Example usage:
path_data = "Y:\\Data Science Readings\\Applied Project Semester B\\final_data_merged.csv"
save_results = "Y:\\Data Science Readings\\Applied Project Semester B\\xgboost\\xgboost_original_data\\"
df = pd.read_csv(path_data)

emotions = ['joy','despair','interest','valence','arousal']
emotions = ['interest','valence','arousal']

run_for_all_emotions(df, emotions, save_results)


Training XGBoost model for emotion: interest
Model Created...
Model Created...
1
2
3
4
5
Model Created...
6
7
8
9
10
Model Created...
11
12
13
14
15
Model Created...
16
17
18
19
20
Model Created...
21
22
23
24
25
Model Created...
26
27
28
29
30
Model Created...
31
32
33
34
35
Model Created...
36
37
38
39
40
Model Created...
41
42
43
44
45
Model Created...
46
47
48
49
50
Model Created...
51
52
53
54
55
Model Created...
56
57
58
59
60
Model Created...
61
62
63
64
65
Model Created...
66
67
68
69
70
Model Created...
71
72
73
74
75
Model Created...
76
77
78
79
80
Model Created...
81
82
83
84
85
Model Created...
86
87
88
89
90
Model Created...
91
92
93
94
95
Model Created...
96
97
98
99
100
Model Created...
101
102
103
104
105
Model Created...
106
107
108
109
110
Model Created...
111
112
113
114
115
Model Created...
116
117
118
119
120
Model Created...
121
122
123
124
125
Model Created...
126
127
128
129
130
Model Created...
131
132
133
134
135
Model Created...
136
137
138
139
140
Model Crea