In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import mean_squared_error
import time
import os
import pickle

# Constants
SAVE_MODEL_LOCATION = "Y:\\Data Science Readings\\Applied Project Semester B\\xgboost\\Lasso_Aggregate_data\\"
DATA_LOCATION = "Y:\\Data Science Readings\\Applied Project Semester B\\Question 4\\New_Aggregation_Data.csv"
MODEL_TYPE = "Lasso"

def initial_feature_prep(data_loc):
    """Load the CSV at ``data_loc`` and keep only the numbered feature columns.

    A column is kept when its name starts with ``"Feature_"`` and the part of
    the name from character index 8 onward consists solely of digits
    (for example ``Feature_12``). Every other column is dropped.

    Args:
        data_loc: Path of the CSV file to read.

    Returns:
        A DataFrame holding the selected columns in their original order.
    """
    frame = pd.read_csv(data_loc)

    feature_cols = []
    for name in frame.columns:
        # Index 8 is the length of the "Feature_" prefix, so name[8:] is the suffix.
        has_prefix = name.startswith("Feature_")
        if has_prefix and name[8:].isdigit():
            feature_cols.append(name)

    return frame[feature_cols]

def get_model():
    """Return the ``(estimator, param_grid)`` pair registered under ``MODEL_TYPE``.

    Only a ``"Lasso"`` entry is registered: a seeded ``Lasso`` estimator
    together with a grid of 50 evenly spaced ``alpha`` values between
    0.0001 and 0.1.

    Raises:
        KeyError: If ``MODEL_TYPE`` has no entry in the registry.
    """
    alpha_grid = np.linspace(0.0001, 0.1, 50)
    lasso_entry = (Lasso(random_state=207026618), {'alpha': alpha_grid})

    registry = {"Lasso": lasso_entry}
    estimator, search_space = registry[MODEL_TYPE]
    return estimator, search_space

# Target columns; one model is trained per entry.
Y_val_names = ['arousal','valence','interest','despair','joy']

# Data preparation
# NOTE: the CSV is read here for the target columns and a second time inside
# initial_feature_prep for the feature columns.
data = pd.read_csv(DATA_LOCATION)
x_data = initial_feature_prep(data_loc=DATA_LOCATION)

# Split the data first so the scaler and PCA below are fitted on training rows only
X_train, X_test, y_train, y_test = train_test_split(x_data, data[Y_val_names], test_size=0.2, random_state=207026618)

# Scaling (statistics come from the training split; the test split is only transformed)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# PCA with every component kept, used to read off the cumulative explained variance
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

explained_variance = np.cumsum(pca.explained_variance_ratio_)
# Smallest number of components whose cumulative ratio reaches the threshold.
# np.argmax on an all-False mask returns 0, which would silently select a single
# component, so fall back to keeping every component when the threshold is never reached.
threshold_reached = explained_variance >= 0.9999
if threshold_reached.any():
    n_components = int(np.argmax(threshold_reached)) + 1
else:
    n_components = len(explained_variance)
print(f'Number of components to explain 99.99% variance: {n_components}')

# Refit with the reduced component count. With the default svd_solver='auto',
# scikit-learn may pick the randomized solver depending on the data shape and the
# requested n_components, so the seed is fixed like every other stochastic step here.
pca = PCA(n_components=n_components, random_state=207026618)
X_train_pca_reduced = pca.fit_transform(X_train_scaled)
X_test_pca_reduced = pca.transform(X_test_scaled)
# Shape (n_original_features, n_components); used later to map coefficients back to features
pca_loadings = pca.components_.T
print(f'Reduced train data shape: {X_train_pca_reduced.shape}')
print(f'Reduced test data shape: {X_test_pca_reduced.shape}')

X_train_reduced = pd.DataFrame(X_train_pca_reduced).add_prefix("xnew_")
X_test_reduced = pd.DataFrame(X_test_pca_reduced).add_prefix("xnew_")

# Update train_test_list: one (X_train, X_test, y_train, y_test) tuple per emotion,
# in the same order as Y_val_names
train_test_list = [
    (X_train_reduced, X_test_reduced, y_train[emotion], y_test[emotion])
    for emotion in Y_val_names
]

def create_bins(y, n_bins=5):
    """Bucket the values of ``y`` into quantile bins and return the bin codes.

    Args:
        y: One-dimensional values to discretise.
        n_bins: Number of quantiles requested; duplicate bin edges are
            dropped, so fewer bins may be produced.

    Returns:
        Integer bin codes (no interval labels), one per value of ``y``.
    """
    quantile_codes = pd.qcut(y, n_bins, labels=False, duplicates='drop')
    return quantile_codes

# Shared 5-fold stratified splitter; shuffling is seeded so the folds are repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=207026618)

def normalized_rmse(y_true, y_pred):
    """Return the RMSE of ``y_pred`` against ``y_true``, divided by the standard deviation of ``y_true``."""
    mse = mean_squared_error(y_true, y_pred)
    spread = np.std(y_true)
    return np.sqrt(mse) / spread

def _output_path(subdir, filename):
    """Return the path of ``filename`` inside ``SAVE_MODEL_LOCATION/subdir``, creating the folder if needed."""
    folder = os.path.join(SAVE_MODEL_LOCATION, subdir)
    # Without this, the first write fails only after the whole grid search has run.
    os.makedirs(folder, exist_ok=True)
    return os.path.join(folder, filename)


def find_best_model(X_train, X_test, y_train, y_test, emotion):
    """Tune, evaluate and persist the configured model for a single target.

    Steps:
      1. Fit the untuned estimator from ``get_model()`` to obtain a baseline RMSE.
      2. Run a grid search over the parameter grid using the module-level
         ``skf`` splitter, stratified on quantile bins of ``y_train``.
      3. Project the best model's coefficients back onto the original features
         through the module-level ``pca_loadings`` and save them as importances.
      4. Save the per-component coefficients and the pickled best model.

    Files are written below ``SAVE_MODEL_LOCATION`` in the ``feature_importances``,
    ``coefficients`` and ``pkl`` subfolders, which are created when missing.

    Args:
        X_train: Training features as a DataFrame (its ``columns`` are used
            to label the saved coefficients).
        X_test: Test features.
        y_train: Training target values for ``emotion``.
        y_test: Test target values for ``emotion``.
        emotion: Target name, used in the output file names.

    Returns:
        Tuple ``(best_model, grid_search, emotion, baseline_rmse,
        optimized_rmse, normalized_rmse_value, best_alpha, y_pred_best)``.
    """
    model, param_grid = get_model()

    # Baseline: the estimator exactly as returned by get_model(), before tuning.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    baseline_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # StratifiedKFold needs discrete classes, so the continuous target is
    # bucketed into quantile bins that are used only to assign folds.
    y_bins = create_bins(y_train)

    # NOTE: no `scoring` is passed, so candidates are ranked by the estimator's
    # own score method rather than by the RMSE reported below.
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                               cv=skf.split(X_train, y_bins), n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Coefficients of the best model, one per PCA component.
    lasso_coeffs = best_model.coef_

    # Map component coefficients back to the original feature space:
    # (n_features, n_components) . (n_components,) -> (n_features,)
    feature_importance = np.abs(np.dot(pca_loadings, lasso_coeffs))

    feature_importances = pd.DataFrame({
        'Feature': x_data.columns,
        'Importance': feature_importance
    })
    feature_importances = feature_importances.sort_values('Importance', ascending=False)
    feature_importances.to_csv(
        _output_path('feature_importances', f'{emotion}_feature_importances.csv'), index=False
    )

    # Held-out performance of the tuned model.
    y_pred_best = best_model.predict(X_test)
    optimized_rmse = np.sqrt(mean_squared_error(y_test, y_pred_best))
    normalized_rmse_value = normalized_rmse(y_test, y_pred_best)

    # Raw coefficients, labelled by the columns of X_train.
    feature_coeffs = pd.DataFrame({
        'Feature': X_train.columns,
        'Coefficient': best_model.coef_
    })
    feature_coeffs.to_csv(
        _output_path('coefficients', f'{emotion}_feature_coefficients.csv'), index=False
    )

    # Save model using pickle
    with open(_output_path('pkl', f'{emotion}_model.pkl'), 'wb') as f:
        pickle.dump(best_model, f)

    return best_model, grid_search, emotion, baseline_rmse, optimized_rmse, normalized_rmse_value, best_params['alpha'], y_pred_best

def create_all_models(data_split_list, emotions):
    """Train, evaluate and save one tuned model per emotion.

    For each emotion, ``find_best_model`` is called with the matching entry of
    ``data_split_list``; the test-set predictions are written to
    ``SAVE_MODEL_LOCATION/predictions/<emotion>_predictions.csv``. After all
    emotions are processed, a summary table is written to
    ``SAVE_MODEL_LOCATION/results_summary.csv``. The output folders are
    created when missing.

    Args:
        data_split_list: Sequence of ``(X_train, X_test, y_train, y_test)``
            tuples, positionally aligned with ``emotions``.
        emotions: Target names, one per entry of ``data_split_list``.

    Returns:
        List of the best fitted models, in the order of ``emotions``.
    """
    start_time = time.time()
    best_models = []
    results = []

    # Create the output folders up front so a missing directory fails fast
    # instead of after the grid search has finished. This also creates
    # SAVE_MODEL_LOCATION itself, where the summary is written.
    predictions_dir = os.path.join(SAVE_MODEL_LOCATION, 'predictions')
    os.makedirs(predictions_dir, exist_ok=True)

    for i, emotion in enumerate(emotions):
        print(f"EMOTION {emotion}")
        split = data_split_list[i]
        # The grid-search object, the echoed emotion name and the baseline
        # RMSE are returned by find_best_model but not used here.
        (best_emotion_model, _grid_search, _returned_emotion, _baseline_rmse,
         optimized_rmse, normalized_rmse_value, alpha, y_pred) = find_best_model(
            X_train=split[0],
            X_test=split[1],
            y_train=split[2],
            y_test=split[3],
            emotion=emotion
        )
        best_models.append(best_emotion_model)
        results.append((emotion, optimized_rmse, normalized_rmse_value, alpha))

        # Save predictions next to the true test values
        pd.DataFrame({'Real': split[3], 'Predicted': y_pred}).to_csv(
            os.path.join(predictions_dir, f'{emotion}_predictions.csv'), index=False
        )

        print(f"           {emotion}    COMPLETE                    ")
        print("=========================================================")

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"The process took {int(elapsed_time)} seconds.")
    print(f"Total of {len(emotions)} models created")

    results_df = pd.DataFrame(results, columns=["emotion", "RMSE", "Normalized RMSE", "alpha"])
    results_df.to_csv(os.path.join(SAVE_MODEL_LOCATION, 'results_summary.csv'), index=False)

    return best_models

# Creating all models
# Runs the per-emotion pipeline over every split in train_test_list; the
# returned list of best models is kept in models_make.
print("Creating models")
models_make = create_all_models(data_split_list=train_test_list, emotions=Y_val_names)
print("Completed models")
print("==================================================")

Number of components to explain 99.99% variance: 3146
Reduced train data shape: (3963, 3146)
Reduced test data shape: (991, 3146)
Creating models
EMOTION arousal
Fitting 5 folds for each of 50 candidates, totalling 250 fits
           arousal    COMPLETE                    
EMOTION valence
Fitting 5 folds for each of 50 candidates, totalling 250 fits
           valence    COMPLETE                    
EMOTION interest
Fitting 5 folds for each of 50 candidates, totalling 250 fits
           interest    COMPLETE                    
EMOTION despair
Fitting 5 folds for each of 50 candidates, totalling 250 fits
           despair    COMPLETE                    
EMOTION joy
Fitting 5 folds for each of 50 candidates, totalling 250 fits
           joy    COMPLETE                    
The process took 304 seconds.
Total of 5 models created
Completed models
