# this notebook performs random forest regression on aggregate data of all acoustic scenarios (cg meal, cg play, child meal)\
for random forest regression on the entire acoustic data see a different notebook

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from numpy import mean
from numpy import std
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# first running random forest regression on the aggregate dataset - caregiver meal

In [None]:
# load the dataset as pandas DataFrame  
# read pandas from xlsx file

df = pd.read_excel('/Users/andrei-macpro/Documents/Data/classification/accoustics/extraction_3.0/cg_meal_prosody_regression.xlsx')

In [None]:
# from df remove columns "intensity_percentile_1_99_range"  
df = df.drop(columns=['intensity_percentile_1_99_range', 'loudness_slidingwindow_percentile_1_99_range', 
                      'intensity_percentile_1_99_range', 'rms_mean', 'rms_std','rms_percentile_1_99_range'])


In [None]:
# turn the dataset except the first column and last 2 columns into a numpy array 
X = df.iloc[:, 1:-2].values
s_id = df.iloc[:,0].values
dai = df.iloc[:,-2].values
rinab = df.iloc[:,-1].values

In [None]:
# run a random forest regression model on the dataset 
# define the model
model = RandomForestRegressor(random_state=0, n_jobs=1)
# shuffle the dataset
X, s_id, dai, rinab = shuffle(X, s_id, dai, rinab, random_state=0) 
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=0)
# evaluate the model using mean squared error, coefficient of determination, and explained variance score
scoring = ['neg_mean_squared_error', 'r2', 'explained_variance']
scores = cross_validate(model, X, rinab, scoring=scoring, cv=cv, n_jobs=-1)
# report performance
print('Mean MSE: %.3f (%.3f)' % (mean(scores['test_neg_mean_squared_error']), std(scores['test_neg_mean_squared_error'])))
print('Mean R^2: %.3f (%.3f)' % (mean(scores['test_r2']), std(scores['test_r2'])))
print('Mean EV: %.3f (%.3f)' % (mean(scores['test_explained_variance']), std(scores['test_explained_variance'])))



In [None]:
# shuffle the dataset
X, s_id, dai, rinab = shuffle(X, s_id, dai, rinab, random_state=0) 
# define the model
model = RandomForestRegressor(random_state=0, n_jobs=1)
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=0)
# evaluate the model using mean squared error, coefficient of determination, and explained variance score
scoring = ['neg_mean_squared_error', 'r2', 'explained_variance']
scores = cross_validate(model, X, rinab, scoring=scoring, cv=cv, n_jobs=1)
# report performance
print('Mean MSE: %.3f (%.3f)' % (mean(scores['test_neg_mean_squared_error']), std(scores['test_neg_mean_squared_error'])))
print('Mean R^2: %.3f (%.3f)' % (mean(scores['test_r2']), std(scores['test_r2'])))
print('Mean EV: %.3f (%.3f)' % (mean(scores['test_explained_variance']), std(scores['test_explained_variance'])))

In [None]:
from sklearn.model_selection import train_test_split

# split the dataset into training and testing sets
X_train, X_test, s_id_train, s_id_test, dai_train, dai_test, rinab_train, rinab_test = train_test_split(X, s_id, dai, rinab, test_size=0.2, random_state=0)

# define the model
model = RandomForestRegressor(random_state=0, n_jobs=1)

# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# evaluate the model using mean squared error, coefficient of determination, and explained variance score
scoring = ['neg_mean_squared_error', 'r2', 'explained_variance']
scores = cross_validate(model, X_train, rinab_train, scoring=scoring, cv=cv, n_jobs=1)

# report performance
print('Mean MSE: %.3f (%.3f)' % (mean(scores['test_neg_mean_squared_error']), std(scores['test_neg_mean_squared_error'])))
print('Mean R^2: %.3f (%.3f)' % (mean(scores['test_r2']), std(scores['test_r2'])))
print('Mean EV: %.3f (%.3f)' % (mean(scores['test_explained_variance']), std(scores['test_explained_variance'])))

In [None]:
# explore why the r2 score is negative
# use regularization for over or underfitting
# scale the features 
# do feature selection

# feature scaling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# split the dataset into training and testing sets
X_train, X_test, s_id_train, s_id_test, dai_train, dai_test, rinab_train, rinab_test = train_test_split(X, s_id, dai, rinab, test_size=0.2, random_state=0)

# perform feature scaling
## to note - the scaling is done on the training set and then applied to the test set - very good!
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# define the model
model = RandomForestRegressor(random_state=0, n_jobs=1)

# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# evaluate the model using mean squared error, coefficient of determination, and explained variance score
scoring = ['neg_mean_squared_error', 'r2', 'explained_variance']
scores = cross_validate(model, X_train_scaled, rinab_train, scoring=scoring, cv=cv, n_jobs=1)

# report performance
print('Mean MSE: %.3f (%.3f)' % (mean(scores['test_neg_mean_squared_error']), std(scores['test_neg_mean_squared_error'])))
print('Mean R^2: %.3f (%.3f)' % (mean(scores['test_r2']), std(scores['test_r2'])))
print('Mean EV: %.3f (%.3f)' % (mean(scores['test_explained_variance']), std(scores['test_explained_variance'])))

# hypermarameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# define the model
model = RandomForestRegressor(random_state=0, n_jobs=1)

# define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# perform grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=cv, n_jobs=1)
grid_search.fit(X_train_scaled, rinab_train)

# get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# evaluate the best model using mean squared error, coefficient of determination, and explained variance score
scoring = ['neg_mean_squared_error', 'r2', 'explained_variance']
scores = cross_validate(best_model, X_train_scaled, rinab_train, scoring=scoring, cv=cv, n_jobs=1)

# report performance
print('Best Hyperparameters:', best_params)
print('Mean MSE: %.3f (%.3f)' % (mean(scores['test_neg_mean_squared_error']), std(scores['test_neg_mean_squared_error'])))
print('Mean R^2: %.3f (%.3f)' % (mean(scores['test_r2']), std(scores['test_r2'])))
print('Mean EV: %.3f (%.3f)' % (mean(scores['test_explained_variance']), std(scores['test_explained_variance'])))

𝑅2 can range from -∞ to 1 in the world of machine learning. The closer 𝑅2 is to 1, the better the model performance. There is a common misconception that 𝑅2 can only range from 0 to 1. As shown in the formula of 𝑅2 above, we can see that 𝑅2 can be less than 0. This situation happens when the MSE of our designed model is larger than the 32 MSE of the baseline model that only predicts the mean value, which is a horizontal line. Simply speaking, in the world of machine learning, when your model is doing worse than a horizontal line in fitting the data, 𝑅2 can be negative. 

Predictive modeling of webpage aesthetics

In [None]:
from sklearn.model_selection import GridSearchCV

# perform feature scaling
## to note - the scaling is done on the training set and then applied to the test set - very good!
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# define the model
model = RandomForestRegressor(random_state=0, n_jobs=1)

# define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# perform grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=cv, n_jobs=1)
grid_search.fit(X_scaled, rinab)

# get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# evaluate the best model using mean squared error, coefficient of determination, and explained variance score
scoring = ['neg_mean_squared_error', 'r2', 'explained_variance']
scores = cross_validate(best_model, X_scaled, rinab, scoring=scoring, cv=cv, n_jobs=1)

# report performance
print('Best Hyperparameters:', best_params)
print('Mean MSE: %.3f (%.3f)' % (mean(scores['test_neg_mean_squared_error']), std(scores['test_neg_mean_squared_error'])))
print('Mean R^2: %.3f (%.3f)' % (mean(scores['test_r2']), std(scores['test_r2'])))
print('Mean EV: %.3f (%.3f)' % (mean(scores['test_explained_variance']), std(scores['test_explained_variance'])))

## Results for Random Forest Regression (with 5-fold cross-validation, hyperparameter tuning) on Rinab in caregiver meal acoustic with participant aggregate data

mean (std) \

Best Hyperparameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200} \

Mean MSE: -41.168 (21.984) \

Mean R^2: -0.335 (0.192) \

Mean EV: -0.272 (0.230)

## Results for Random Forest Regression (with 5-fold cross-validation, hyperparameter tuning) on DAI in caregiver meal acoustic with participant aggregate data
mean(std) \
Best Hyperparameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300} 

Mean MSE: -4.675 (1.157)\
Mean R^2: -0.616 (0.558)\
Mean EV: -0.316 (0.420)

In [None]:
from sklearn.model_selection import GridSearchCV

# perform feature scaling
## to note - the scaling is done on the training set and then applied to the test set - very good!
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# define the model
model = RandomForestRegressor(random_state=0, n_jobs=1)

# define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# perform grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=cv, n_jobs=1)
grid_search.fit(X_scaled, dai)

# get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# evaluate the best model using mean squared error, coefficient of determination, and explained variance score
scoring = ['neg_mean_squared_error', 'r2', 'explained_variance']
scores = cross_validate(best_model, X_scaled, dai, scoring=scoring, cv=cv, n_jobs=1)

# report performance
print('Best Hyperparameters:', best_params)
print('Mean MSE: %.3f (%.3f)' % (mean(scores['test_neg_mean_squared_error']), std(scores['test_neg_mean_squared_error'])))
print('Mean R^2: %.3f (%.3f)' % (mean(scores['test_r2']), std(scores['test_r2'])))
print('Mean EV: %.3f (%.3f)' % (mean(scores['test_explained_variance']), std(scores['test_explained_variance'])))

# now running the same code on the cg play dataset

In [None]:
# load the dataset as pandas DataFrame  
# read pandas from xlsx file

df = pd.read_csv('/Users/andrei-macpro/Documents/Data/classification/accoustics/extraction_3.0/cg_play_prosody_regression.csv')
# from df remove columns "intensity_percentile_1_99_range"  
df = df.drop(columns=['intensity_percentile_1_99_range', 'loudness_slidingwindow_percentile_1_99_range', 
                      'intensity_percentile_1_99_range', 'rms_mean', 'rms_std','rms_percentile_1_99_range'])

# turn the dataset except the first column and last 2 columns into a numpy array 
X = df.iloc[:, 1:-2].values
s_id = df.iloc[:,0].values
dai = df.iloc[:,-2].values
rinab = df.iloc[:,-1].values


In [None]:
from sklearn.model_selection import GridSearchCV

# perform feature scaling
## to note - the scaling is done on the training set and then applied to the test set - very good!
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# define the model
model = RandomForestRegressor(random_state=0, n_jobs=1)

# define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# perform grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=cv, n_jobs=1)
grid_search.fit(X_scaled, dai)

# get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# evaluate the best model using mean squared error, coefficient of determination, and explained variance score
scoring = ['neg_mean_squared_error', 'r2', 'explained_variance']
scores = cross_validate(best_model, X_scaled, dai, scoring=scoring, cv=cv, n_jobs=1)

# report performance
print('Best Hyperparameters:', best_params)
print('Mean MSE: %.3f (%.3f)' % (mean(scores['test_neg_mean_squared_error']), std(scores['test_neg_mean_squared_error'])))
print('Mean R^2: %.3f (%.3f)' % (mean(scores['test_r2']), std(scores['test_r2'])))
print('Mean EV: %.3f (%.3f)' % (mean(scores['test_explained_variance']), std(scores['test_explained_variance'])))

#

## Results for Random Forest Regression (with 5-fold cross-validation, hyperparameter tuning) on DAI in caregiver play acoustic with participant aggregate data\
Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}\
Mean MSE: -3.809 (0.660)\
Mean R^2: -0.220 (0.391)\
Mean EV: -0.135 (0.301)



In [None]:
from sklearn.model_selection import GridSearchCV

# perform feature scaling
## to note - the scaling is done on the training set and then applied to the test set - very good!
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# define the model
model = RandomForestRegressor(random_state=0, n_jobs=1)

# define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# perform grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=cv, n_jobs=1)
grid_search.fit(X_scaled, rinab)

# get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# evaluate the best model using mean squared error, coefficient of determination, and explained variance score
scoring = ['neg_mean_squared_error', 'r2', 'explained_variance']
scores = cross_validate(best_model, X_scaled, rinab, scoring=scoring, cv=cv, n_jobs=1)

# report performance
print('Best Hyperparameters:', best_params)
print('Mean MSE: %.3f (%.3f)' % (mean(scores['test_neg_mean_squared_error']), std(scores['test_neg_mean_squared_error'])))
print('Mean R^2: %.3f (%.3f)' % (mean(scores['test_r2']), std(scores['test_r2'])))
print('Mean EV: %.3f (%.3f)' % (mean(scores['test_explained_variance']), std(scores['test_explained_variance'])))

## Results for Random Forest Regression (with 5-fold cross-validation, hyperparameter tuning) on Rinab in caregiver play acoustic with participant aggregate data\
Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}\
Mean MSE: -51.585 (21.752)\
Mean R^2: -0.360 (0.484)\
Mean EV: -0.193 (0.281)


# now running the same code on the child meal dataset