# This notebook performs random forest regression on the disaggregated data of all acoustic scenarios (CG meal, CG play, child meal)

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from numpy import mean
from numpy import std
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

## First scenario: acoustic disaggregated dataset (written for CG meal; the paths below currently point at the child meal data)

## Explore the dataset and attach the continuous measures (DAI and Rinab)


In [None]:
# Switch into the folder with the per-file prosody extractions; the reads below use relative names.
# NOTE(review): hard-coded absolute path -- this cell only runs on a machine with this layout.
os.chdir('/Users/andrei-macpro/Documents/Data/Accoustics/extraction_3.0/meal_child_prosody/')

# Every non-hidden directory entry, sorted so the load order is the same on each run.
file_names = sorted(entry for entry in os.listdir(".") if not entry.startswith("."))

# One DataFrame per entry; the first CSV column is used as the row index.
dfs = [pd.read_csv(entry, index_col=0) for entry in file_names]


In [None]:
# Stack the per-file frames row-wise into one frame; each frame keeps its own index labels.
df = pd.concat(dfs, axis=0)


In [None]:
# Remove the percentile-range and RMS summary columns from the feature table.
# 'intensity_percentile_1_99_range' appeared twice in the original list; it is listed once here.
# NOTE(review): if the duplicate was a typo for a different column, add that column to the list.
columns_to_drop = [
    'intensity_percentile_1_99_range',
    'loudness_slidingwindow_percentile_1_99_range',
    'rms_mean',
    'rms_std',
    'rms_percentile_1_99_range',
    'f0_contour_percentile_1_99_range',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Load the RAD measures from a separate CSV. Its first column becomes the index, which is
# expected to hold the subject IDs so the measures can be matched to the rows of df.
measures_path = '/Users/andrei-macpro/Documents/Data/classification/accoustics/extraction_3.0/child_meal_prosody_regression.csv'
measures = pd.read_csv(measures_path, index_col=0)

In [None]:
# Show both indexes and the measures' column names, to check that the two frames can be matched on their index.
measures.index, df.index, measures.columns

In [None]:
# Attach the 'DAI' and 'Rinab' columns from measures to df; values are matched on the index labels.
# dropna() then removes every row that has a missing value in any column, which includes
# rows whose index label has no entry in measures.
df = df.assign(DAI=measures['DAI'], Rinab=measures['Rinab']).dropna()

In [None]:
# Persist the merged feature + measures table as CSV (the index is written as the first column).
disagg_output_path = '/Users/andrei-macpro/Documents/Data/classification/accoustics/extraction_3.0/child_meal_prosody_regression_disag.csv'
df.to_csv(disagg_output_path)

In [None]:
# Preview the first rows only instead of rendering the whole measures table.
measures.head()

In [None]:
# Targets are selected by column name rather than by position, so they stay correct even if
# 'DAI' and 'Rinab' are not the last two columns of df.
target_cols = ['DAI', 'Rinab']
dai = df['DAI'].values
rinab = df['Rinab'].values

# Features: every column except the targets. The first remaining column is skipped, exactly as the
# original positional slice (1:-2) did -- NOTE(review): presumably a non-feature column; confirm.
X = df.drop(columns=target_cols).iloc[:, 1:].values

# One label per row taken from the index; passed as `groups` to the grouped cross-validation below.
# NOTE(review): expected to be the subject ID -- verify against the input files.
s_id = df.index.values

In [None]:
# Fail loudly if the feature matrix, the group labels and the two targets do not all have one entry per row.
assert len(X) == len(s_id) == len(dai) == len(rinab), \
    "X, s_id, dai and rinab must all have the same number of rows"

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Target to model in this run: `rinab` here; bind `dai` instead to model the DAI score.
y = rinab

# perform feature scaling
# NOTE(review): the scaler is fitted on all rows before cross-validation, so its statistics are
# shared between the train and test folds. Tree ensembles are generally insensitive to feature
# scaling, so this step is kept as it was.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# define the model
model = RandomForestRegressor(random_state=0, n_jobs=-1)

# define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# define the evaluation procedure: grouped folds, so rows sharing an s_id label never end up
# on both sides of a train/test split
cv = GroupKFold(n_splits=5)

# perform randomized search to find the best hyperparameters
# random_state fixes which n_iter parameter combinations are sampled, making the search repeatable
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
    n_iter=10,
    verbose=3,
    random_state=0,
)

random_search.fit(X_scaled, y, groups=s_id)

# get the best hyperparameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# evaluate the best model using mean squared error, coefficient of determination, and explained variance score
# NOTE(review): the same folds were used to choose the hyperparameters, so these scores are likely
# optimistic; nested cross-validation would give a cleaner estimate.
scoring = ['neg_mean_squared_error', 'r2', 'explained_variance']
scores = cross_validate(best_model, X_scaled, y, scoring=scoring, groups=s_id, cv=cv, n_jobs=-1)

# 'neg_mean_squared_error' is the negated MSE; flip the sign so the reported MSE is non-negative
mse_per_fold = -scores['test_neg_mean_squared_error']

# report performance
print('Best Hyperparameters:', best_params)
print('Mean MSE: %.3f (%.3f)' % (mean(mse_per_fold), std(mse_per_fold)))
print('Mean R^2: %.3f (%.3f)' % (mean(scores['test_r2']), std(scores['test_r2'])))
print('Mean EV: %.3f (%.3f)' % (mean(scores['test_explained_variance']), std(scores['test_explained_variance'])))

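**Note:** the "Mean MSE" figures recorded below are scikit-learn `neg_mean_squared_error` scores, i.e. the negated MSE, which is why they are negative.

# CG Play Rinab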
Best Hyperparameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 5}\
Mean MSE: -63.604 (53.054)\
Mean R^2: -1.339 (1.223)\
Mean EV: -0.505 (0.531)

# CG Play DAI
Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 5}\
Mean MSE: -4.863 (2.211)\
Mean R^2: -0.714 (0.609)\
Mean EV: -0.150 (0.283)

# CG Meal DAI
Best Hyperparameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None}\
Mean MSE: -5.601 (3.678)\
Mean R^2: -1.015 (1.295)\
Mean EV: -0.384 (0.754)

# CG Meal Rinab
Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 5}\
Mean MSE: -50.980 (55.199)\
Mean R^2: -0.397 (0.076)\
Mean EV: -0.176 (0.138)

# Child Meal Rinab
Best Hyperparameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 5}\
Mean MSE: -9.262 (6.812)\
Mean R^2: -0.331 (0.337)\
Mean EV: -0.231 (0.205)

# Child Meal DAI
Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 5}\
Mean MSE: -2.661 (0.918)\
Mean R^2: -0.511 (0.643)\
Mean EV: -0.128 (0.130)