In [None]:
# Import libraries
import pandas as pd
import numpy as np

# Load and prep data
# Frequency response data: read the JSON export into a DataFrame.
fr_json_path = '../../data/headphones-fr-data.json'
frequency_data = pd.read_json(fr_json_path)

# Print the 'header' entry stored under index label 0 for reference,
# then discard the column before further processing.
header_column = frequency_data['header']
sample_fr_header = header_column[0]
print(sample_fr_header)
frequency_data = frequency_data.drop(columns='header')

# Filter frequency response data to only include frequencies between 20Hz and 9000Hz
def filter_frequency_data(data, min_hz=20, max_hz=9000):
    """Keep only the points whose frequency lies within [min_hz, max_hz].

    Args:
        data: Iterable of points; element 0 of each point is read as the
            frequency and compared against the bounds.
        min_hz: Inclusive lower bound (defaults to 20).
        max_hz: Inclusive upper bound (defaults to 9000).

    Returns:
        A new list holding the in-range points in their original order.
    """
    return [point for point in data if min_hz <= point[0] <= max_hz]

# Apply the band filter to every row's list of points.
frequency_data['data'] = frequency_data['data'].apply(filter_frequency_data)

# Shuffle rows with a fixed seed: the merge below follows this row order, so an
# unseeded shuffle would change which rows the seeded train/test split selects
# on every run.
frequency_data = frequency_data.sample(frac=1, random_state=42).reset_index(drop=True)
frequency_data.head()

In [None]:
# Scores data
# Read the scores, turn empty / whitespace-only strings into NaN, drop rows with
# any missing value, then shuffle with a fixed seed so reruns yield the same order.
scores_data = (
    pd.read_json('../../data/headphones-data.json')
    .replace(r'^\s*$', np.nan, regex=True)
    .dropna()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
scores_data.head()

In [None]:
# Merge
# Join frequency responses with scores on 'id', drop the duplicated name column
# and the free-text description columns, and restore the 'fullname' column name.
merged_data = (
    frequency_data
    .merge(scores_data, on='id')
    .drop(columns=['fullname_x', 'bassAccuracyDescription', 'midAccuracyDescription', 'trebleAccuracyDescription'])
    .rename(columns={'fullname_y': 'fullname'})
)
merged_data.head()

In [None]:
# Shared functions 
def preprocess_frequency_response(data):
    """Turn variable-length frequency responses into one rectangular 2-D array.

    Each item is flattened to 1-D and right-padded with zeros up to the length
    of the longest flattened item.

    Args:
        data: Iterable of array-like items (e.g. lists of points).

    Returns:
        A numpy array with one row per item. An empty input yields an array of
        shape (0, 0) instead of raising from max() on an empty sequence.
    """
    # Flatten each item once; the original rebuilt the array three times per item.
    flattened = [np.asarray(item).ravel() for item in data]
    if not flattened:
        return np.empty((0, 0))

    max_length = max(len(row) for row in flattened)
    return np.array([
        np.pad(row, (0, max_length - len(row)), 'constant')
        for row in flattened
    ])

def get_prediction_by_id(test_ids, predictions, actual_scores, headphone_id):
    """Look up the predicted and actual scores for one headphone id.

    Args:
        test_ids: Ids aligned position-by-position with `predictions` and
            `actual_scores`. Any array-like is accepted, including a plain list.
        predictions: Indexable collection of predicted scores.
        actual_scores: Indexable collection of actual scores.
        headphone_id: Id to search for.

    Returns:
        (prediction, actual) for the first position whose id matches, or
        (None, None) when the id is not present.
    """
    # Coerce to an array so `==` compares element-wise; on a plain list it
    # would evaluate to a single False and every lookup would report "not found".
    matches = np.where(np.asarray(test_ids) == headphone_id)[0]
    if len(matches) == 0:
        return None, None
    first = matches[0]
    return predictions[first], actual_scores[first]

def print_scores(predicted_scores, actual_scores, headphone_id):
    """Print the predicted and actual scores for a headphone id.

    When `predicted_scores` is None, a "not found in the test set" message is
    printed instead of the two score lines.
    """
    if predicted_scores is None:
        print(f"Headphone ID {headphone_id} not found in the test set.")
        return

    print(f"Predicted scores for headphone ID {headphone_id}: {predicted_scores}")
    print(f"Actual scores for headphone ID {headphone_id}: {actual_scores}")

In [None]:
# Shared variables
# Inputs reused by the model cells: row ids, the four score targets, and the
# zero-padded frequency-response feature matrix.
score_columns = ['neutralSoundScore', 'bassAccuracyScore', 'midAccuracyScore', 'trebleAccuracyScore']
shared_ids = merged_data['id'].values
shared_y = merged_data[score_columns].values
shared_X = preprocess_frequency_response(merged_data['data'])

In [None]:
# Baseline model with Random Forest Regressor
# NOTE: LinearRegression is imported from scikit-learn. The stdlib `statistics`
# module only exposes a slope/intercept namedtuple under that name (Python
# 3.10+), not an estimator, and importing it fails on older interpreters.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

X = shared_X
y = shared_y

# Split features, targets and ids together so every test row keeps its id.
X_train, X_test, y_train, y_test, id_train, id_test = train_test_split(X, y, shared_ids, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out 20% of rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')
# Ids in the test split, usable for the per-headphone lookup.
print(id_test)

In [None]:
# Inspect one headphone from the test split; the helper returns (None, None)
# when the id is not among id_test.
headphone_id = 762
predicted_scores, actual_scores = get_prediction_by_id(
    test_ids=id_test,
    predictions=y_pred,
    actual_scores=y_test,
    headphone_id=headphone_id,
)
print_scores(predicted_scores, actual_scores, headphone_id)

In [None]:
# Next model with Scaler, Gradient Boosting Regressor wrapped in MultiOutputRegressor, and hyperparameter tuning with GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputRegressor

X = shared_X
y = shared_y

# Split first, so the scaler statistics are learned from training rows only.
X_train, X_test, y_train, y_test, id_train, id_test = train_test_split(X, y, shared_ids, test_size=0.2, random_state=42)

# Fit the scaler on the training split and apply the same transform to the test
# split. Previously the scaled matrix was computed but the raw X was split and
# used, so scaling never reached the model.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Full matrix scaled with the training statistics; the name is kept defined
# because the original cell defined it.
X_scaled = scaler.transform(X)

model = GradientBoostingRegressor(random_state=42)
# One gradient-boosting model is fitted per target column.
multi_target_model = MultiOutputRegressor(model)

# The 'estimator__' prefix routes each parameter to the wrapped regressor.
param_grid = {
    'estimator__n_estimators': [100, 200],
    'estimator__learning_rate': [0.01, 0.1],
    'estimator__max_depth': [3, 5],
    'estimator__subsample': [0.8, 1.0]
}
grid_search = GridSearchCV(estimator=multi_target_model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='r2')
grid_search.fit(X_train, y_train)

# Evaluate the best configuration on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')
# Ids in the test split, usable for the per-headphone lookup.
print(id_test)

In [None]:
# Inspect one headphone from the test split; the helper returns (None, None)
# when the id is not among id_test.
headphone_id = 751
predicted_scores, actual_scores = get_prediction_by_id(
    test_ids=id_test,
    predictions=y_pred,
    actual_scores=y_test,
    headphone_id=headphone_id,
)
print_scores(predicted_scores, actual_scores, headphone_id)