In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib

In [None]:
# The survey export uses headers with spaces and brackets; map each of them
# to a compact CamelCase name so the columns are easy to reference below.
header_map = {
    'Primary streaming service': 'PrimaryStreamingService',
    'Hours per day': 'HoursPerDay',
    'While working': 'WhileWorking',
    'Fav genre': 'FavGenre',
    'Foreign languages': 'ForeignLanguages',
    'Frequency [Classical]': 'FrequencyClassical',
    'Frequency [Country]': 'FrequencyCountry',
    'Frequency [EDM]': 'FrequencyEDM',
    'Frequency [Folk]': 'FrequencyFolk',
    'Frequency [Gospel]': 'FrequencyGospel',
    'Frequency [Hip hop]': 'FrequencyHipHop',
    'Frequency [Jazz]': 'FrequencyJazz',
    'Frequency [K pop]': 'FrequencyKPop',
    'Frequency [Latin]': 'FrequencyLatin',
    'Frequency [Lofi]': 'FrequencyLofi',
    'Frequency [Metal]': 'FrequencyMetal',
    'Frequency [Pop]': 'FrequencyPop',
    'Frequency [R&B]': 'FrequencyRNB',
    'Frequency [Rap]': 'FrequencyRap',
    'Frequency [Rock]': 'FrequencyRock',
    'Frequency [Video game music]': 'FrequencyVideoGameMusic',
    'Music effects': 'MusicEffects',
}

# A row is discarded when any of these answers is missing.
required_answers = ['MusicEffects', 'Age', 'PrimaryStreamingService',
                    'WhileWorking', 'Instrumentalist', 'ForeignLanguages']

# Columns removed before the feature/target tables are built.
discarded_columns = ['Timestamp', 'Permissions', 'MusicEffects', 'PrimaryStreamingService']

df = (
    pd.read_csv('mxmh_survey_results.csv')
    .rename(columns=header_map)
    .dropna(subset=required_answers)
    # After the row filter, drop every column that still contains a gap.
    .dropna(axis=1)
    .drop(columns=discarded_columns)
)

# Turn the literal answers 'Yes' / 'No' into 1 / 0 in every column; any other
# value is passed through untouched.
for column_name in df.columns:
    df[column_name] = df[column_name].apply(
        lambda answer: 1 if answer == 'Yes' else (0 if answer == 'No' else answer)
    )

# --- Targets ---------------------------------------------------------------
# One listening-frequency answer per genre; all sixteen are predicted jointly.
y = df[['FrequencyClassical', 'FrequencyCountry',
        'FrequencyEDM', 'FrequencyFolk',
        'FrequencyGospel', 'FrequencyHipHop',
        'FrequencyJazz', 'FrequencyKPop',
        'FrequencyLatin', 'FrequencyLofi',
        'FrequencyMetal', 'FrequencyPop',
        'FrequencyRNB', 'FrequencyRap',
        'FrequencyRock', 'FrequencyVideoGameMusic']]

# --- Favourite genre -> one-hot columns --------------------------------------
enc = OneHotEncoder(handle_unknown='ignore')

# The encoder expects a 2-D input: one row per respondent, one column (FavGenre).
X_sub = df[['FavGenre']].to_numpy()
feature_vectors = enc.fit_transform(X_sub).toarray()

# Short names for the one-hot columns. They are assigned by POSITION, so they
# only make sense when the encoder produced exactly one column per name.
genre_columns = ['Classical', 'Country', 'EDM', 'Folk', 'Gospel', 'HipHop', 'Jazz',
                 'KPop', 'Latin', 'Lofi', 'Metal', 'Pop', 'RNB', 'Rap', 'Rock',
                 'VideoGameMusic']
if feature_vectors.shape[1] != len(genre_columns):
    raise ValueError(
        f"OneHotEncoder produced {feature_vectors.shape[1]} FavGenre columns, "
        f"expected {len(genre_columns)}"
    )

# --- Feature table -----------------------------------------------------------
# .copy() detaches X from df: adding columns to a bare df[[...]] selection
# triggers pandas' SettingWithCopyWarning.
X = df[['Age', 'HoursPerDay', 'WhileWorking', 'Instrumentalist', 'Composer',
        'Exploratory', 'ForeignLanguages', 'Anxiety', 'Depression', 'Insomnia',
        'OCD']].copy()
X[genre_columns] = feature_vectors

display(X)


# Fixed seed so the split, the forest, and therefore the scores printed below
# are reproducible when the notebook is re-run from a fresh kernel.
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Label the raw prediction array with the target names and the test rows'
# index so it lines up with y_test.
y_pred = pd.DataFrame(model.predict(X_test), columns=y_test.columns, index=y_test.index)

# Fit ONE encoder on every answer that occurs anywhere in the targets and use
# it for both tables. Re-fitting per y_test column (and then transforming the
# predictions with whatever the last fit happened to be) gives inconsistent
# codes whenever a test column lacks one of the answer levels, and fails
# outright on a predicted answer the last column never contained.
label_encoder = LabelEncoder()
label_encoder.fit(y.to_numpy().ravel())
y_test_encoded = y_test.apply(label_encoder.transform)
y_pred_encoded = y_pred.apply(label_encoder.transform)

# Integer codes 0..k-1; entry i of every score array below refers to
# label_encoder.classes_[i].
class_codes = list(range(len(label_encoder.classes_)))

f1_scores = []
precision_scores = []
recall_scores = []

# Per-genre, per-class scores (average=None keeps one value per class).
for column in y_test.columns:

    p, r, f, s = precision_recall_fscore_support(
        y_test_encoded[column],
        y_pred_encoded[column],
        average=None,
        labels=class_codes,
    )
    print(column)
    print("F1:", f,"Precision:", p,"Recall:", r,"Support:", s)

    f1_scores.append(f)
    precision_scores.append(p)
    recall_scores.append(r)



In [None]:
def evaluate(X_train, X_test, y_train, random_state=None):
    """Fit a fresh random forest and display its predictions for ``X_test``.

    Parameters
    ----------
    X_train
        Feature table the model is fitted on.
    X_test
        Rows handed to ``model.predict``.
    y_train
        Targets the model is fitted on. When it exposes ``.columns`` (a
        DataFrame), those names label the prediction table; otherwise the
        sixteen ``Frequency*`` genre names are used.
    random_state
        Optional seed forwarded to ``RandomForestClassifier``. The default
        ``None`` leaves the forest unseeded.

    Returns
    -------
    None
        The prediction table is shown with ``display`` rather than returned.
    """
    # Fallback labels for targets that carry no column names of their own.
    default_columns = ['FrequencyClassical', 'FrequencyCountry',
                       'FrequencyEDM', 'FrequencyFolk',
                       'FrequencyGospel', 'FrequencyHipHop',
                       'FrequencyJazz', 'FrequencyKPop',
                       'FrequencyLatin', 'FrequencyLofi',
                       'FrequencyMetal', 'FrequencyPop',
                       'FrequencyRNB', 'FrequencyRap',
                       'FrequencyRock', 'FrequencyVideoGameMusic']

    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)
    # joblib.dump(model, "music_predictor.joblib")
    y_pred = model.predict(X_test)

    # Take the names from the targets the model was actually trained on, so a
    # different or reordered target set is still labelled correctly.
    target_columns = list(getattr(y_train, 'columns', default_columns))
    y_pred = pd.DataFrame(y_pred, columns=target_columns)
    print("---------Predicted----------")
    display(y_pred)


In [None]:
# original
# Reload the survey from disk and repeat the cleaning steps from scratch.
survey_headers = {
    'Primary streaming service': 'PrimaryStreamingService',
    'Hours per day': 'HoursPerDay',
    'While working': 'WhileWorking',
    'Fav genre': 'FavGenre',
    'Foreign languages': 'ForeignLanguages',
    'Frequency [Classical]': 'FrequencyClassical',
    'Frequency [Country]': 'FrequencyCountry',
    'Frequency [EDM]': 'FrequencyEDM',
    'Frequency [Folk]': 'FrequencyFolk',
    'Frequency [Gospel]': 'FrequencyGospel',
    'Frequency [Hip hop]': 'FrequencyHipHop',
    'Frequency [Jazz]': 'FrequencyJazz',
    'Frequency [K pop]': 'FrequencyKPop',
    'Frequency [Latin]': 'FrequencyLatin',
    'Frequency [Lofi]': 'FrequencyLofi',
    'Frequency [Metal]': 'FrequencyMetal',
    'Frequency [Pop]': 'FrequencyPop',
    'Frequency [R&B]': 'FrequencyRNB',
    'Frequency [Rap]': 'FrequencyRap',
    'Frequency [Rock]': 'FrequencyRock',
    'Frequency [Video game music]': 'FrequencyVideoGameMusic',
    'Music effects': 'MusicEffects',
}

df = pd.read_csv('mxmh_survey_results.csv').rename(columns=survey_headers)

# Step 1: keep only respondents who answered all of these questions.
df = df.dropna(
    subset=['MusicEffects', 'Age', 'PrimaryStreamingService',
            'WhileWorking', 'Instrumentalist', 'ForeignLanguages']
)
# Step 2: discard every column that still has a missing value.
df = df.dropna(axis=1)
# Step 3: remove the columns that are not used as features or targets.
df = df.drop(columns=['Timestamp', 'Permissions', 'MusicEffects', 'PrimaryStreamingService'])

# Step 4: encode the literal answers 'Yes' / 'No' as 1 / 0 in every column,
# leaving all other values as they are.
for column_name in df.columns:
    df[column_name] = df[column_name].apply(
        lambda answer: 1 if answer == 'Yes' else (0 if answer == 'No' else answer)
    )

# --- Targets ---------------------------------------------------------------
# One listening-frequency answer per genre; all sixteen are predicted jointly.
y = df[['FrequencyClassical', 'FrequencyCountry',
        'FrequencyEDM', 'FrequencyFolk',
        'FrequencyGospel', 'FrequencyHipHop',
        'FrequencyJazz', 'FrequencyKPop',
        'FrequencyLatin', 'FrequencyLofi',
        'FrequencyMetal', 'FrequencyPop',
        'FrequencyRNB', 'FrequencyRap',
        'FrequencyRock', 'FrequencyVideoGameMusic']]

# --- Favourite genre -> one-hot columns --------------------------------------
enc = OneHotEncoder(handle_unknown='ignore')

# The encoder expects a 2-D input: one row per respondent, one column (FavGenre).
X_sub = df[['FavGenre']].to_numpy()
feature_vectors = enc.fit_transform(X_sub).toarray()

# Short names for the one-hot columns. They are assigned by POSITION, so they
# only make sense when the encoder produced exactly one column per name.
genre_columns = ['Classical', 'Country', 'EDM', 'Folk', 'Gospel', 'HipHop', 'Jazz',
                 'KPop', 'Latin', 'Lofi', 'Metal', 'Pop', 'RNB', 'Rap', 'Rock',
                 'VideoGameMusic']
if feature_vectors.shape[1] != len(genre_columns):
    raise ValueError(
        f"OneHotEncoder produced {feature_vectors.shape[1]} FavGenre columns, "
        f"expected {len(genre_columns)}"
    )

# --- Feature table -----------------------------------------------------------
# .copy() detaches X from df: adding columns to a bare df[[...]] selection
# triggers pandas' SettingWithCopyWarning.
X = df[['Age', 'HoursPerDay', 'WhileWorking', 'Instrumentalist', 'Composer',
        'Exploratory', 'ForeignLanguages', 'Anxiety', 'Depression', 'Insomnia',
        'OCD']].copy()
X[genre_columns] = feature_vectors

display(X)
display(y)


# Test
# One hand-written respondent, laid out in the same column order as X:
# first the survey answers, then the one-hot favourite-genre flags.
respondent_answers = {
    'Age': 18.0,
    'HoursPerDay': 3.0,
    'WhileWorking': 1,
    'Instrumentalist': 1,
    'Composer': 0,
    'Exploratory': 1,
    'ForeignLanguages': 1,
    'Anxiety': 4.0,
    'Depression': 4.0,
    'Insomnia': 3.0,
    'OCD': 5.0,
}

# Exactly one flag is set: this respondent's favourite genre is HipHop.
favourite_genre_flags = {
    'Classical': 0.0,
    'Country': 0.0,
    'EDM': 0.0,
    'Folk': 0.0,
    'Gospel': 0.0,
    'HipHop': 1.0,
    'Jazz': 0.0,
    'KPop': 0.0,
    'Latin': 0.0,
    'Lofi': 0.0,
    'Metal': 0.0,
    'Pop': 0.0,
    'RNB': 0.0,
    'Rap': 0.0,
    'Rock': 0.0,
    'VideoGameMusic': 0.0,
}

# Wrap every value in a one-element list to get a single-row frame.
challenge = pd.DataFrame(
    data={
        name: [value]
        for name, value in {**respondent_answers, **favourite_genre_flags}.items()
    }
)

display(challenge)
evaluate(X, challenge, y)