### Exploration and Data Analysis

In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw survey responses and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../dataset/mxmh_survey_results.csv")
df.head(n=5)


Unnamed: 0,Timestamp,Age,Primary streaming service,Hours per day,While working,Instrumentalist,Composer,Fav genre,Exploratory,Foreign languages,...,Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects,Permissions
0,8/27/2022 19:29:02,18.0,Spotify,3.0,Yes,Yes,Yes,Latin,Yes,Yes,...,Sometimes,Very frequently,Never,Sometimes,3.0,0.0,1.0,0.0,,I understand.
1,8/27/2022 19:57:31,63.0,Pandora,1.5,Yes,No,No,Rock,Yes,No,...,Sometimes,Rarely,Very frequently,Rarely,7.0,2.0,2.0,1.0,,I understand.
2,8/27/2022 21:28:18,18.0,Spotify,4.0,No,No,No,Video game music,No,Yes,...,Never,Rarely,Rarely,Very frequently,7.0,7.0,10.0,2.0,No effect,I understand.
3,8/27/2022 21:40:40,61.0,YouTube Music,2.5,Yes,No,Yes,Jazz,Yes,Yes,...,Sometimes,Never,Never,Never,9.0,7.0,3.0,3.0,Improve,I understand.
4,8/27/2022 21:54:47,18.0,Spotify,4.0,Yes,No,No,R&B,Yes,No,...,Very frequently,Very frequently,Never,Rarely,7.0,2.0,5.0,9.0,Improve,I understand.


In [109]:
# Print the per-column non-null counts and dtypes of the raw frame.
df.info()
# Last expression of the cell: displays the full column index.
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736 entries, 0 to 735
Data columns (total 33 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Timestamp                     736 non-null    object 
 1   Age                           735 non-null    float64
 2   Primary streaming service     735 non-null    object 
 3   Hours per day                 736 non-null    float64
 4   While working                 733 non-null    object 
 5   Instrumentalist               732 non-null    object 
 6   Composer                      735 non-null    object 
 7   Fav genre                     736 non-null    object 
 8   Exploratory                   736 non-null    object 
 9   Foreign languages             732 non-null    object 
 10  BPM                           629 non-null    float64
 11  Frequency [Classical]         736 non-null    object 
 12  Frequency [Country]           736 non-null    object 
 13  Frequ

Index(['Timestamp', 'Age', 'Primary streaming service', 'Hours per day',
       'While working', 'Instrumentalist', 'Composer', 'Fav genre',
       'Exploratory', 'Foreign languages', 'BPM', 'Frequency [Classical]',
       'Frequency [Country]', 'Frequency [EDM]', 'Frequency [Folk]',
       'Frequency [Gospel]', 'Frequency [Hip hop]', 'Frequency [Jazz]',
       'Frequency [K pop]', 'Frequency [Latin]', 'Frequency [Lofi]',
       'Frequency [Metal]', 'Frequency [Pop]', 'Frequency [R&B]',
       'Frequency [Rap]', 'Frequency [Rock]', 'Frequency [Video game music]',
       'Anxiety', 'Depression', 'Insomnia', 'OCD', 'Music effects',
       'Permissions'],
      dtype='object')

## Preprocessing 
<br>*(Drop irrelevant columns, impute missing values, and discretize numerical columns)*

In [110]:
# Survey fields that are not carried forward into the rest of the preprocessing.
DROP_COLS = [
    "Timestamp",
    "Primary streaming service",
    "Exploratory",
    "Foreign languages",
    "While working",
    "Instrumentalist",
    "Composer",
    "BPM",
    "Permissions",
]

# errors="ignore" skips names that are no longer present, so the cell can be
# executed again without raising on already-dropped columns.
df = df.drop(columns=DROP_COLS, errors="ignore")

In [111]:
# Preview the frame after the DROP_COLS columns were removed.
df.head()

Unnamed: 0,Age,Hours per day,Fav genre,Frequency [Classical],Frequency [Country],Frequency [EDM],Frequency [Folk],Frequency [Gospel],Frequency [Hip hop],Frequency [Jazz],...,Frequency [Pop],Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects
0,18.0,3.0,Latin,Rarely,Never,Rarely,Never,Never,Sometimes,Never,...,Very frequently,Sometimes,Very frequently,Never,Sometimes,3.0,0.0,1.0,0.0,
1,63.0,1.5,Rock,Sometimes,Never,Never,Rarely,Sometimes,Rarely,Very frequently,...,Sometimes,Sometimes,Rarely,Very frequently,Rarely,7.0,2.0,2.0,1.0,
2,18.0,4.0,Video game music,Never,Never,Very frequently,Never,Never,Rarely,Rarely,...,Rarely,Never,Rarely,Rarely,Very frequently,7.0,7.0,10.0,2.0,No effect
3,61.0,2.5,Jazz,Sometimes,Never,Never,Rarely,Sometimes,Never,Very frequently,...,Sometimes,Sometimes,Never,Never,Never,9.0,7.0,3.0,3.0,Improve
4,18.0,4.0,R&B,Never,Never,Rarely,Never,Rarely,Very frequently,Never,...,Sometimes,Very frequently,Very frequently,Never,Rarely,7.0,2.0,5.0,9.0,Improve


In [112]:
# Age brackets are closed on the left and open on the right (right=False):
# [0, 13) Child, [13, 20) Teenager, [20, 30) Young Adult,
# [30, 60) Adult, [60, inf) Senior.
labels = ['Child', 'Teenager', 'Young Adult', 'Adult', 'Senior']
bins = [0, 13, 20, 30, 60, np.inf]

# Replace the numeric age with its bracket label.
df = df.assign(Age=pd.cut(df['Age'], bins=bins, labels=labels, right=False))

df.head()


Unnamed: 0,Age,Hours per day,Fav genre,Frequency [Classical],Frequency [Country],Frequency [EDM],Frequency [Folk],Frequency [Gospel],Frequency [Hip hop],Frequency [Jazz],...,Frequency [Pop],Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects
0,Teenager,3.0,Latin,Rarely,Never,Rarely,Never,Never,Sometimes,Never,...,Very frequently,Sometimes,Very frequently,Never,Sometimes,3.0,0.0,1.0,0.0,
1,Senior,1.5,Rock,Sometimes,Never,Never,Rarely,Sometimes,Rarely,Very frequently,...,Sometimes,Sometimes,Rarely,Very frequently,Rarely,7.0,2.0,2.0,1.0,
2,Teenager,4.0,Video game music,Never,Never,Very frequently,Never,Never,Rarely,Rarely,...,Rarely,Never,Rarely,Rarely,Very frequently,7.0,7.0,10.0,2.0,No effect
3,Senior,2.5,Jazz,Sometimes,Never,Never,Rarely,Sometimes,Never,Very frequently,...,Sometimes,Sometimes,Never,Never,Never,9.0,7.0,3.0,3.0,Improve
4,Teenager,4.0,R&B,Never,Never,Rarely,Never,Rarely,Very frequently,Never,...,Sometimes,Very frequently,Very frequently,Never,Rarely,7.0,2.0,5.0,9.0,Improve


In [113]:

# Self-reported score columns that get discretized into three levels.
MENTAL_COLS = ["Anxiety", "Depression", "Insomnia", "OCD"]

# Intervals are closed on the right (pd.cut default):
# (-inf, 3] Low, (3, 6] Medium, (6, inf] High.
bins = [-np.inf, 3, 6, np.inf]
labels = ["Low", "Medium", "High"]

# Overwrite each score column with its level; column positions are unchanged.
df = df.assign(
    **{col: pd.cut(df[col], bins=bins, labels=labels) for col in MENTAL_COLS}
)

In [114]:
# Preview the frame with the MENTAL_COLS scores binned into Low/Medium/High.
df.head()

Unnamed: 0,Age,Hours per day,Fav genre,Frequency [Classical],Frequency [Country],Frequency [EDM],Frequency [Folk],Frequency [Gospel],Frequency [Hip hop],Frequency [Jazz],...,Frequency [Pop],Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects
0,Teenager,3.0,Latin,Rarely,Never,Rarely,Never,Never,Sometimes,Never,...,Very frequently,Sometimes,Very frequently,Never,Sometimes,Low,Low,Low,Low,
1,Senior,1.5,Rock,Sometimes,Never,Never,Rarely,Sometimes,Rarely,Very frequently,...,Sometimes,Sometimes,Rarely,Very frequently,Rarely,High,Low,Low,Low,
2,Teenager,4.0,Video game music,Never,Never,Very frequently,Never,Never,Rarely,Rarely,...,Rarely,Never,Rarely,Rarely,Very frequently,High,High,High,Low,No effect
3,Senior,2.5,Jazz,Sometimes,Never,Never,Rarely,Sometimes,Never,Very frequently,...,Sometimes,Sometimes,Never,Never,Never,High,High,Low,Low,Improve
4,Teenager,4.0,R&B,Never,Never,Rarely,Never,Rarely,Very frequently,Never,...,Sometimes,Very frequently,Very frequently,Never,Rarely,High,Low,Medium,High,Improve


### Checking for columns with null values

In [115]:
# Count missing values per column and list the columns with the most gaps first.
df.isna().sum().sort_values(ascending=False)

Music effects                   8
Age                             1
Fav genre                       0
Frequency [Classical]           0
OCD                             0
Insomnia                        0
Depression                      0
Anxiety                         0
Frequency [Video game music]    0
Frequency [Rock]                0
Frequency [Rap]                 0
Frequency [R&B]                 0
Frequency [Pop]                 0
Frequency [Metal]               0
Hours per day                   0
Frequency [Latin]               0
Frequency [K pop]               0
Frequency [Jazz]                0
Frequency [Hip hop]             0
Frequency [Gospel]              0
Frequency [Folk]                0
Frequency [EDM]                 0
Frequency [Country]             0
Frequency [Lofi]                0
dtype: int64

In [116]:
# Show the most frequent value (mode) of 'Music effects', the column with the
# most missing entries, before using it for imputation.
print(df['Music effects'].mode())

0    Improve
Name: Music effects, dtype: object


In [117]:
# Impute missing 'Music effects' answers with the column's most frequent
# value ("Improve" for this dataset).
most_common_effect = df['Music effects'].mode()[0]
df['Music effects'] = df['Music effects'].fillna(most_common_effect)
df.head()

Unnamed: 0,Age,Hours per day,Fav genre,Frequency [Classical],Frequency [Country],Frequency [EDM],Frequency [Folk],Frequency [Gospel],Frequency [Hip hop],Frequency [Jazz],...,Frequency [Pop],Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects
0,Teenager,3.0,Latin,Rarely,Never,Rarely,Never,Never,Sometimes,Never,...,Very frequently,Sometimes,Very frequently,Never,Sometimes,Low,Low,Low,Low,Improve
1,Senior,1.5,Rock,Sometimes,Never,Never,Rarely,Sometimes,Rarely,Very frequently,...,Sometimes,Sometimes,Rarely,Very frequently,Rarely,High,Low,Low,Low,Improve
2,Teenager,4.0,Video game music,Never,Never,Very frequently,Never,Never,Rarely,Rarely,...,Rarely,Never,Rarely,Rarely,Very frequently,High,High,High,Low,No effect
3,Senior,2.5,Jazz,Sometimes,Never,Never,Rarely,Sometimes,Never,Very frequently,...,Sometimes,Sometimes,Never,Never,Never,High,High,Low,Low,Improve
4,Teenager,4.0,R&B,Never,Never,Rarely,Never,Rarely,Very frequently,Never,...,Sometimes,Very frequently,Very frequently,Never,Rarely,High,Low,Medium,High,Improve


In [118]:
# Discretize daily listening time into three left-closed bands (right=False):
# [-inf, 2) Low Hours, [2, 4) Medium Hours, [4, inf) High Hours.
labels = ['Low Hours', 'Medium Hours', 'High Hours']
bins = [-np.inf, 2, 4, np.inf]
hours_per_day = df['Hours per day']
df['Hours per day'] = pd.cut(hours_per_day, bins, right=False, labels=labels)


df.head()

Unnamed: 0,Age,Hours per day,Fav genre,Frequency [Classical],Frequency [Country],Frequency [EDM],Frequency [Folk],Frequency [Gospel],Frequency [Hip hop],Frequency [Jazz],...,Frequency [Pop],Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects
0,Teenager,Medium Hours,Latin,Rarely,Never,Rarely,Never,Never,Sometimes,Never,...,Very frequently,Sometimes,Very frequently,Never,Sometimes,Low,Low,Low,Low,Improve
1,Senior,Low Hours,Rock,Sometimes,Never,Never,Rarely,Sometimes,Rarely,Very frequently,...,Sometimes,Sometimes,Rarely,Very frequently,Rarely,High,Low,Low,Low,Improve
2,Teenager,High Hours,Video game music,Never,Never,Very frequently,Never,Never,Rarely,Rarely,...,Rarely,Never,Rarely,Rarely,Very frequently,High,High,High,Low,No effect
3,Senior,Medium Hours,Jazz,Sometimes,Never,Never,Rarely,Sometimes,Never,Very frequently,...,Sometimes,Sometimes,Never,Never,Never,High,High,Low,Low,Improve
4,Teenager,High Hours,R&B,Never,Never,Rarely,Never,Rarely,Very frequently,Never,...,Sometimes,Very frequently,Very frequently,Never,Rarely,High,Low,Medium,High,Improve


In [119]:
# Keep the 8 most common favourite genres; every other genre becomes 'Other'.
fav_genre = df["Fav genre"]
top_genres = fav_genre.value_counts().nlargest(8).index

# The cleaned column is appended as the last column, then the raw one is removed.
df = df.assign(
    Fav_genre_clean=fav_genre.where(fav_genre.isin(top_genres), "Other")
).drop(columns=["Fav genre"])

df.head()

Unnamed: 0,Age,Hours per day,Frequency [Classical],Frequency [Country],Frequency [EDM],Frequency [Folk],Frequency [Gospel],Frequency [Hip hop],Frequency [Jazz],Frequency [K pop],...,Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects,Fav_genre_clean
0,Teenager,Medium Hours,Rarely,Never,Rarely,Never,Never,Sometimes,Never,Very frequently,...,Sometimes,Very frequently,Never,Sometimes,Low,Low,Low,Low,Improve,Other
1,Senior,Low Hours,Sometimes,Never,Never,Rarely,Sometimes,Rarely,Very frequently,Rarely,...,Sometimes,Rarely,Very frequently,Rarely,High,Low,Low,Low,Improve,Rock
2,Teenager,High Hours,Never,Never,Very frequently,Never,Never,Rarely,Rarely,Very frequently,...,Never,Rarely,Rarely,Very frequently,High,High,High,Low,No effect,Video game music
3,Senior,Medium Hours,Sometimes,Never,Never,Rarely,Sometimes,Never,Very frequently,Sometimes,...,Sometimes,Never,Never,Never,High,High,Low,Low,Improve,Other
4,Teenager,High Hours,Never,Never,Rarely,Never,Rarely,Very frequently,Never,Very frequently,...,Very frequently,Very frequently,Never,Rarely,High,Low,Medium,High,Improve,R&B


In [120]:
# Every per-genre listening-frequency column of the survey.
FREQ_COLS = [c for c in df.columns if c.startswith("Frequency [")]

def listens(x):
    """Return True when a frequency answer counts as listening to the genre.

    Only "Sometimes" and "Very frequently" count; any other value yields False.
    """
    return x in ["Sometimes", "Very frequently"]

for col in FREQ_COLS:
    # Skip columns that are already boolean: feeding True/False back into
    # `listens` returns False for every row, so executing this cell a second
    # time would silently wipe all genre flags.
    if not pd.api.types.is_bool_dtype(df[col]):
        df[col] = df[col].apply(listens)


In [121]:
# Preview the frame with the "Frequency [...]" columns converted to booleans.
df.head()

Unnamed: 0,Age,Hours per day,Frequency [Classical],Frequency [Country],Frequency [EDM],Frequency [Folk],Frequency [Gospel],Frequency [Hip hop],Frequency [Jazz],Frequency [K pop],...,Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Depression,Insomnia,OCD,Music effects,Fav_genre_clean
0,Teenager,Medium Hours,False,False,False,False,False,True,False,True,...,True,True,False,True,Low,Low,Low,Low,Improve,Other
1,Senior,Low Hours,True,False,False,False,True,False,True,False,...,True,False,True,False,High,Low,Low,Low,Improve,Rock
2,Teenager,High Hours,False,False,True,False,False,False,False,True,...,False,False,False,True,High,High,High,Low,No effect,Video game music
3,Senior,Medium Hours,True,False,False,False,True,False,True,True,...,True,False,False,False,High,High,Low,Low,Improve,Other
4,Teenager,High Hours,False,False,False,False,False,True,False,True,...,True,True,False,False,High,Low,Medium,High,Improve,R&B


In [None]:
# Build one transaction (a list of item labels) per survey respondent.
# NOTE(review): the binned Age column is not turned into an item — confirm
# that this omission is intended.

# Genre name for each frequency column, e.g. "Frequency [Rock]" -> "Rock".
genre_names = {
    col: col.replace("Frequency [", "").replace("]", "") for col in FREQ_COLS
}

transactions = []
for _, record in df.iterrows():
    items = [
        f"FavGenre_{record['Fav_genre_clean']}",
        f"Listening_{record['Hours per day']}",
    ]
    items += [f"{col}_{record[col]}" for col in MENTAL_COLS]
    items.append(f"MusicEffect_{record['Music effects']}")
    # Only genres whose flag is truthy contribute a "Listens_<genre>" item.
    items += [f"Listens_{genre_names[col]}" for col in FREQ_COLS if record[col]]
    transactions.append(items)


In [129]:
# Persist the transactions as a single-column CSV without the index column.
transactions_frame = pd.DataFrame({"transaction": transactions})
transactions_frame.to_csv("../dataset/transactions.csv", index=False)