In [1]:
# Creating Dummy Variables for Machine Learning

# Imports
import pandas as pd

# Read the music dataset from its CSV file (path is relative to this notebook)
music_df = pd.read_csv('../../_datasets/music.csv')

# EDA: summary statistics as computed by DataFrame.describe()
describe_summary = music_df.describe()
print(describe_summary)

             age     gender
count  18.000000  18.000000
mean   27.944444   0.500000
std     5.127460   0.514496
min    20.000000   0.000000
25%    25.000000   0.000000
50%    28.000000   0.500000
75%    31.000000   1.000000
max    37.000000   1.000000


In [2]:
# EDA: structural overview of the frame.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() would emit a stray "None" line.
music_df.info()
print('The shape of the DF: ', music_df.shape)

# Listing the categories present in 'genre' and how many rows each one has
print(music_df.value_counts('genre'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   age     18 non-null     int64 
 1   gender  18 non-null     int64 
 2   genre   18 non-null     object
dtypes: int64(2), object(1)
memory usage: 560.0+ bytes
None
The shape of the DF:  (18, 3)
genre
Classical    6
Acoustic     3
Dance        3
HipHop       3
Jazz         3
dtype: int64


In [3]:
# Build dummy (indicator) variables for 'genre' with pandas.
#
# Why drop_first=True:
#   * Multicollinearity occurs when two or more independent variables are
#     highly correlated with each other.
#   * If every category kept its own dummy column, the dummy columns would
#     always sum to one, which is perfect multicollinearity for the model.
#   * Leaving one category out (the "reference" or "baseline" category) avoids
#     that, and lets the model estimate a coefficient for each remaining
#     category relative to the baseline.
#   * Which category to leave out is somewhat arbitrary; it depends on the
#     context and the research question being addressed.
genre_column = music_df['genre']
music_dummies = pd.get_dummies(genre_column, drop_first=True)


In [4]:
# Place the dummy columns next to the original columns (column-wise concat),
# then remove the raw 'genre' column, which we don't need anymore.
# Keep in mind that one category has no dummy column of its own (Acoustic in
# this example): it is represented by all dummy columns being 0.
music_dum_df = pd.concat([music_df, music_dummies], axis=1).drop(columns='genre')