# Data loading and importing libraries

In [78]:
!pip install numpy pandas matplotlib seaborn scikit-learn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



In [79]:
music = pd.read_csv('/Users/amastikbayev/Desktop/useful for ds:da:ml/portfolio/music preferences/music_project_en.csv')
music.head()

Unnamed: 0,userID,Track,artist,genre,City,time,Day
0,FFB692EC,Kamigata To Boots,The Mass Missile,rock,Shelbyville,20:28:33,Wednesday
1,55204538,Delayed Because of Accident,Andreas Rönnberg,rock,Springfield,14:07:09,Friday
2,20EC38,Funiculì funiculà,Mario Lanza,pop,Shelbyville,20:58:07,Wednesday
3,A3DD03C9,Dragons in the Sunset,Fire + Ice,folk,Shelbyville,08:37:09,Monday
4,E2DC1FAE,Soul People,Space Echo,dance,Springfield,08:34:34,Monday


# Exploratory Data Analysis (EDA)

In [80]:
music.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65079 entries, 0 to 65078
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0     userID  65079 non-null  object
 1   Track     63736 non-null  object
 2   artist    57512 non-null  object
 3   genre     63881 non-null  object
 4     City    65079 non-null  object
 5   time      65079 non-null  object
 6   Day       65079 non-null  object
dtypes: object(7)
memory usage: 3.5+ MB


In [81]:
music.shape

(65079, 7)

In [82]:
music.describe()

Unnamed: 0,userID,Track,artist,genre,City,time,Day
count,65079,63736,57512,63881,65079,65079,65079
unique,41748,39666,37806,268,2,20392,3
top,A8AE9169,Brand,Kartvelli,pop,Springfield,08:14:07,Friday
freq,76,136,136,8850,45360,14,23149


# Data preprocessing

In [83]:
music.isna().sum() #check for missing values

  userID       0
Track       1343
artist      7567
genre       1198
  City         0
time           0
Day            0
dtype: int64

In [84]:
music = music.rename(columns= {
    '  userID': 'user_id',
    'Track': 'track',
    'Day': 'day',
    '  City  ': 'city'
})


In [85]:
columns_to_replace = ['track','artist','genre']
for column in columns_to_replace:
    music[column] = music[column].fillna('unknown')

In [86]:
music.columns
music.head()

Unnamed: 0,user_id,track,artist,genre,city,time,day
0,FFB692EC,Kamigata To Boots,The Mass Missile,rock,Shelbyville,20:28:33,Wednesday
1,55204538,Delayed Because of Accident,Andreas Rönnberg,rock,Springfield,14:07:09,Friday
2,20EC38,Funiculì funiculà,Mario Lanza,pop,Shelbyville,20:58:07,Wednesday
3,A3DD03C9,Dragons in the Sunset,Fire + Ice,folk,Shelbyville,08:37:09,Monday
4,E2DC1FAE,Soul People,Space Echo,dance,Springfield,08:34:34,Monday


In [87]:
music.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65079 entries, 0 to 65078
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  65079 non-null  object
 1   track    65079 non-null  object
 2   artist   65079 non-null  object
 3   genre    65079 non-null  object
 4   city     65079 non-null  object
 5   time     65079 non-null  object
 6   day      65079 non-null  object
dtypes: object(7)
memory usage: 3.5+ MB


In [88]:
music.isna().sum() #check for missing values again after filling

user_id    0
track      0
artist     0
genre      0
city       0
time       0
day        0
dtype: int64

In [89]:
music.duplicated().sum() # check for duplicates

3826

In [90]:
music = music.drop_duplicates().reset_index(drop=True) # drop duplicates and reset index

In [91]:
music.duplicated().sum() # Check for duplicates again after dropping them

0

In [92]:
sorted(music['genre'].dropna().unique()) # This will show unique genres in sorted order, excluding NaN

['acid',
 'acoustic',
 'action',
 'adult',
 'africa',
 'afrikaans',
 'alternative',
 'ambient',
 'americana',
 'animated',
 'anime',
 'arabesk',
 'arabic',
 'arena',
 'argentinetango',
 'art',
 'audiobook',
 'avantgarde',
 'axé',
 'baile',
 'balkan',
 'beats',
 'bigroom',
 'black',
 'bluegrass',
 'blues',
 'bollywood',
 'bossa',
 'brazilian',
 'breakbeat',
 'breaks',
 'broadway',
 'cantautori',
 'cantopop',
 'canzone',
 'caribbean',
 'caucasian',
 'celtic',
 'chamber',
 'children',
 'chill',
 'chinese',
 'choral',
 'christian',
 'christmas',
 'classical',
 'classicmetal',
 'club',
 'colombian',
 'comedy',
 'conjazz',
 'contemporary',
 'country',
 'cuban',
 'dance',
 'dancehall',
 'dancepop',
 'dark',
 'death',
 'deep',
 'deutschrock',
 'deutschspr',
 'dirty',
 'disco',
 'dnb',
 'documentary',
 'downbeat',
 'downtempo',
 'drum',
 'dub',
 'dubstep',
 'eastern',
 'easy',
 'electronic',
 'electropop',
 'emo',
 'entehno',
 'epicmetal',
 'estrada',
 'ethnic',
 'eurofolk',
 'european',
 'expe

We see implicit duplicates of the name hiphop. These could be misspellings or alternative names for the same genre:

hip,
hop,
hip-hop.

In [93]:
wrong = ['hip','hop','hip-hop']
correct = 'hiphop'

music.genre.replace(wrong,correct,inplace=True) # replace implicit duplicates with correct name

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  music.genre.replace(wrong,correct,inplace=True) # replace implicit duplicates with correct name


In [94]:
sorted(music['genre'].dropna().unique()) # Check unique genres again after replacement

['acid',
 'acoustic',
 'action',
 'adult',
 'africa',
 'afrikaans',
 'alternative',
 'ambient',
 'americana',
 'animated',
 'anime',
 'arabesk',
 'arabic',
 'arena',
 'argentinetango',
 'art',
 'audiobook',
 'avantgarde',
 'axé',
 'baile',
 'balkan',
 'beats',
 'bigroom',
 'black',
 'bluegrass',
 'blues',
 'bollywood',
 'bossa',
 'brazilian',
 'breakbeat',
 'breaks',
 'broadway',
 'cantautori',
 'cantopop',
 'canzone',
 'caribbean',
 'caucasian',
 'celtic',
 'chamber',
 'children',
 'chill',
 'chinese',
 'choral',
 'christian',
 'christmas',
 'classical',
 'classicmetal',
 'club',
 'colombian',
 'comedy',
 'conjazz',
 'contemporary',
 'country',
 'cuban',
 'dance',
 'dancehall',
 'dancepop',
 'dark',
 'death',
 'deep',
 'deutschrock',
 'deutschspr',
 'dirty',
 'disco',
 'dnb',
 'documentary',
 'downbeat',
 'downtempo',
 'drum',
 'dub',
 'dubstep',
 'eastern',
 'easy',
 'electronic',
 'electropop',
 'emo',
 'entehno',
 'epicmetal',
 'estrada',
 'ethnic',
 'eurofolk',
 'european',
 'expe

**Problems encountered:**

1. unstandardised column names

2. missing values with NULL

3. duplicates

4. misspelled and misswritten values that can affect the analysis

**Data preprocessing and what has been done so far:**

1. names of the columns were standardised for the ease of use

2. duplicates removed (3826 rows)

3. missing values were changed from NULL to 'unknown'

4. wrong data has been standardised to one for the more accurate analyis (hip, hop, hip-hop changed to hip-hop)

# Hypothesis analysis

*Hypothesis 1: comparison of the user behaviour in 2 cities*

this hypothesis says users listen music differently in these 2 cities

In [95]:
city_groupping = music.groupby('city')[['user_id']].count()
city_groupping

Unnamed: 0_level_0,user_id
city,Unnamed: 1_level_1
Shelbyville,18512
Springfield,42741


In [99]:
day_groupping = music.groupby('day')['track'].count()
day_groupping

day
Friday       21840
Monday       21354
Wednesday    18059
Name: track, dtype: int64

In [100]:
city_day_groupping = music.pivot_table(index='city', columns='day', values='track', aggfunc='count')
city_day_groupping

day,Friday,Monday,Wednesday
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Shelbyville,5895,5614,7003
Springfield,15945,15740,11056


So the hypothesis seems to be failed to be rejected since there is a difference in listening to music in weekdays
 
Shellbyvilles listen more on Wednesdays and approximately same on Fridays and Mondays

while Springfield citizens listen more music on Mondays and Fridays while less on Wednesdays

In [108]:
springfield_music = music[music['city'] == 'Springfield']

shelbyville_music = music[music['city'] == 'Shelbyville']

In [104]:
springfield_music.head()

Unnamed: 0,user_id,track,artist,genre,city,time,day
1,55204538,Delayed Because of Accident,Andreas Rönnberg,rock,Springfield,14:07:09,Friday
4,E2DC1FAE,Soul People,Space Echo,dance,Springfield,08:34:34,Monday
6,4CB90AA5,True,Roman Messer,dance,Springfield,13:00:07,Wednesday
7,F03E1C1F,Feeling This Way,Polina Griffith,dance,Springfield,20:47:49,Wednesday
8,8FA1D3BE,L’estate,Julia Dalia,ruspop,Springfield,09:17:40,Friday


In [109]:
shelbyville_music.head()

Unnamed: 0,user_id,track,artist,genre,city,time,day
0,FFB692EC,Kamigata To Boots,The Mass Missile,rock,Shelbyville,20:28:33,Wednesday
2,20EC38,Funiculì funiculà,Mario Lanza,pop,Shelbyville,20:58:07,Wednesday
3,A3DD03C9,Dragons in the Sunset,Fire + Ice,folk,Shelbyville,08:37:09,Monday
5,842029A1,Chains,Obladaet,rusrap,Shelbyville,13:09:41,Friday
9,E772D5C0,Pessimist,unknown,dance,Shelbyville,21:20:49,Wednesday


*Hypothesis 2: different date and time affects the music preference*

in particular in the mornings of Mondays and evenings of Fridays, users listen to different music based on the city

this hypothesis says users listen music differently in these 2 cities at different times 

In [115]:
def genre_weekday(music, d, time1, time2):

    genre_df = music.query(f"day == '{d}' and time >= '{time1}' and time <= '{time2}'")
    genre_df_grouped = genre_df.groupby(by='genre')['genre'].count().sort_values(ascending=False)

    return genre_df_grouped[:10] # Top 20 genres for the specified day and time range

In [118]:
genre_weekday(springfield_music, 'Monday', '06:00', '12:00')

genre
pop            781
dance          549
electronic     480
rock           474
hiphop         286
ruspop         186
world          181
rusrap         175
alternative    164
unknown        161
Name: genre, dtype: int64

In [119]:
genre_weekday(shelbyville_music, 'Monday', '06:00', '12:00')

genre
pop            218
dance          182
rock           162
electronic     147
hiphop          80
ruspop          64
alternative     58
rusrap          55
jazz            44
classical       40
Name: genre, dtype: int64

On monday mornings in both cities there is a similar ranking by genres, the only differences are in Springfield citizens listen to 'world' class music while in Shelbyville more classical and jazz genres

In [120]:
genre_weekday(springfield_music, 'Friday', '16:30', '23:00')

genre
pop            713
rock           517
dance          495
electronic     482
hiphop         273
world          208
ruspop         170
classical      163
alternative    163
rusrap         142
Name: genre, dtype: int64

In [121]:
genre_weekday(shellbyville_music, 'Friday', '16:30', '23:00')

genre
pop            256
rock           216
electronic     216
dance          210
hiphop          97
alternative     63
jazz            61
classical       60
rusrap          59
world           54
Name: genre, dtype: int64

Friday evenings don't change any picture, some genres change but the general figure is the same

so partially the hypothesis is true, users listen to similar music at the start and the end of the week

difference between 2 cities is not significant

'unknown' genres so missing values could change the image since there are 161 listenings that can potentially be the game-changer in the monday morning ranking of Springfield

*Hypothesis 3: Chellyvile is a rap city while Springfield is pop city*

I will research how many users lister to rap and pop music in both cities to conclude on the hypothesis 

In [127]:
springfield_music_genres = springfield_music.groupby('genre')['genre'].count().sort_values(ascending=False)
springfield_music_genres_percentage = (springfield_music_genres / springfield_music_genres.sum()) * 100
springfield_music_genres_percentage[:10]

genre
pop            13.785358
dance          10.376454
rock            9.276807
electronic      8.858005
hiphop          4.903956
classical       3.780913
world           3.350413
alternative     3.226410
ruspop          3.210033
rusrap          2.716361
Name: genre, dtype: float64

In [130]:
shelbyville_music_genres = shelbyville_music.groupby('genre')['genre'].count().sort_values(ascending=False)
shelbyville_music_genres_percentage = (shelbyville_music_genres / shelbyville_music_genres.sum())*100
shelbyville_music_genres_percentage[:10]

genre
pop            13.132022
dance          10.436474
rock           10.150173
electronic      9.377701
hiphop          5.185825
alternative     3.505834
classical       3.489628
rusrap          3.046672
ruspop          2.906223
world           2.781979
Name: genre, dtype: float64

Hypothesis is partly true because Springfield is in fact a pop city, while the false part is that Shelbyvillle is a pop city as well, and not a rap city at all

# Conclusions

I checked three hypothesis and found out that: 

1. day of the week differently affect the behavior of the users in 2 cities

hypothesis 1 is true

2. music genres do not change significantly throughout the week in both cities, some changes happen at the start and the end of the week

hypothesis 2 is partly true

3. both cities listen to similar music, both of them are pop cities

hypothesis 3 is rejected 