In [8]:
import pandas as pd
#Importing the pandas library to work with dataframes

#Reading in Json Files to pd dataframe
df = pd.read_json('StreamingHistory0.json')

df.reset_index(inplace=True)
#Resetting the index to make it easier to work with, this operation adds a new index column and modifies...
#The original dataframe without creating a new one.

df.rename(columns={'index': 'Index'}, inplace=True)
#Renaming the newly created index column to 'Index' for clarity

df = df.rename(columns={'endtime' : 'timeEnded','artistName': 'Artist', 'trackName': 'Track', 'msPlayed': 'Duration'})
#Renaming the columns to make it easier to read and understand

df = df[~((df['Artist'] == 'Unknown Artist') & (df['Track'] == 'Unknown Track'))]
#Removing rows where the artist and track are unknown, the ~ symbol is used to negate the condition
#so rows that do not meet the condition are kept.

pd.set_option('display.max_colwidth', 100)
#Setting the maximum column width to 100 to make it easier to read the data
print(df.head(10))
#Printing the first 10 rows of the dataframe to see the data

    Index           endTime                      Artist  \
22     22  2022-09-05 15:26                    Lil Baby   
23     23  2022-09-05 15:28                      DaBaby   
24     24  2022-09-05 15:31                      Offset   
25     25  2022-09-05 15:34  YoungBoy Never Broke Again   
26     26  2022-09-05 15:37                  Gucci Mane   
27     27  2022-09-05 15:39                   Lil Tecca   
28     28  2022-09-05 15:41                    Lil Durk   
29     29  2022-09-05 15:44                  Nardo Wick   
30     30  2022-09-05 15:46                       Quavo   
31     31  2022-09-05 15:50                    Rod Wave   

                                Track  Duration  
22                              Detox      1557  
23                           WAITRESS    132018  
24          CODE (feat. Moneybagg Yo)    192427  
25   Home Ain't Home (feat. Rod Wave)    141166  
26                    Dissin the Dead    228922  
27                            Treesha    105693  


In [10]:
# Calculate the total duration for each artist
artist_durations = df.groupby('Artist')['Duration'].sum()

#top 10 artists
top_10_artists = artist_durations.sort_values(ascending=False).head(10)
print(top_10_artists)

Artist
Zach Bryan                       162480431
Jason Isbell                      78288106
Morgan Wallen                     43633966
Eminem                            43325520
Jason Isbell and the 400 Unit     36601313
                                   ...    
Gordon Lightfoot                   2023379
Lupe Fiasco                        1986352
Bob Dylan                          1923025
Barry Gibb                         1900527
Method Man                         1890361
Name: Duration, Length: 100, dtype: int64


In [11]:
# Calculate the total duration for each artist
artist_durations = df.groupby('Artist')['Duration'].sum()

# Find the top 10 artists with the longest total durations
top_10_artists = artist_durations.nlargest(10)

print(top_10_artists)

Artist
Zach Bryan                       162480431
Jason Isbell                      78288106
Morgan Wallen                     43633966
Eminem                            43325520
Jason Isbell and the 400 Unit     36601313
Glen Campbell                     24883597
Hank Williams, Jr.                24265528
Jimmy Buffett                     21103990
The White Stripes                 19268186
Tyler Childers                    18486320
Name: Duration, dtype: int64


In [5]:
# Convert duration from milliseconds to seconds
df['Duration'] = df['Duration'] / 1000

# Convert duration from seconds to hours
df['Duration'] = df['Duration'] / 3600

# If you want to convert the duration to a time format (hours:minutes:seconds), you can do this:
df['Duration'] = pd.to_timedelta(df['Duration'], unit='h')

# Now, when you calculate the total duration for each artist and find the top 10, the durations will be in hours and minutes
artist_durations = df.groupby('Artist')['Duration'].sum()
top_10_artists = artist_durations.sort_values(ascending=False).head(10)

print(top_10_artists)

Artist
Zach Bryan                      1 days 21:08:00.430999478
Jason Isbell                    0 days 21:44:48.105999877
Morgan Wallen                   0 days 12:07:13.965999886
Eminem                          0 days 12:02:05.519999939
Jason Isbell and the 400 Unit   0 days 10:10:01.312999909
Glen Campbell                   0 days 06:54:43.596999891
Hank Williams, Jr.              0 days 06:44:25.527999904
Jimmy Buffett                   0 days 05:51:43.989999993
The White Stripes               0 days 05:21:08.185999937
Tyler Childers                  0 days 05:08:06.319999937
Name: Duration, dtype: timedelta64[ns]


In [16]:
# Group by 'Track' and sum 'Duration' for each track
track_durations = df.groupby('Track')['Duration'].sum()

# Sort the durations in descending order
sorted_tracks = track_durations.sort_values(ascending=False)

# Get the most popular track (the first one after sorting)
most_popular_track = sorted_tracks.index[0]
most_popular_duration = sorted_tracks.iloc[0]

# Convert milliseconds to minutes and hours
total_seconds = most_popular_duration / 1000
total_minutes = total_seconds / 60
hours = int(total_minutes // 60)
minutes = int(total_minutes % 60)

print(f"The most popular track is '{most_popular_track}' with a total duration of {hours} hours and {minutes} minutes.")


The most popular track is 'Cover Me Up' with a total duration of 3 hours and 4 minutes.


In [38]:
#These are songs I "turned off after the first minute " or "skipped after the first minute"
# Step 1: Filter out rows where 'Duration' is NaN or <= 0
df_filtered_positive = df[(df['Duration'].notna()) & (df['Duration'] >= 60000)]

# Step 2: Group by 'Artist' and 'Track' to sum the 'Duration' for each track
track_durations_filtered_positive = df_filtered_positive.groupby(['Artist', 'Track'])['Duration'].sum()

# Step 3: Sort the durations in ascending order
sorted_tracks_filtered_positive = track_durations_filtered_positive.sort_values(ascending=True)

# Step 4: Select the first 10 tracks
top_10_least_popular_tracks = sorted_tracks_filtered_positive.head(10)

# Display the top 10 least popular tracks
for (artist, track), duration in top_10_least_popular_tracks.items():
    total_seconds = duration / 1000
    total_minutes = total_seconds / 60
    hours = int(total_minutes // 60)
    minutes = int(total_minutes % 60)
    print(f"'{track}' by '{artist}' with a total duration of {hours} hours and {minutes} minutes.")

'Certified (feat. Gunna)' by 'Pooh Shiesty' with a total duration of 0 hours and 1 minutes.
'circle the drain' by 'Soccer Mommy' with a total duration of 0 hours and 1 minutes.
'Ken Kaniff - Skit' by 'Eminem' with a total duration of 0 hours and 1 minutes.
'It's a Man's, Man’s, Man’s World' by 'Jason Isbell and the 400 Unit' with a total duration of 0 hours and 1 minutes.
'Don't It Make You Wanna Dance' by 'Rusty Wier' with a total duration of 0 hours and 1 minutes.
'This CD Is in Stores' by 'Mitch Hedberg' with a total duration of 0 hours and 1 minutes.
'Tayla (every night)' by 'Fred again..' with a total duration of 0 hours and 1 minutes.
'Don't Carry It All' by 'The Decemberists' with a total duration of 0 hours and 1 minutes.
'5% TINT' by 'Travis Scott' with a total duration of 0 hours and 1 minutes.
'Pure Water (with Migos)' by 'Mustard' with a total duration of 0 hours and 1 minutes.
