In [None]:
import pandas as pd
import json

In [None]:
def _read_history(path):
    """Parse the JSON file at ``path`` and return its contents as a DataFrame.

    The file is opened as UTF-8 inside a ``with`` block so the handle is
    closed as soon as parsing finishes, rather than being left open until
    garbage collection.
    """
    with open(path, encoding='utf-8') as fh:
        return pd.DataFrame(json.load(fh))


# One DataFrame per streaming-history file; the names sh0/sh1/sh2 are used by later cells.
sh0 = _read_history('data/MyData/StreamingHistory0.json')
sh1 = _read_history('data/MyData/StreamingHistory1.json')
sh2 = _read_history('data/MyData/StreamingHistory2.json')

In [None]:
# Preview the first five rows of the first history file.
sh0.head(n=5)

In [None]:
# Preview the first five rows of the second history file.
sh1.head(n=5)

In [None]:
# Preview the first five rows of the third history file.
sh2.head(n=5)

In [None]:
# Stack the three history frames into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# input frame's own row labels are carried over, so labels can repeat in the
# combined frame and label-based lookups become ambiguous.
stream_hist = pd.concat([sh0, sh1, sh2], ignore_index=True)

#### Basic EDA without cleaning

In [None]:
# Exploratory summary of the combined, uncleaned history frame.

# Preview the first few rows.
print("First 5 rows:")
print(stream_hist.head())

# Column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
print("\nDataFrame Info:")
stream_hist.info()

# Summary statistics of the numerical columns.
print("\nSummary Statistics:")
print(stream_hist.describe())

# Number of distinct artist names.
num_artists = stream_hist['artistName'].nunique()
print("\nNumber of Unique Artists:", num_artists)

# Number of distinct track names.
num_tracks = stream_hist['trackName'].nunique()
print("Number of Unique Tracks:", num_tracks)

# Total listening time: msPlayed is summed and converted from ms to minutes.
total_duration_min = stream_hist['msPlayed'].sum() / (1000 * 60)
print("Total Duration (minutes):", total_duration_min)

# Five artists with the most rows (i.e. ranked by play count, not by time played).
top_artists = stream_hist['artistName'].value_counts().head(5)
print("\nTop 5 Artists:")
print(top_artists)

# Five track names with the most rows (ranked by play count; tracks sharing a
# name are counted together because only trackName is used).
top_tracks = stream_hist['trackName'].value_counts().head(5)
print("\nTop 5 Tracks:")
print(top_tracks)

#### Cleaning

In [None]:
# Rows whose msPlayed is at or below this many milliseconds (15 seconds) are dropped.
MIN_PLAY_MS = 15_000

# Build the cleaned frame as one chain in which every step returns a new
# object. Calling inplace=True methods on a boolean-mask slice of stream_hist
# can raise pandas' SettingWithCopyWarning and makes the cell depend on
# step-by-step mutation; the chain avoids both and leaves stream_hist untouched.
cleaned_df = (
    stream_hist[stream_hist['msPlayed'] > MIN_PLAY_MS]
    # Remove duplicate entries based on artistName, trackName, and endTime.
    .drop_duplicates(subset=['artistName', 'trackName', 'endTime'])
    # Remove rows that contain any missing value.
    .dropna()
    # Renumber the remaining rows 0..n-1, discarding the old index.
    .reset_index(drop=True)
)

#### Same EDA after cleaning

In [None]:
# Exploratory summary of the cleaned history frame (same report as before cleaning).

# Preview the first few rows.
print("First 5 rows:")
print(cleaned_df.head())

# Column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
print("\nDataFrame Info:")
cleaned_df.info()

# Summary statistics of the numerical columns.
print("\nSummary Statistics:")
print(cleaned_df.describe())

# Number of distinct artist names.
num_artists = cleaned_df['artistName'].nunique()
print("\nNumber of Unique Artists:", num_artists)

# Number of distinct track names.
num_tracks = cleaned_df['trackName'].nunique()
print("Number of Unique Tracks:", num_tracks)

# Total listening time: msPlayed is summed and converted from ms to minutes.
total_duration_min = cleaned_df['msPlayed'].sum() / (1000 * 60)
print("Total Duration (minutes):", total_duration_min)

# Five artists with the most rows (i.e. ranked by play count, not by time played).
top_artists = cleaned_df['artistName'].value_counts().head(5)
print("\nTop 5 Artists:")
print(top_artists)

# Five track names with the most rows (ranked by play count; tracks sharing a
# name are counted together because only trackName is used).
top_tracks = cleaned_df['trackName'].value_counts().head(5)
print("\nTop 5 Tracks:")
print(top_tracks)