In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('C:\\Users\\bista\\Documents\\Downloads\\DataWave_Music_Sprint_Dataset.csv')
print(df.head())



  user_id  country  age gender subscription_type  \
0   U1403    Nepal   23      F            Premum   
1   U1165    Nepal   63   male           Student   
2   U1474      USA   45      F            Premum   
3   U1478  Nigeria   16      F            Family   
4   U1609    Ghana   29      M              Free   

   avg_listening_hours_per_week  total_songs_played skip_rate  \
0                          11.5                 510        5%   
1                          11.3                 526        5%   
2                           2.0                 476       30%   
3                          12.1                 529       20%   
4                           6.4                 477       0.2   

   satisfaction_score churned monthly_fee   join_date  
0                 2.0       0        7.99  03/21/2024  
1                 3.0       0           0  08/10/2023  
2                 3.0       0        7.99  04/05/2022  
3                 5.0     yes       12.99  12/16/2023  
4               

In [8]:
# Standardise column names
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

In [9]:
df['subscription_type'] = df['subscription_type'].str.lower().str.strip()

df['subscription_type'] = df['subscription_type'].replace({
    'premum': 'premium',
    'premiun': 'premium',
    'prem': 'premium'
    
})


In [10]:
# 4. CLEAN GENDER COLUMN
df['gender'] = df['gender'].str.lower().str.strip()
df['gender'] = df['gender'].replace({
    'f' : 'female',
    'm' : 'male',
})

df['gender'] = df['gender'].fillna('unknown')


In [None]:
# 5. CLEAN NUMERIC COLUMNS

# Correct numeric columns based on your dataset
cols_numeric = [
    'age',
    'avg_listening_hours_per_week',
    'total_songs_played',
    'skip_rate',
    'satisfaction_score',
    'monthly_fee'
]

print(df[cols_numeric].head())


for col in cols_numeric:
    df[col] = pd.to_numeric(df[col], errors='coerce')



   age  avg_listening_hours_per_week  total_songs_played skip_rate  \
0   23                          11.5                 510        5%   
1   63                          11.3                 526        5%   
2   45                           2.0                 476       30%   
3   16                          12.1                 529       20%   
4   29                           6.4                 477       0.2   

   satisfaction_score monthly_fee  
0                 2.0        7.99  
1                 3.0           0  
2                 3.0        7.99  
3                 5.0       12.99  
4                 3.0        7.99  


In [None]:
# 6. FIX SKIP RATE COLUMN

df['skip_rate_pct'] = (
    df['skip_rate']
    .astype(str)
    .str.replace("%", "")
    .astype(float)
)


In [None]:
 # 7. CONVERT CHURNED COLUMN

# Use the correct column name
df['churned'] = df['churned'].astype(str).str.lower().str.strip()

df['churned'] = df['churned'].replace({
    'yes': 1,
    'no': 0,
    '1': 1,
    '0': 0
}).astype(int)

# Check if conversion worked
print(df['churned'].unique())


[0 1]


  df['churned'] = df['churned'].replace({


In [None]:
# 8. FIX JOIN_DATE COLUMN

# Convert join_date to datetime
df['join_date'] = pd.to_datetime(df['join_date'], errors='coerce')

# Create tenure in days
today = pd.Timestamp("2025-11-24")
df['tenure_days'] = (today - df['join_date']).dt.days

# Check the first few rows
print(df[['join_date', 'tenure_days']].head())


   join_date  tenure_days
0 2024-03-21        613.0
1 2023-08-10        837.0
2 2022-04-05       1329.0
3 2023-12-16        709.0
4 2022-03-28       1337.0


In [None]:
# 9 HANDLE MISSING VALUES

df.fillna({
    'gender': 'unknown',
    'subscription_type': 'unknown'
}, inplace=True)

# Make sure cols_numeric includes the correct numeric columns
cols_numeric = [
    'age',
    'avg_listening_hours_per_week',
    'total_songs_played',
    'skip_rate',
    'satisfaction_score',
    'monthly_fee',
    'tenure_days'
]

for col in cols_numeric:
    df[col] = df[col].fillna(df[col].median())


# 10. REMOVE OUTLIERS

df = df[(df['age'] >= 10) & (df['age'] <= 90)]
df = df[(df['avg_listening_hours_per_week'] >= 0) & (df['avg_listening_hours_per_week'] <= 200)]

print(df.head())



  user_id  country  age  gender subscription_type  \
0   U1403    Nepal   23  female           premium   
1   U1165    Nepal   63    male           student   
2   U1474      USA   45  female           premium   
3   U1478  Nigeria   16  female            family   
4   U1609    Ghana   29    male              free   

   avg_listening_hours_per_week  total_songs_played  skip_rate  \
0                          11.5                 510        0.2   
1                          11.3                 526        0.2   
2                           2.0                 476        0.2   
3                          12.1                 529        0.2   
4                           6.4                 477        0.2   

   satisfaction_score  churned  monthly_fee  join_date  skip_rate_pct  \
0                 2.0        0         7.99 2024-03-21            NaN   
1                 3.0        0         0.00 2023-08-10            NaN   
2                 3.0        0         7.99 2022-04-05           

In [16]:
# -------------------------------
# 11. BASIC ANALYSIS
# -------------------------------

print("Overall churn rate:", df['churned'].mean())

print("\nChurn rate by subscription type:")
print(df.groupby('subscription_type')['churned'].mean().sort_values(ascending=False))

print("\nChurn vs Satisfaction:")
print(df[['churned', 'satisfaction_score']].groupby('churned').mean())

print("\nChurn vs Listening hours:")
print(df[['churned', 'avg_listening_hours_per_week']].groupby('churned').mean())

print("\nChurn vs Skip Rate:")
# Use 'skip_rate' if 'skip_rate_pct' does not exist
if 'skip_rate_pct' in df.columns:
    print(df[['churned', 'skip_rate_pct']].groupby('churned').mean())
else:
    print(df[['churned', 'skip_rate']].groupby('churned').mean())

# -------------------------------
# 12. SAVE CLEANED FILE
# -------------------------------

df.to_csv("datawave_music_cleaned.csv", index=False)
print("\nCleaned dataset saved as: datawave_music_cleaned.csv")


Overall churn rate: 0.30354609929078014

Churn rate by subscription type:
subscription_type
fam        0.387755
student    0.311321
premium    0.308511
studnt     0.282828
family     0.279279
free       0.252427
Name: churned, dtype: float64

Churn vs Satisfaction:
         satisfaction_score
churned                    
0                  3.107943
1                  3.172897

Churn vs Listening hours:
         avg_listening_hours_per_week
churned                              
0                           10.137475
1                            9.750467

Churn vs Skip Rate:
         skip_rate_pct
churned               
0             0.159223
1             0.142857

Cleaned dataset saved as: datawave_music_cleaned.csv
