In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

In [2]:
# Load the raw churn dataset; the first CSV column becomes the row index.
df = pd.read_csv(
    '../data/spotify_churn_dataset.csv',
    index_col=0,
)
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 1 to 8000
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 8000 non-null   object 
 1   age                    8000 non-null   int64  
 2   country                8000 non-null   object 
 3   subscription_type      8000 non-null   object 
 4   listening_time         8000 non-null   int64  
 5   songs_played_per_day   8000 non-null   int64  
 6   skip_rate              8000 non-null   float64
 7   device_type            8000 non-null   object 
 8   ads_listened_per_week  8000 non-null   int64  
 9   offline_listening      8000 non-null   int64  
 10  is_churned             8000 non-null   int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 750.0+ KB


In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0_level_0,gender,age,country,subscription_type,listening_time,songs_played_per_day,skip_rate,device_type,ads_listened_per_week,offline_listening,is_churned
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Female,54,CA,Free,26,23,0.2,Desktop,31,0,1
2,Other,33,DE,Family,141,62,0.34,Web,0,1,0
3,Male,38,AU,Premium,199,38,0.04,Mobile,0,1,1
4,Female,22,CA,Student,36,2,0.31,Mobile,0,1,0
5,Other,29,US,Family,250,57,0.36,Mobile,0,1,1


In [4]:
# Separate the churn label (target) from the predictor columns.
y = df['is_churned']
X = df.drop(columns=['is_churned'])

In [5]:
# Names of the object-dtype feature columns (treated as categoricals below).
obj_columns = X.select_dtypes(include='object').columns
# Print the distinct values of each of those columns, one line per column.
for name, values in X[obj_columns].items():
    print(f"{name}: {values.unique()}")

gender: ['Female' 'Other' 'Male']
country: ['CA' 'DE' 'AU' 'US' 'UK' 'IN' 'FR' 'PK']
subscription_type: ['Free' 'Family' 'Premium' 'Student']
device_type: ['Desktop' 'Web' 'Mobile']


In [6]:
# Names of the int64/float64 feature columns (rescaled by the preprocessor).
numeric_dtypes = ['int64', 'float64']
num_columns = X.select_dtypes(include=numeric_dtypes).columns
num_columns

Index(['age', 'listening_time', 'songs_played_per_day', 'skip_rate',
       'ads_listened_per_week', 'offline_listening'],
      dtype='object')

In [7]:
# Data transformation pipeline: one-hot encode the object columns and
# min-max scale the numeric columns. Step order ('cat', then 'num') fixes
# the column order of the transformed output.
categorical_step = ('cat', OneHotEncoder(), obj_columns)
numeric_step = ('num', MinMaxScaler(), num_columns)
preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])

In [8]:
# Fit the preprocessing steps on the feature matrix X (not on df, which still
# contains the target column) and rebuild a labelled DataFrame from the result.
# NOTE(review): the encoder/scaler are fit on every row; if a train/test split
# is done downstream, fit on the training rows only to avoid leakage.
transformed_values = preprocessor.fit_transform(X)
# ColumnTransformer may return a scipy sparse matrix when the stacked output is
# sparse enough; densify so the DataFrame gets one numeric column per feature.
if hasattr(transformed_values, 'toarray'):
    transformed_values = transformed_values.toarray()
encoded_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(obj_columns)
# Output columns follow the transformer order: 'cat' features first, then 'num'.
all_features = list(encoded_columns) + list(num_columns)
assert transformed_values.shape[1] == len(all_features), "feature names do not match transformed columns"
df_tranformed = pd.DataFrame(transformed_values, columns=all_features, index=X.index)
df_tranformed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 1 to 8000
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   gender_Female              8000 non-null   float64
 1   gender_Male                8000 non-null   float64
 2   gender_Other               8000 non-null   float64
 3   country_AU                 8000 non-null   float64
 4   country_CA                 8000 non-null   float64
 5   country_DE                 8000 non-null   float64
 6   country_FR                 8000 non-null   float64
 7   country_IN                 8000 non-null   float64
 8   country_PK                 8000 non-null   float64
 9   country_UK                 8000 non-null   float64
 10  country_US                 8000 non-null   float64
 11  subscription_type_Family   8000 non-null   float64
 12  subscription_type_Free     8000 non-null   float64
 13  subscription_type_Premium  8000 non-null   float64
 1

In [11]:
# Re-attach the churn label (aligned on the user_id index) as the last column.
df_tranformed = df_tranformed.assign(is_churned=y)
df_tranformed.head(n=5)

Unnamed: 0_level_0,gender_Female,gender_Male,gender_Other,country_AU,country_CA,country_DE,country_FR,country_IN,country_PK,country_UK,...,device_type_Desktop,device_type_Mobile,device_type_Web,age,listening_time,songs_played_per_day,skip_rate,ads_listened_per_week,offline_listening,is_churned
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.883721,0.055363,0.22449,0.333333,0.632653,0.0,1
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.395349,0.453287,0.622449,0.566667,0.0,1.0,0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.511628,0.653979,0.377551,0.066667,0.0,1.0,1
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.139535,0.089965,0.010204,0.516667,0.0,1.0,0
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.302326,0.83045,0.571429,0.6,0.0,1.0,1


In [12]:
# Save the transformed dataframe, keeping the index as the first CSV column.
output_path = '../data/spotify_churn_dataset_tranformed.csv'
df_tranformed.to_csv(output_path, index=True)