In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC

In [2]:
# Relative path to the raw churn dataset CSV (resolved against the kernel's
# current working directory).
DF_PATH = '../../data/spotify_churn_dataset.csv'

In [3]:
# Read the churn dataset, taking the first CSV column as the row index, then
# print the schema summary (column dtypes, non-null counts, memory usage).
df = pd.read_csv(filepath_or_buffer=DF_PATH, index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 1 to 8000
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gender                 8000 non-null   object 
 1   age                    8000 non-null   int64  
 2   country                8000 non-null   object 
 3   subscription_type      8000 non-null   object 
 4   listening_time         8000 non-null   int64  
 5   songs_played_per_day   8000 non-null   int64  
 6   skip_rate              8000 non-null   float64
 7   device_type            8000 non-null   object 
 8   ads_listened_per_week  8000 non-null   int64  
 9   offline_listening      8000 non-null   int64  
 10  is_churned             8000 non-null   int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 750.0+ KB


In [4]:
# Class balance of the target: number of rows per is_churned value.
df['is_churned'].value_counts()

is_churned
0    5929
1    2071
Name: count, dtype: int64

In [5]:
# Separate the churn label from the features, then hold out 20% of the rows
# as a test split. Stratifying on y preserves the class proportions in both
# splits; the fixed random_state makes the split reproducible.
y = df['is_churned']
X = df.drop(columns='is_churned')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [6]:
# Categorical (object-dtype) feature columns of the training split.
obj_columns = X_train.select_dtypes(include='object').columns

# Print the distinct values of each categorical feature, taken from the full
# feature frame X.
for col in obj_columns:
    print(f"{col}: {X[col].unique()}")

# Positional indices of the categorical columns within X_train; these are
# passed to SMOTENC as `categorical_features` in a later cell.
obj_cols_index = [X_train.columns.get_loc(col) for col in obj_columns]

gender: ['Female' 'Other' 'Male']
country: ['CA' 'DE' 'AU' 'US' 'UK' 'IN' 'FR' 'PK']
subscription_type: ['Free' 'Family' 'Premium' 'Student']
device_type: ['Desktop' 'Web' 'Mobile']


In [7]:
# Numeric feature columns of the training split (64-bit integer and float
# dtypes only); displayed as the cell output.
numeric_dtypes = ['int64', 'float64']
num_columns = X_train.select_dtypes(include=numeric_dtypes).columns
num_columns

Index(['age', 'listening_time', 'songs_played_per_day', 'skip_rate',
       'ads_listened_per_week', 'offline_listening'],
      dtype='object')

In [8]:
# Data augmentation on the training set only: oversample the minority class
# with SMOTENC, which is told which column positions hold categorical features.
# The test split is left untouched.
# NOTE(review): resampling runs on the raw, unscaled numeric features (scaling
# is applied in a later cell) -- confirm this ordering is intended.
smotenc_module = SMOTENC(
    categorical_features=obj_cols_index,
    sampling_strategy='minority',
    random_state=42,
)
X_train_res, y_train_res = smotenc_module.fit_resample(X_train, y_train)

# Sanity check: class counts before and after resampling
for label, target in (("Before SMOTENC: ", y_train), ("After SMOTENC: ", y_train_res)):
    print(label, target.value_counts())

Before SMOTENC:  is_churned
0    4743
1    1657
Name: count, dtype: int64
After SMOTENC:  is_churned
0    4743
1    4743
Name: count, dtype: int64


In [9]:
# Defining a data transformation pipeline:
#   - categorical columns are one-hot encoded
#   - numeric columns are min-max scaled
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category that appears at transform time
        # but was not seen during fit is encoded as all zeros instead of
        # raising an error.
        ('cat', OneHotEncoder(handle_unknown='ignore'), obj_columns),
        ('num', MinMaxScaler(), num_columns),
    ],
    # Always return a dense array. With the default threshold (0.3) the output
    # silently becomes a scipy sparse matrix whenever the stacked result is
    # sparse enough, which breaks rebuilding a DataFrame with named columns
    # from the transformed data.
    sparse_threshold=0,
)

In [10]:
# Applying transformations pipeline
# X_test is overwritten at the end of this cell with the encoded frame (same
# index labels), so re-select the raw held-out rows from X by index label
# instead of transforming X_test directly. This keeps the cell re-runnable:
# feeding the already-encoded X_test back into preprocessor.transform would
# fail because the raw columns are gone.
# (Assumes the row index of X is unique -- verify if the dataset changes.)
X_test_raw = X.loc[X_test.index]
X_train_transformed = preprocessor.fit_transform(X_train_res)
X_test_transformed = preprocessor.transform(X_test_raw)

# Rebuild train and test dataframes: one-hot feature names first, then the
# numeric columns, matching the transformer order in the preprocessor.
encoded_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(obj_columns)
all_features = list(encoded_columns) + list(num_columns)
# NOTE: X_train / X_test are rebound to the transformed frames from here on;
# the raw training features remain available as X_train_res (resampled) and
# the raw test features as X_test_raw.
X_train = pd.DataFrame(X_train_transformed, columns=all_features, index=X_train_res.index)
X_test = pd.DataFrame(X_test_transformed, columns=all_features, index=X_test.index)

In [13]:
# Saving transformed dataframes
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
SPLITS_DIR = Path('../../data/splits')
SPLITS_DIR.mkdir(parents=True, exist_ok=True)

# Training split: transformed features plus the resampled labels.
X_train.to_csv(SPLITS_DIR / 'X_train.csv', index=True)
y_train_res.to_csv(SPLITS_DIR / 'y_train.csv', index=True)

# Test split: transformed features plus the original held-out labels.
X_test.to_csv(SPLITS_DIR / 'X_test.csv', index=True)
y_test.to_csv(SPLITS_DIR / 'y_test.csv', index=True)