In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import os

In [2]:
df = pd.read_csv("spotify_churn_dataset.csv")

print("=== DATA AWAL ===")
print("Jumlah baris dan kolom:", df.shape)
print(df.head())

=== DATA AWAL ===
Jumlah baris dan kolom: (8000, 12)
   user_id  gender  age country subscription_type  listening_time  \
0        1  Female   54      CA              Free              26   
1        2   Other   33      DE            Family             141   
2        3    Male   38      AU           Premium             199   
3        4  Female   22      CA           Student              36   
4        5   Other   29      US            Family             250   

   songs_played_per_day  skip_rate device_type  ads_listened_per_week  \
0                    23       0.20     Desktop                     31   
1                    62       0.34         Web                      0   
2                    38       0.04      Mobile                      0   
3                     2       0.31      Mobile                      0   
4                    57       0.36      Mobile                      0   

   offline_listening  is_churned  
0                  0           1  
1                  1   

In [3]:
print("\n=== CEK MISSING VALUE ===")
print(df.isnull().sum())


=== CEK MISSING VALUE ===
user_id                  0
gender                   0
age                      0
country                  0
subscription_type        0
listening_time           0
songs_played_per_day     0
skip_rate                0
device_type              0
ads_listened_per_week    0
offline_listening        0
is_churned               0
dtype: int64


In [6]:
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(exclude=[np.number]).columns
for col in num_cols:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].median(), inplace=True)

for col in cat_cols:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].mode()[0], inplace=True)

In [7]:
print("\n=== DUPLICATE DATA ===")
print("Jumlah data duplikat:", df.duplicated().sum())
df.drop_duplicates(inplace=True)


=== DUPLICATE DATA ===
Jumlah data duplikat: 0


In [8]:
print("\n=== HANDLING OUTLIER ===")
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = np.where(df[col] < lower, lower, df[col])
    df[col] = np.where(df[col] > upper, upper, df[col])

print("Selesai menangani outlier.")


=== HANDLING OUTLIER ===
Selesai menangani outlier.


In [9]:
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

print("\n=== DATASET SETELAH STANDARDISASI ===")
print(df[num_cols].head())


=== DATASET SETELAH STANDARDISASI ===
    user_id       age  listening_time  songs_played_per_day  skip_rate  \
0 -1.731834  1.282452       -1.524434             -0.953574  -0.576827   
1 -1.731401 -0.365956       -0.155555              0.417349   0.229702   
2 -1.730968  0.026522        0.534836             -0.426296  -1.498575   
3 -1.730535 -1.229408       -1.405401             -1.691763   0.056875   
4 -1.730102 -0.679939        1.141904              0.241590   0.344921   

   ads_listened_per_week  offline_listening  is_churned  
0               1.821693          -1.721720    1.692001  
1              -0.572735           0.580814   -0.591016  
2              -0.572735           0.580814    1.692001  
3              -0.572735           0.580814   -0.591016  
4              -0.572735           0.580814    1.692001  


In [10]:
label_encoders = {}

for col in cat_cols:
    if df[col].nunique() <= 10:
        df = pd.get_dummies(df, columns=[col], drop_first=True)
    else:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

print("\n=== DATASET SETELAH ENCODING ===")
print(df.head())


=== DATASET SETELAH ENCODING ===
    user_id       age  listening_time  songs_played_per_day  skip_rate  \
0 -1.731834  1.282452       -1.524434             -0.953574  -0.576827   
1 -1.731401 -0.365956       -0.155555              0.417349   0.229702   
2 -1.730968  0.026522        0.534836             -0.426296  -1.498575   
3 -1.730535 -1.229408       -1.405401             -1.691763   0.056875   
4 -1.730102 -0.679939        1.141904              0.241590   0.344921   

   ads_listened_per_week  offline_listening  is_churned  gender_Male  \
0               1.821693          -1.721720    1.692001        False   
1              -0.572735           0.580814   -0.591016        False   
2              -0.572735           0.580814    1.692001         True   
3              -0.572735           0.580814   -0.591016        False   
4              -0.572735           0.580814    1.692001        False   

   gender_Other  ...  country_FR  country_IN  country_PK  country_UK  \
0         False 

In [11]:
if {'songs_played_per_day', 'listening_time'}.issubset(df.columns):
    df['efficiency_ratio'] = df['songs_played_per_day'] / (df['listening_time'] + 1)
    print("\nFitur baru 'efficiency_ratio' berhasil ditambahkan.")
else:
    # Jika kolom tersebut tidak ada, buat fitur interaksi sederhana
    df['interaction_feat'] = df[num_cols[0]] * df[num_cols[1]]
    print("\nFitur baru 'interaction_feat' berhasil ditambahkan.")


Fitur baru 'efficiency_ratio' berhasil ditambahkan.


In [12]:
target_col = 'is_churned' if 'is_churned' in df.columns else df.columns[-1]

In [13]:
X = df.drop(columns=[target_col])
y = df[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\n=== HASIL SPLIT ===")
print("Data Train:", X_train.shape)
print("Data Test :", X_test.shape)



=== HASIL SPLIT ===
Data Train: (6400, 22)
Data Test : (1600, 22)


In [14]:
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

In [15]:
output_dir = "processed_data"
os.makedirs(output_dir, exist_ok=True)

train_data.to_csv(f"{output_dir}/train_processed.csv", index=False)
test_data.to_csv(f"{output_dir}/test_processed.csv", index=False)
df.to_csv(f"{output_dir}/full_processed.csv", index=False)

print("\nFile hasil preprocessing tersimpan di folder:", output_dir)
print("Berikut file yang disimpan:")
print("- train_processed.csv")
print("- test_processed.csv")
print("- full_processed.csv")


File hasil preprocessing tersimpan di folder: processed_data
Berikut file yang disimpan:
- train_processed.csv
- test_processed.csv
- full_processed.csv
