In [28]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
df = pd.read_csv("../data/cleaned/cleaned.csv")

In [3]:
numerical_features = df.select_dtypes("number").columns
categorical_features = df.select_dtypes("object").columns

In [4]:
df.head()

Unnamed: 0,age,gender,tenure,usage_frequency,support_calls,payment_delay,subscription_type,contract_length,total_spend,last_interaction,churn
0,30,Female,39,14,5,18,Standard,Annual,932.0,17,1
1,65,Female,49,1,10,8,Basic,Monthly,557.0,6,1
2,55,Female,14,4,6,18,Basic,Quarterly,185.0,3,1
3,58,Male,38,21,7,7,Standard,Monthly,396.0,29,1
4,23,Male,32,20,5,8,Basic,Monthly,617.0,20,1


### ***Label Encoding***

In [5]:
# for col in categorical_features:
#     label_encoder = LabelEncoder()
#     df[col] = label_encoder.fit_transform(df[col])

In [6]:
df.head()

Unnamed: 0,age,gender,tenure,usage_frequency,support_calls,payment_delay,subscription_type,contract_length,total_spend,last_interaction,churn
0,30,Female,39,14,5,18,Standard,Annual,932.0,17,1
1,65,Female,49,1,10,8,Basic,Monthly,557.0,6,1
2,55,Female,14,4,6,18,Basic,Quarterly,185.0,3,1
3,58,Male,38,21,7,7,Standard,Monthly,396.0,29,1
4,23,Male,32,20,5,8,Basic,Monthly,617.0,20,1


In [7]:
df["subscription_type"].value_counts()

subscription_type
Standard    170630
Premium     170099
Basic       164477
Name: count, dtype: int64

In [8]:
df["contract_length"].value_counts()

contract_length
Annual       198608
Quarterly    197364
Monthly      109234
Name: count, dtype: int64

In [9]:
df["gender"].value_counts()

gender
Male      280273
Female    224933
Name: count, dtype: int64

In [30]:
def transform_data() -> bool:
    categorical_pipeline = Pipeline(
        steps=[("encoder", OrdinalEncoder()), ("scaler", StandardScaler())]
    )

    numerical_pipeline = Pipeline(steps=[("scaler", StandardScaler())])

    preprocessor = ColumnTransformer(
        transformers=[
            ("numerical", numerical_pipeline, numerical_features),
            ("categorical", categorical_pipeline, categorical_features),
        ]
    )

    preprocessed_df = preprocessor.fit_transform(df)
    return preprocessed_df

In [31]:
new_data = transform_data()

In [33]:
new_df = pd.DataFrame(new_data, columns=df.columns)

In [34]:
new_df.head()

Unnamed: 0,age,gender,tenure,usage_frequency,support_calls,payment_delay,subscription_type,contract_length,total_spend,last_interaction,churn
0,-0.765883,0.443776,-0.198951,0.372314,0.532844,1.271517,0.277572,0.895066,-1.116257,1.213024,-1.126764
1,1.996425,1.023907,-1.707192,1.967923,-0.650423,-0.257105,-1.000267,0.895066,-1.116257,-1.242935,0.002781
2,1.207194,-1.006554,-1.359137,0.691436,0.532844,-1.773498,-1.348769,0.895066,-1.116257,-1.242935,1.132327
3,1.443963,0.385762,0.613178,1.010558,-0.76875,-0.913393,1.671579,0.895066,0.895851,1.213024,0.002781
4,-1.318345,0.037683,0.49716,0.372314,-0.650423,-0.012526,0.626074,0.895066,0.895851,-1.242935,0.002781
