## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler

## Read data

In [2]:
# Load the interim Telco customer-churn table; the path is relative to the
# directory the notebook is run from.
interim_csv_path = "../data/interim/Telco-Customer-Churn_int.csv"
df = pd.read_csv(interim_csv_path)

## Data cleansing and recoding values

In [3]:
# Clean TotalCharges
# Missing charges are stored as blank strings. Match any empty or
# whitespace-only entry (not just a single space) so that a stray "" or "  "
# cannot reach the float conversion below and raise.
blank_entry = r"^\s*$"
df["TotalCharges"] = df["TotalCharges"].replace(blank_entry, np.nan, regex=True)

# Drop every row that has a missing value in any column, then renumber the
# index 0..n-1 so later positional/index-based joins line up.
df = df.dropna(axis=0).reset_index(drop=True)

# With the blanks gone the column can be parsed as numeric; genuinely
# malformed values still raise here instead of being silently discarded.
df["TotalCharges"] = df["TotalCharges"].astype(float)

In [4]:
# Collapse 'No internet service' into plain 'No' for the internet add-on
# columns, so each of them carries the same two labels.
replace_cols = [
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]
df[replace_cols] = df[replace_cols].replace({'No internet service': 'No'})

# SeniorCitizen is stored as 1/0; recode it to the "Yes"/"No" labels.
senior_labels = {1: "Yes", 0: "No"}
df["SeniorCitizen"] = df["SeniorCitizen"].replace(senior_labels)

## Create useful Python variables

In [5]:
# Column groups used by the encoding and scaling cells below.
id_col = ['customerID']
target_col = ["Churn"]

# Number of distinct values per column, computed once and reused.
distinct_counts = df.nunique()

# Columns with fewer than 5 distinct values are treated as categorical;
# the target is excluded from this group.
low_cardinality = distinct_counts[distinct_counts < 5].index.tolist()
categorical_cols = [name for name in low_cardinality if name not in target_col]

# Everything that is not categorical, the target, or the identifier.
non_numerical = categorical_cols + target_col + id_col
numerical_cols = [name for name in df.columns if name not in non_numerical]

# Exactly-two-valued columns. This is taken over every column, so it is not
# filtered by target_col: the target is included when it has two values.
binary_cols = distinct_counts[distinct_counts == 2].index.tolist()

# Categorical columns that are not binary.
multi_cols = [name for name in categorical_cols if name not in binary_cols]

## Label encoding

In [6]:
# Integer-encode every two-valued column.
# A fresh encoder is fitted per column and kept in `label_encoders`, keyed by
# column name, so each column's label -> code mapping (encoder.classes_) stays
# available for inspection or inverse_transform. Re-fitting one shared encoder
# would retain only the mapping of the last column processed.
label_encoders = {}
for col in binary_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# `le` is left bound to the encoder of the last column, as before.

## Create dummy variables

In [7]:
# One-hot encode the multi-valued categorical columns; each listed column is
# replaced by one indicator column per distinct value.
df = pd.get_dummies(df, columns=multi_cols)

## Scale numerical columns

In [8]:
# Fit a RobustScaler on the numerical columns and wrap the resulting array in
# a DataFrame with the same column names.
scl = RobustScaler()
scaled = pd.DataFrame(
    scl.fit_transform(df[numerical_cols]),
    columns=numerical_cols,
    # Carry over df's index explicitly: the scaled values are merged back on
    # index later, and a default 0..n-1 index would only line up by accident
    # (i.e. only while df itself still has a fresh RangeIndex).
    index=df.index,
)

In [11]:
# Keep a snapshot of the frame as it is before the numerical columns are
# swapped for their scaled versions.
df_original = df.copy()

# Replace the raw numerical columns with the scaled ones, aligned on index.
# The scaled columns end up at the right-hand side of the frame.
df = (
    df.drop(columns=numerical_cols)
      .merge(scaled, how="left", left_index=True, right_index=True)
)