In [1]:
import os

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Dataset location.
# The directory defaults to the original local path but can be overridden with
# the TELCO_DATA_DIR environment variable, so the notebook can run on another
# machine without editing this cell.
# os.path.join(<dir>, "") guarantees a trailing path separator, which keeps the
# plain `datadir + filename` concatenation valid wherever it is used.
datadir = os.path.join(
    os.environ.get("TELCO_DATA_DIR", "/home/ababil/BUET/4-2/CSE472/Datasets/"),
    "",
)
filename = "WA_Fn-UseC_-Telco-Customer-Churn.csv"


# Load the raw CSV into the working frame
df = pd.read_csv(datadir+filename)
# df.info()

# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Zero-fill missing values in every int64 / float64 column
numeric_columns = [
    name for name in df.columns.values
    if df[name].dtype in ('int64', 'float64')
]
for name in numeric_columns:
    df[name] = df[name].fillna(0)

# customerID is removed from the frame before any encoding happens
df.drop(columns=['customerID'], inplace=True)

# Encode two-valued object columns as 0/1
for name in df.columns.values:
    if df[name].dtype != 'object':
        continue
    distinct = df[name].unique()
    if len(distinct) != 2:
        continue
    first_value, second_value = distinct
    # When the first value encountered is 'No' it becomes 0 and the other 1;
    # otherwise the first value encountered becomes 1 and the other 0.
    first_code = 0 if first_value == 'No' else 1
    df[name] = df[name].map({first_value: first_code, second_value: 1 - first_code})
            
# Drop rows that hold an outlier in any float64 column.
# A value is treated as an outlier when its absolute z-score, computed within
# its own column, exceeds `threshold`.
threshold = 3
for col in df.columns.values:
    if df[col].dtype == 'float64':
        # Score the column currently being inspected (the loop variable),
        # not one fixed column on every pass.
        z = np.abs(stats.zscore(df[col].to_numpy()))
        # np.where yields row *positions*, while DataFrame.drop expects index
        # *labels*. Translate through df.index so the correct rows are removed
        # even after an earlier pass has left gaps in the index.
        outliers = np.where(z > threshold)
        if len(outliers[0]) > 0:
            df.drop(df.index[outliers[0]], inplace=True)

# The last column is the label; everything before it is a feature
feature_columns = df.columns.values[:-1]
label_column = df.columns.values[-1]
features = df[feature_columns]
labels = df[label_column]

# One-hot encode the remaining non-numeric feature columns
encoded_features = pd.get_dummies(features)
# print(encoded_features.head())

# Hold out 20% as the test set, then 20% of the remainder as the validation set
split_options = dict(test_size=0.2, random_state=77)
X_train, X_test, y_train, y_test = train_test_split(
    encoded_features, labels, **split_options
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, **split_options
)


In [2]:
# Display the training feature matrix produced by the second split above
X_train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
544,0,0,0,0,13,0,1,45.55,597.00,0,...,1,0,0,1,0,0,0,1,0,0
1843,1,0,1,1,71,1,0,19.70,1415.85,1,...,0,1,0,0,0,1,0,1,0,0
3368,1,0,0,0,46,1,1,94.15,4408.45,0,...,0,0,1,0,1,0,0,0,1,0
5035,0,0,0,0,20,1,0,78.80,1641.30,1,...,1,0,0,1,0,0,0,0,1,0
2750,0,0,0,0,3,1,1,19.45,69.25,1,...,0,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6387,1,0,1,0,35,1,0,56.85,1861.10,1,...,1,0,0,1,0,0,0,0,1,0
4246,0,0,1,1,68,1,1,110.20,7467.50,0,...,0,0,1,0,0,1,0,0,1,0
6499,0,0,0,0,1,1,0,20.30,20.30,1,...,0,1,0,1,0,0,0,0,0,1
5631,0,0,1,0,3,0,0,25.00,78.25,0,...,1,0,0,1,0,0,0,0,0,1


In [3]:
# Standardise every column of the training features
scaler = StandardScaler()

# Fit on the training split, then transform that same split
X_train_scaled = scaler.fit(X_train).transform(X_train)

# Wrap the scaled array in a frame again, restoring the column names
# (the frame gets a fresh default index, not X_train's index)
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, columns=list(X_train.columns.values))
X_train_scaled_df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-1.008243,-0.443219,-0.948728,-0.649090,-0.779915,-2.973317,0.833665,-0.647670,-0.740576,-0.953806,...,1.224066,-0.515073,-0.799691,0.883583,-0.505057,-0.554539,-0.526777,1.933707,-0.719381,-0.547696
1,0.991824,-0.443219,1.054043,1.540618,1.580098,0.336325,-1.199522,-1.512434,-0.377451,1.048431,...,-0.816950,1.941472,-0.799691,-1.131755,-0.505057,1.803299,-0.526777,1.933707,-0.719381,-0.547696
2,0.991824,-0.443219,-0.948728,-0.649090,0.562851,0.336325,0.833665,0.978153,0.949639,-0.953806,...,-0.816950,-0.515073,1.250483,-1.131755,1.979976,-0.554539,-0.526777,-0.517141,1.390085,-0.547696
3,-1.008243,-0.443219,-0.948728,-0.649090,-0.495086,0.336325,-1.199522,0.464647,-0.277473,1.048431,...,1.224066,-0.515073,-0.799691,0.883583,-0.505057,-0.554539,-0.526777,-0.517141,1.390085,-0.547696
4,-1.008243,-0.443219,-0.948728,-0.649090,-1.186814,0.336325,0.833665,-1.520797,-0.974610,1.048431,...,-0.816950,1.941472,-0.799691,-1.131755,1.979976,-0.554539,-0.526777,-0.517141,-0.719381,1.825830
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4502,0.991824,-0.443219,1.054043,-0.649090,0.115262,0.336325,-1.199522,-0.269650,-0.180001,1.048431,...,1.224066,-0.515073,-0.799691,0.883583,-0.505057,-0.554539,-0.526777,-0.517141,1.390085,-0.547696
4503,-1.008243,-0.443219,1.054043,1.540618,1.458029,0.336325,0.833665,1.515075,2.306196,-0.953806,...,-0.816950,-0.515073,1.250483,-1.131755,-0.505057,1.803299,-0.526777,-0.517141,1.390085,-0.547696
4504,-1.008243,-0.443219,-0.948728,-0.649090,-1.268194,0.336325,-1.199522,-1.492362,-0.996317,1.048431,...,-0.816950,1.941472,-0.799691,0.883583,-0.505057,-0.554539,-0.526777,-0.517141,-0.719381,1.825830
4505,-1.008243,-0.443219,1.054043,-0.649090,-1.186814,-2.973317,-1.199522,-1.335132,-0.970619,-0.953806,...,1.224066,-0.515073,-0.799691,0.883583,-0.505057,-0.554539,-0.526777,-0.517141,-0.719381,1.825830


In [4]:
# Standardise only the listed columns; every other column is left as it is
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
X_train_scaled_cols = X_train.copy()
# Note: this re-fits the existing `scaler` object on just these columns
X_train_scaled_cols[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_train_scaled_cols

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
544,0,0,0,0,-0.779915,0,1,-0.647670,-0.740576,0,...,1,0,0,1,0,0,0,1,0,0
1843,1,0,1,1,1.580098,1,0,-1.512434,-0.377451,1,...,0,1,0,0,0,1,0,1,0,0
3368,1,0,0,0,0.562851,1,1,0.978153,0.949639,0,...,0,0,1,0,1,0,0,0,1,0
5035,0,0,0,0,-0.495086,1,0,0.464647,-0.277473,1,...,1,0,0,1,0,0,0,0,1,0
2750,0,0,0,0,-1.186814,1,1,-1.520797,-0.974610,1,...,0,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6387,1,0,1,0,0.115262,1,0,-0.269650,-0.180001,1,...,1,0,0,1,0,0,0,0,1,0
4246,0,0,1,1,1.458029,1,1,1.515075,2.306196,0,...,0,0,1,0,0,1,0,0,1,0
6499,0,0,0,0,-1.268194,1,0,-1.492362,-0.996317,1,...,0,1,0,1,0,0,0,0,0,1
5631,0,0,1,0,-1.186814,0,0,-1.335132,-0.970619,0,...,1,0,0,1,0,0,0,0,0,1


In [3]:
# Reload the raw CSV and obtain the splits from preprocessAndSplit
# (defined in the preprocessingTelco module, which is not shown in this notebook)
from preprocessingTelco import preprocessAndSplit

df = pd.read_csv(f"{datadir}{filename}")
X_train, X_test, y_train, y_test = preprocessAndSplit(df)
X_train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
6056,1,0,0,0,1.034513,1,1,0.210999,0.884968,0,...,1,0,0,0,1,0,0,1,0,0
2949,1,0,0,0,0.953134,1,1,-0.149398,0.436795,1,...,1,0,0,0,1,0,0,1,0,0
5208,1,0,1,1,-0.755825,1,0,-0.175971,-0.666340,1,...,1,0,0,1,0,0,1,0,0,0
2483,0,1,1,1,0.546239,1,1,1.194201,0.991670,0,...,1,0,0,1,0,0,0,0,1,0
1023,1,1,1,0,0.505550,1,1,0.709243,0.691955,1,...,0,0,1,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,1,0,1,1,0.953134,1,0,0.802249,1.179854,1,...,0,0,1,0,1,0,1,0,0,0
3496,0,0,0,0,-0.308241,1,0,0.317291,-0.149847,1,...,1,0,0,1,0,0,1,0,0,0
5509,1,0,0,1,-1.122031,1,0,-0.622730,-0.912879,1,...,1,0,0,1,0,0,0,0,0,1
3183,0,1,1,0,-1.203410,0,1,-0.939945,-0.969317,0,...,1,0,0,1,0,0,0,0,0,1
