In [1]:
# Feature Engineering: imports
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Notebook-wide settings.
# NOTE(review): this silences every warning category for the whole kernel,
# including pandas deprecation/future warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Let wide frames display up to 200 columns instead of being elided.
pd.set_option("display.max_columns", 200)

In [2]:
# Load the raw telecom churn export; the path is relative to the notebook's
# working directory.
raw_csv = "../data/raw/telecom_customer_churn.csv"
df = pd.read_csv(raw_csv)

# Keep an independent snapshot of the frame exactly as loaded, before any of
# the cleaning steps below touch `df`.
df_copy = df.copy(deep=True)

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,Internet Service,Internet Type,Avg Monthly GB Download,Online Security,Online Backup,Device Protection Plan,Premium Tech Support,Streaming TV,Streaming Movies,Streaming Music,Unlimited Data,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,9,,Yes,42.39,No,Yes,Cable,16.0,No,Yes,No,Yes,Yes,No,No,Yes,One Year,Yes,Credit Card,65.6,593.3,0.0,0,381.51,974.81,Stayed,,
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,9,,Yes,10.69,Yes,Yes,Cable,10.0,No,No,No,No,No,Yes,Yes,No,Month-to-Month,No,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,Stayed,,
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,4,Offer E,Yes,33.65,No,Yes,Fiber Optic,30.0,No,No,Yes,No,No,No,No,Yes,Month-to-Month,Yes,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,13,Offer D,Yes,27.82,No,Yes,Fiber Optic,4.0,No,Yes,Yes,No,Yes,Yes,No,Yes,Month-to-Month,Yes,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,3,,Yes,7.38,No,Yes,Fiber Optic,11.0,No,No,No,Yes,Yes,No,No,Yes,Month-to-Month,Yes,Credit Card,83.9,267.4,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability


In [4]:
def _snake_case(label):
    """Lower-case a column label, trim surrounding whitespace, and turn spaces into underscores."""
    return label.lower().strip().replace(' ', '_')


# Normalise every column label, e.g. 'Total Charges' -> 'total_charges'.
df = df.rename(columns=_snake_case)

In [5]:
# Inspect dtypes and non-null counts after the column rename.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   customer_id                        7043 non-null   object 
 1   gender                             7043 non-null   object 
 2   age                                7043 non-null   int64  
 3   married                            7043 non-null   object 
 4   number_of_dependents               7043 non-null   int64  
 5   city                               7043 non-null   object 
 6   zip_code                           7043 non-null   int64  
 7   latitude                           7043 non-null   float64
 8   longitude                          7043 non-null   float64
 9   number_of_referrals                7043 non-null   int64  
 10  tenure_in_months                   7043 non-null   int64  
 11  offer                              3166 non-null   objec

In [6]:
# List the normalised column labels for reference when selecting columns below.
df.columns

Index(['customer_id', 'gender', 'age', 'married', 'number_of_dependents',
       'city', 'zip_code', 'latitude', 'longitude', 'number_of_referrals',
       'tenure_in_months', 'offer', 'phone_service',
       'avg_monthly_long_distance_charges', 'multiple_lines',
       'internet_service', 'internet_type', 'avg_monthly_gb_download',
       'online_security', 'online_backup', 'device_protection_plan',
       'premium_tech_support', 'streaming_tv', 'streaming_movies',
       'streaming_music', 'unlimited_data', 'contract', 'paperless_billing',
       'payment_method', 'monthly_charge', 'total_charges', 'total_refunds',
       'total_extra_data_charges', 'total_long_distance_charges',
       'total_revenue', 'customer_status', 'churn_category', 'churn_reason'],
      dtype='object')

In [7]:
# Columns removed from the working frame by the drop cell below.
cols_to_remove = [
    'customer_id',
    'zip_code',
    'latitude',
    'longitude',
    'total_charges',
]

# Churn explanation columns; in the preview rows they are only populated for
# customers whose status is 'Churned'.
# NOTE(review): `reason` is not referenced by any of the cells shown here —
# confirm whether these columns are meant to be dropped as well.
reason = [
    'churn_category',
    'churn_reason',
]

In [8]:
# NOTE(review): disabled cell, kept for reference only. The labels below are
# the original Title Case headers; after the snake_case rename earlier in this
# notebook the frame uses 'number_of_dependents', 'number_of_referrals', etc.,
# so these names no longer match any column. Update the labels before
# re-enabling, or delete the cell.
# df.drop(columns=['Number of Dependents',
#                   'Number of Referrals',
#                   'Total Refunds',
#                   'Total Extra Data Charges',
#                   'Total Long Distance Charges'],inplace=True)

In [9]:
# Remove the columns listed in `cols_to_remove`, rebinding `df` to the
# resulting frame instead of mutating it in place.
df = df.drop(columns=cols_to_remove)

In [10]:
# Re-check the schema after the column drop (33 columns remain in the output below).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 33 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   gender                             7043 non-null   object 
 1   age                                7043 non-null   int64  
 2   married                            7043 non-null   object 
 3   number_of_dependents               7043 non-null   int64  
 4   city                               7043 non-null   object 
 5   number_of_referrals                7043 non-null   int64  
 6   tenure_in_months                   7043 non-null   int64  
 7   offer                              3166 non-null   object 
 8   phone_service                      7043 non-null   object 
 9   avg_monthly_long_distance_charges  6361 non-null   float64
 10  multiple_lines                     6361 non-null   object 
 11  internet_service                   7043 non-null   objec

In [11]:
# Count missing values per column before any filling is applied.
df.isna().sum()

gender                                  0
age                                     0
married                                 0
number_of_dependents                    0
city                                    0
number_of_referrals                     0
tenure_in_months                        0
offer                                3877
phone_service                           0
avg_monthly_long_distance_charges     682
multiple_lines                        682
internet_service                        0
internet_type                        1526
avg_monthly_gb_download              1526
online_security                      1526
online_backup                        1526
device_protection_plan               1526
premium_tech_support                 1526
streaming_tv                         1526
streaming_movies                     1526
streaming_music                      1526
unlimited_data                       1526
contract                                0
paperless_billing                 

In [12]:
# Drop fully identical rows, keeping the first occurrence (pandas default).
# This runs after `customer_id` has been removed, so two customers whose
# remaining fields all match would be treated as duplicates.
df = df.drop_duplicates()

In [13]:
# Summary statistics for the numeric columns.
# NOTE(review): the output below reports a minimum `monthly_charge` of -10.0;
# negative charges are not handled anywhere in the visible cells — confirm
# whether they are valid or need cleaning.
df.describe()

Unnamed: 0,age,number_of_dependents,number_of_referrals,tenure_in_months,avg_monthly_long_distance_charges,avg_monthly_gb_download,monthly_charge,total_refunds,total_extra_data_charges,total_long_distance_charges,total_revenue
count,7043.0,7043.0,7043.0,7043.0,6361.0,5517.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,46.509726,0.468692,1.951867,32.386767,25.420517,26.189958,63.596131,1.962182,6.860713,749.099262,3034.379056
std,16.750352,0.962802,3.001199,24.542061,14.200374,19.586585,31.204743,7.902614,25.104978,846.660055,2865.204542
min,19.0,0.0,0.0,1.0,1.01,2.0,-10.0,0.0,0.0,0.0,21.36
25%,32.0,0.0,0.0,9.0,13.05,13.0,30.4,0.0,0.0,70.545,605.61
50%,46.0,0.0,0.0,29.0,25.69,21.0,70.05,0.0,0.0,401.44,2108.64
75%,60.0,0.0,3.0,55.0,37.68,30.0,89.75,0.0,0.0,1191.1,4801.145
max,80.0,9.0,11.0,72.0,49.99,85.0,118.75,49.79,150.0,3564.72,11979.34


In [14]:
# Fill missing categorical values with explicit labels, using vectorised
# `fillna` rather than an element-wise `apply` with a lambda.

# Customers without an offer get their own category.
df['offer'] = df['offer'].fillna('no_offer')

# `internet_type` and the internet add-on columns all show 1526 nulls in the
# null summary above; each null becomes the 'no_internet_service' label.
# NOTE(review): presumably these are the customers without internet service —
# confirm against `internet_service`.
column_name = ['online_security', 'online_backup', 'device_protection_plan', 'premium_tech_support',
               'streaming_tv', 'streaming_movies', 'streaming_music', 'unlimited_data']
internet_columns = ['internet_type'] + column_name
df[internet_columns] = df[internet_columns].fillna('no_internet_service')

# NOTE(review): `multiple_lines` also has 682 nulls in the null summary above
# and is not filled here or in the later visible cells — confirm it is handled
# downstream.

In [15]:
# Re-count missing values per column to verify the categorical fills.
df.isna().sum()

gender                                  0
age                                     0
married                                 0
number_of_dependents                    0
city                                    0
number_of_referrals                     0
tenure_in_months                        0
offer                                   0
phone_service                           0
avg_monthly_long_distance_charges     682
multiple_lines                        682
internet_service                        0
internet_type                           0
avg_monthly_gb_download              1526
online_security                         0
online_backup                           0
device_protection_plan                  0
premium_tech_support                    0
streaming_tv                            0
streaming_movies                        0
streaming_music                         0
unlimited_data                          0
contract                                0
paperless_billing                 

In [16]:
# Replace missing usage averages with 0, using vectorised `fillna` rather than
# an element-wise `apply` with a lambda.
# NOTE(review): presumably a null here means the customer does not have the
# corresponding phone / internet service — confirm.
column_name = ['avg_monthly_long_distance_charges', 'avg_monthly_gb_download']
df[column_name] = df[column_name].fillna(0)

In [17]:
# Persist the cleaned frame for the next stage.
output_path = Path("../data/processed/fe_data.csv")

# Create the target directory first: `to_csv` does not create missing parent
# directories, so a fresh checkout without `data/processed` would fail here.
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the row index is written as an unnamed first column (pandas
# default), which shows up as 'Unnamed: 0' when the file is read back without
# `index_col=0`. Kept as-is so existing readers are unaffected — consider
# `index=False` once the consumers are checked.
df.to_csv(output_path)