## Churn - Import


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown
plt.style.use("seaborn-darkgrid")
pd.set_option('display.max_columns', None)  

import sys, os, yaml

DATASET = "Churn"
COLAB = 'google.colab' in sys.modules
DEBUG = False
SEED = 666

##Load Dataset

In [4]:
if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
    d = "/content/gdrive/MyDrive/datasets"
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  if not os.path.isdir(ROOT): os.makedirs(ROOT)
else:
  ROOT = "./"

def makedirs(d):
  if COLAB:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  else:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

In [5]:
BASE_URL = "https://SETU-DataMining2.github.io/live/resources/churn"

for filename in ['data.csv','datasheet.yaml']:
    source = f"{BASE_URL}/{filename}"
    target = f"{ROOT}/orig/{filename}"

    if not os.path.isfile(target):
        print (f"Downloading remote file {filename}", sep="")
        import urllib.request
        urllib.request.urlretrieve(source, target)
    else:
        print(f"Using local copy of {filename}")

Using local copy of data.csv
Using local copy of datasheet.yaml


In [6]:
df = pd.read_csv(f"{ROOT}orig/data.csv")
print(df.shape)
df.head

(7043, 21)


<bound method NDFrame.head of       customerID  gender  SeniorCitizen Partner Dependents  tenure  \
0              1  Female              0     Yes         No       1   
1              2    Male              0      No         No      34   
2              3    Male              0      No         No       2   
3              4    Male              0      No         No      45   
4              5  Female              0      No         No       2   
...          ...     ...            ...     ...        ...     ...   
7038        7039    Male              0     Yes        Yes      24   
7039        7040  Female              0     Yes        Yes      72   
7040        7041  Female              0     Yes        Yes      11   
7041        7042    Male              1     Yes         No       4   
7042        7043    Male              0      No         No      66   

     PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup  \
0              No            No             DSL   

In [7]:
yaml.safe_load(open(f"{ROOT}/orig/datasheet.yaml").read().replace("\t", " "))

{'customerID': '1 to 7044',
 'gender': '0 - female, 1 - male',
 'SeniorCitizen': '0 - no, 1 - yes',
 'Partner': '0 - no, 1 - yes',
 'Dependents': '0 - no, 1 - yes',
 'tenure': 'numerical',
 'PhoneService': '0 - no, 1 - yes',
 'MultipleLines': '0 - no, 1 - yes',
 'InternetService Factor': 'DSL, Fiber optic, No',
 'OnlineSecurity': '0 - no, 1 - yes',
 'OnlineBackup': '0 - no, 1 - yes',
 'DeviceProtection': '0 - no, 1 - yes',
 'TechSupport': '0 - no, 1 - yes',
 'StreamingTV': '0 - no, 1 - yes',
 'StreamingMovies': '0 - no, 1 - yes',
 'Contract Factor': 'Month-to-month, One year, Two year',
 'PaperlessBilling': '0 - no, 1 - yes',
 'PaymentMethod Factor': 'Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic)',
 'MonthlyCharges': 'numerical',
 'TotalCharges': 'numerical',
 'Churn': '0 - no, 1 - yes'}

##Cleaning

In [8]:
df.columns = [c[0].upper()+c[1:] for c in df.columns]

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerID        7043 non-null   int64  
 1   Gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   Tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [10]:
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')
df.TotalCharges.isna().sum()

11

In [11]:
if df.SeniorCitizen.dtype == int:
  df.SeniorCitizen = df.SeniorCitizen.map({0:"No", 1:"Yes"})
df.SeniorCitizen.head(2)

0    No
1    No
Name: SeniorCitizen, dtype: object

In [12]:
for c in df.select_dtypes("object").columns:
  df[c] = pd.Categorical(df[c])

In [13]:
df.Contract = pd.Categorical(df.Contract, categories = ['Month-to-month', 'One year', 'Two year'], ordered=True)

##Remove NA

Fix 'TotalCharges' to be numeric. Results  missing values. Since number is small, just drop these rows.

In [14]:
df.dropna(inplace=True)
print(df.shape)

(7032, 21)


##Save Dataset

In [15]:
df.to_pickle(f"{ROOT}/data/data.pkl")