# Fetching Telco Customer Churn Dataset from Kaggle

In [1]:
import warnings
import joblib
from datetime import datetime
from pathlib import Path
import subprocess
import os
import zipfile

import pandas as pd

%load_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Various settings
warnings.filterwarnings("ignore")
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_colwidth", 40)
pd.set_option("display.precision", 4)
pd.set_option("display.max_columns", None)

## Experimenting with Python Tools

In [2]:
# Let's define useful paths

In [3]:
from churn_detection.paths import EXTERNAL_DATA_DIR

In [4]:
CURRENT_DIR = Path().cwd()

In [5]:
# Let's download and unzip the dataset using Kaggle package

In [6]:
subprocess.run(['kaggle', 'datasets', 'download', '-d', 'blastchar/telco-customer-churn'])

CompletedProcess(args=['kaggle', 'datasets', 'download', '-d', 'blastchar/telco-customer-churn'], returncode=0)

In [7]:
ZIP_FILE = Path("telco-customer-churn.zip")

In [8]:
with zipfile.ZipFile(ZIP_FILE, 'r') as zip_ref:
    zip_ref.extractall(CURRENT_DIR)

In [9]:
CSV_FILE = Path("WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [10]:
# Let's read the extracted CSV file
data = pd.read_csv(CSV_FILE)  
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [11]:
# Let's convert it to feather format into data directory
data.to_feather(EXTERNAL_DATA_DIR / "customer_churn.feather")

In [12]:
# Let's remove the downloaded files
subprocess.run(['cmd', '/c', 'del', str(ZIP_FILE)], check=True)

CompletedProcess(args=['cmd', '/c', 'del', 'telco-customer-churn.zip'], returncode=0)

In [13]:
subprocess.run(['cmd', '/c', 'del', str(CSV_FILE)], check=True)

CompletedProcess(args=['cmd', '/c', 'del', 'WA_Fn-UseC_-Telco-Customer-Churn.csv'], returncode=0)

## Refactoring as Modular Code

In [14]:
from churn_detection import data

In [15]:
kaggle_target_dataset = "blastchar/telco-customer-churn"
raw_dataset = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

In [16]:
churn_data = data.fetch_batch_data(
    kaggle_target_dataset, 
    cwd_path=CURRENT_DIR, 
    zip_file=ZIP_FILE, 
    raw_data=raw_dataset
)

In [17]:
churn_data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [18]:
data.save_batch_data(
    churn_data, 
    target_path=EXTERNAL_DATA_DIR, 
    zip_file=ZIP_FILE, 
    raw_data=raw_dataset
)