##  Milestone 1: EDA and Preprocessing

This notebook loads `data/raw/churn-bigml-80.csv` and `data/raw/churn-bigml-20.csv`, concatenates them into a single dataset, performs exploratory data analysis, and applies preprocessing steps. The cleaned data is saved to `data/processed/churn_cleaned.csv`.

### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

### Loading the Dataset

In [None]:
# Read the two raw splits and stack them row-wise into a single frame.
df1 = pd.read_csv(r"../data/raw/churn-bigml-80.csv")
df2 = pd.read_csv(r"../data/raw/churn-bigml-20.csv")
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of the second file repeat those of the first.
data = pd.concat([df1, df2], axis=0, ignore_index=True)

### Exploring the Dataset

In [None]:
# Quick size / completeness summary of the combined frame.
print("Shape of data:", data.shape)
# The value is the grand total of missing cells (sum over the per-column
# counts), so the label says "total" rather than "per column".
print("Total null values:", data.isnull().sum().sum())
print("Duplicate rows:", data.duplicated().sum())

In [None]:
# DataFrame.info() writes its report itself and returns None, so it is called
# on its own line instead of being passed to print() (which would append "None").
print("\nInfo:")
data.info()
print("\nDescribe: ")
display(data.describe())

In [None]:
# Show the column labels of the combined frame.
data.columns

In [None]:
# Preview the first rows of the combined frame.
print("\nHead:")
display(data.head())

In [None]:
# Share of each Churn value (normalize=True yields fractions rather than counts),
# with the resulting Series named 'ratio'.
data['Churn'].value_counts(normalize=True).rename('ratio')

### Data Cleaning

In [None]:
# Encode the Yes/No plan flags and the churn label as 1/0 integers.
# The explicit astype("int64") pins the dtype instead of relying on replace()
# to downcast the result implicitly (that implicit downcasting is deprecated
# in pandas), so these columns are reliably numeric for the later dtype-based
# column selection.
# NOTE(review): assumes the plan columns only contain "Yes"/"No" and Churn only
# True/False (or already 1/0); any other value makes astype raise — confirm.
plan_cols = ["International plan", "Voice mail plan"]
data[plan_cols] = data[plan_cols].replace({"Yes": 1, "No": 0}).astype("int64")
data["Churn"] = data["Churn"].replace({True: 1, False: 0}).astype("int64")

In [None]:
from us import states


def state_to_full(state):
    """Return the full state name for ``state``, or ``state`` itself if unknown.

    Parameters
    ----------
    state : str
        Value passed to ``us.states.lookup`` (the "State" column entries).

    Returns
    -------
    str
        ``name`` of the matched state object, or the input unchanged when the
        lookup returns nothing.
    """
    st = states.lookup(state)
    return st.name if st else state


data["State"] = data["State"].apply(state_to_full)
# Any "DC" value still present after the lookup is spelled out by hand.
# The result is assigned back rather than using replace(..., inplace=True) on
# the selected column: that chained in-place form operates on a temporary and
# does not update `data` when pandas Copy-on-Write is active.
data["State"] = data["State"].replace("DC", "District of Columbia")

In [None]:
# Split the column labels into numeric ones and everything else.
numerical_cols = list(data.select_dtypes(include=[np.number]).columns)
categorical_cols = [label for label in data.columns if label not in numerical_cols]

In [None]:
# Box plots of all numeric columns, used to spot outliers by eye.
px.box(data.loc[:, numerical_cols])

In [None]:
# (rows, columns) of the frame at this point, before the outlier filter below.
data.shape

### Data Quality Checks

In [None]:
# Drop every row that falls outside the 1.5 * IQR fences in any of the listed
# columns. Quartiles are computed per column on the frame as it is before any
# row is removed.
outliers_columns = ["Account length","Total day minutes","Total eve minutes","Total night minutes","Total eve calls","Total night calls"]

# Rows are tracked in a boolean mask instead of overwriting outliers with NaN:
# this keeps integer columns integer, and the fence variables no longer rebind
# the builtins `max` / `min` for the rest of the notebook.
keep_rows = np.ones(len(data), dtype=bool)
for column in outliers_columns:
    q25, q75 = np.percentile(data[column], [25, 75])
    iqr = q75 - q25
    lower_fence = q25 - (1.5 * iqr)
    upper_fence = q75 + (1.5 * iqr)
    values = data[column]
    is_outlier = (values < lower_fence) | (values > upper_fence)
    keep_rows &= ~is_outlier.to_numpy()

# dropna() is kept so rows with missing values are still removed, as before;
# reset_index(drop=True) renumbers the surviving rows from 0.
data = data.loc[keep_rows].dropna(axis=0).reset_index(drop=True)

In [None]:
# (rows, columns) after the outlier filter, for comparison with the earlier shape.
data.shape

In [None]:
# Box plots of the filtered columns, to check the effect of the outlier removal.
px.box(data.loc[:, outliers_columns])

In [None]:
# Convert float64 columns back to int64, but only when every value is a whole
# number. Casting all float columns unconditionally would truncate genuinely
# fractional values (e.g. 45.07 -> 45) before the cleaned data is saved.
for column in data.select_dtypes(include=["float64"]).columns:
    values = data[column]
    # NaN % 1 is NaN, so a column with missing values fails this test and is
    # left as float instead of raising in astype.
    if (values % 1 == 0).all():
        data[column] = values.astype("int64")
data.info()

In [None]:
# Preview the first rows after cleaning.
data.head()

In [None]:
# Build an independent copy whose State column is categorical, write it to the
# processed-data folder without the index column, then summarise it.
processed = data.assign(State=data["State"].astype("category"))
processed.to_csv(r"../data/processed/churn_cleaned.csv", index=False)
processed.info()

### Data Visualization

In [None]:
# Churn distribution: bar counts on the left, share of each class on the right.
fig, axs = plt.subplots(1, 2, figsize=(12, 6))
sns.countplot(x=data["Churn"], hue=data["Churn"], ax=axs[0])

churn_counts = data["Churn"].value_counts()
# Labels are derived from the value each slice represents rather than from a
# fixed positional list, so they stay correct whatever order value_counts()
# returns the classes in.
churn_labels = ["Churn" if flag else "Not Churn" for flag in churn_counts.index]
axs[1].pie(churn_counts, labels=churn_labels, autopct='%1.1f%%')
plt.show()

In [None]:
# One row per categorical feature: count plot on the left, pie chart on the right.
fig, axs = plt.subplots(4, 2, figsize=(12, 20))

plan_labels = {0: "No", 1: "Yes"}
# (column, number of most frequent values shown in the pie or None for all,
#  optional mapping from value to slice label)
panels = [
    ("International plan", None, plan_labels),
    ("Voice mail plan", None, plan_labels),
    ("Customer service calls", 6, None),
    ("Area code", None, None),
]

for (bar_ax, pie_ax), (column, top_n, label_map) in zip(axs, panels):
    counts = data[column].value_counts()
    if top_n is not None:
        # Only the most frequent values are drawn, so the percentages in this
        # pie are relative to the displayed slices, not to all rows.
        counts = counts.head(top_n)

    # Slice labels follow the values actually present in `counts` instead of a
    # hard-coded positional list, so they cannot be swapped if the ordering of
    # value_counts() changes.
    if label_map is None:
        labels = counts.index
    else:
        labels = [label_map.get(value, value) for value in counts.index]

    bar_ax.set_title(column)
    sns.countplot(x=column, data=data, ax=bar_ax, palette="pastel")
    pie_ax.set_title(f"Pie chart of {column}")
    pie_ax.pie(counts, labels=labels, autopct='%1.1f%%')

plt.show()

In [None]:
# Count plots of each categorical feature, split by churn label.
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
churn_split_columns = [
    "International plan",
    "Voice mail plan",
    "Customer service calls",
    "Area code",
]
# axs.ravel() walks the grid row by row: top-left, top-right, bottom-left, bottom-right.
for ax, column in zip(axs.ravel(), churn_split_columns):
    sns.countplot(x=column, data=data, hue="Churn", ax=ax)
# Render the figure explicitly, like the other plotting cells, so the cell
# output is the figure and not the repr of the last Axes.
plt.show()

In [None]:
# Distribution of account length with a KDE overlay, split by churn label.
sns.histplot(data, x="Account length", hue="Churn", kde=True)

In [None]:
# For each measure (minutes, calls, charge) draw one figure with the
# day / eve / night distributions side by side, split by churn label.
for measure in ("minutes", "calls", "charge"):
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    for ax, period in zip(axes, ("day", "eve", "night")):
        column = f"Total {period} {measure}"
        sns.histplot(data=data, x=data[column], ax=ax, kde=True, hue="Churn")
        # str.title() turns e.g. "Total day minutes" into "Total Day Minutes".
        ax.set_title(column.title())
    plt.show()

In [None]:
# Add per-customer totals over the day, evening and night periods for both
# minutes and charge, then plot one against the other coloured by churn label.
for measure in ("minutes", "charge"):
    day_part = data[f"Total day {measure}"]
    eve_part = data[f"Total eve {measure}"]
    night_part = data[f"Total night {measure}"]
    data[f"Total national {measure}"] = day_part + eve_part + night_part
px.scatter(data, x="Total national charge", y="Total national minutes", color="Churn")

In [None]:
# Split the rows by churn label and take the five most frequent states in each group.
filtered_data_churn = data.loc[data["Churn"] == 1]
filtered_data_not_churn = data.loc[data["Churn"] == 0]
top_states_churn = filtered_data_churn["State"].value_counts().head(5).reset_index()
top_states_not_churn = filtered_data_not_churn["State"].value_counts().head(5).reset_index()

# One pie per group; percentages are relative to the five states shown.
fig, axs = plt.subplots(1, 2, figsize=(12, 6))
pie_specs = (
    (top_states_churn, "Top 5 States with Churn"),
    (top_states_not_churn, "Top 5 States without Churn"),
)
for ax, (top_states, title) in zip(axs, pie_specs):
    ax.pie(top_states["count"], labels=top_states["State"], autopct='%1.0f%%')
    ax.set_title(title)
plt.show()

In [None]:
# Pairwise relationship grid over the frame's columns, coloured by churn label.
# NOTE(review): the grid grows with the square of the number of plotted columns,
# so this is presumably slow on the full frame — consider passing vars=[...] to limit it.
sns.pairplot(data=data,hue="Churn")

In [None]:
# Rank the numeric features by the absolute value of their correlation with churn.
# The Churn_bin column is still added to `data`, as before.
data['Churn_bin'] = data['Churn']
corr = (
    data.corr(numeric_only=True)['Churn_bin']
    # Drop the target itself under both of its names: 'Churn' is a copy of
    # 'Churn_bin', so leaving it in would head the ranking with a trivial 1.0.
    # errors="ignore" covers the case where 'Churn' is not among the numeric columns.
    .drop(['Churn_bin', 'Churn'], errors="ignore")
    .sort_values(key=lambda x: x.abs(), ascending=False)
)
print(corr)

In [None]:
# Correlation heatmap over all numeric columns.
# numeric_only=True selects every numeric dtype; filtering on "int64" alone
# silently leaves out any numeric column stored as float (or another int width).
# Churn_bin, when present, is an exact copy of Churn and is excluded so the
# target does not appear twice.
corr = data.drop(columns=["Churn_bin"], errors="ignore").corr(numeric_only=True)
plt.figure(figsize=(12, 8))
sns.heatmap(data=corr, annot=True, cmap="coolwarm", fmt=".2f")
plt.show()