# Exploratory Data Analysis (EDA)

## Import relevant libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Change display settings

In [None]:
# Notebook-wide display defaults: never truncate the column list when a
# frame is rendered, and use seaborn's "whitegrid" style for all figures.
pd.options.display.max_columns = None
sns.set(style="whitegrid")

## Load Dataset

In [None]:
# Read the raw Telco customer-churn CSV and preview its first rows.
raw_csv_path = "../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(raw_csv_path)
df.head()

## Check data types and general info

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

In [None]:
# Summary statistics for the columns pandas currently treats as numeric.
# NOTE(review): TotalCharges is only converted with pd.to_numeric further
# down, so it is presumably still text here and left out -- confirm.
df.describe()

In [None]:
# Absolute number of rows in each Churn class.
df["Churn"].value_counts()

In [None]:
# Share of rows in each Churn class (fractions rather than counts).
df["Churn"].value_counts(normalize=True)

## Check for missing values

In [None]:
# Count missing (NaN/None) entries per column.
# NOTE(review): entries that are present but unparseable (e.g. non-numeric
# text in TotalCharges, which is coerced to NaN below) are not counted here.
df.isnull().sum()

### Convert TotalCharges to a numeric data type

In [None]:
# Parse TotalCharges as numbers; errors='coerce' turns any value that cannot
# be parsed into NaN instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [None]:
# Confirm the dtype of the column after the conversion.
df['TotalCharges'].dtype

In [None]:
# Number of NaN values in TotalCharges after the numeric conversion.
df['TotalCharges'].isnull().sum()

In [None]:
# Drop the rows whose TotalCharges is NaN after the numeric conversion.
# The explicit .copy() makes the result an independent frame, so the column
# added to `df` later (Churn_num) is not written into an object pandas may
# still track as a slice of the original frame (SettingWithCopyWarning on
# pandas versions without copy-on-write).
df = df.dropna(subset=['TotalCharges']).copy()

In [None]:
# Re-check after dropna: no NaN should remain in TotalCharges.
df['TotalCharges'].isnull().sum()

## Target variable distribution

In [None]:
# Bar chart of how many rows fall into each Churn class.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(data=df, x="Churn", ax=ax)
ax.set_title("Churn Distribution")
plt.show()

## Relationship analysis for numerical features

In [None]:
# Box plots of each numeric feature split by Churn, drawn side by side.
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
fig, axes = plt.subplots(1, len(numeric_features), figsize=(15, 4))
for ax, feature in zip(axes, numeric_features):
    sns.boxplot(data=df, x='Churn', y=feature, ax=ax)
    ax.set_title(f"{feature} by Churn")
fig.tight_layout()
plt.show()

## Plot the categorical features

In [None]:
# Categorical feature columns that are plotted against Churn below.
cat_cols = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
]

In [None]:
# One count plot per categorical feature, with bars split by Churn.
# The number of grid rows is derived from len(cat_cols) so that adding a
# feature never asks plt.subplot for a slot outside the grid (which raises
# ValueError); with the current 16 features this is the same 6x3 layout.
n_cols = 3
n_rows = -(-len(cat_cols) // n_cols)  # ceiling division without importing math

plt.figure(figsize=(15, 20))

for i, col in enumerate(cat_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.countplot(data=df, x=col, hue='Churn')
    plt.title(f"{col} vs Churn")
    plt.xticks(rotation=45)  # tilt the category labels to limit overlap

plt.tight_layout()
plt.show()

## Correlation Analysis for Numerical Features

In [None]:
# Encode the Yes/No target as 1/0 so it can take part in the correlation
# matrix; any label missing from the mapping becomes NaN.
churn_codes = {'Yes': 1, 'No': 0}
df['Churn_num'] = df['Churn'].map(churn_codes)
df[['Churn', 'Churn_num']].head()

In [None]:
# Pairwise correlations (DataFrame.corr defaults) between the numeric
# features and the 1/0-encoded churn flag, drawn as an annotated heatmap.
corr_columns = ['tenure', 'MonthlyCharges', 'TotalCharges', 'Churn_num']
corr_matrix = df[corr_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

## Summary of EDA
* The churn rate is ~26%, which indicates class imbalance.
* Tenure has a strong negative relationship with churn.
* MonthlyCharges has a positive relationship with churn.
* Senior citizens and customers without long-term contracts show higher churn.
* Outliers in the numeric variables are retained because they represent genuine, valuable customers of the service rather than data errors.

## Implications for Modeling
* Class imbalance handling may be required.
* Tree-based models may capture non-linear relationships better.
* Tenure and contract type are expected to be strong predictors.