# Objective : Customer Churn Prediction - EDA
# EDA - Python
# Insights - Patterns
# Classification 

![image.png](attachment:image.png)

# 1. Load Python Modules

In [5]:
#  Use Python's import statement to load modules 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
# Importing the SimpleImputer class
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from tabulate import tabulate

# 2. Read the Dataset from csv/excel file  - Using Pandas 

### 2.1 CSV

In [6]:
# Read the salary data from a local CSV file; the bare name on the last line
# makes the notebook render the resulting frame.
file_path = r"Salary_Data-Salary_Data.csv"
df_csv = pd.read_csv(filepath_or_buffer=file_path)
df_csv

Unnamed: 0,YearsExperience,Salary
0,1.1,39343
1,1.3,46205
2,1.5,37731
3,2.0,43525
4,2.2,39891
5,2.9,56642
6,3.0,60150
7,3.2,54445
8,3.2,64445
9,3.7,57189


  ### 2.2 XLSX

In [7]:
# Read the same salary data from an Excel workbook and display it.
file_path = r"Salary_Data-XLSX.xlsx"
df_excel = pd.read_excel(io=file_path)
df_excel

Unnamed: 0,YearsExperience,Salary
0,1.1,39343
1,1.3,46205
2,1.5,37731
3,2.0,43525
4,2.2,39891
5,2.9,56642
6,3.0,60150
7,3.2,54445
8,3.2,64445
9,3.7,57189


### 2.3 URL

In [8]:
# read_csv also accepts an http(s) URL, so the raw GitHub file is loaded
# directly without downloading it first.
url = "https://raw.githubusercontent.com/alexvatti/full-stack-data-science/main/Internship-Exercises/ML-Internship/Regression/Salary-Regression/Salary_Data-Salary_Data.csv"
df_url = pd.read_csv(filepath_or_buffer=url)
df_url

Unnamed: 0,YearsExperience,Salary
0,1.1,39343
1,1.3,46205
2,1.5,37731
3,2.0,43525
4,2.2,39891
5,2.9,56642
6,3.0,60150
7,3.2,54445
8,3.2,64445
9,3.7,57189


### 2.4 JSON

In [9]:
# Read the iris data from a local JSON file and display it.
file_path = r"iris.json"
df_json = pd.read_json(path_or_buf=file_path)
df_json

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [10]:
# Work on an independent (deep) copy so df_json keeps the data as loaded.
df = df_json.copy(deep=True)

# 3. Basic Inspection on given dataset
- Head , tail , sample
- Shape , len , size
- dtypes , columns , info
- isnull().sum() , isna().sum()

In [11]:
def basic_inspection_dataset(table):
    """Print a quick structural overview of a DataFrame.

    The report is written to standard output in this order: first and last
    five rows, shape, row count, total cell count, column names, column
    dtypes, ``DataFrame.info()``, per-column missing-value counts (via both
    ``isnull`` and ``isna``) and ``describe()`` statistics.

    Parameters
    ----------
    table : pandas.DataFrame
        The dataset to inspect. It is only read, never modified.

    Returns
    -------
    None
        Everything is printed; nothing is returned.
    """
    print("top 5 rows - using head")
    print(table.head())
    print()

    print("bottom 5 rows using tail")
    print(table.tail())
    print()

    print("numbers of samples and columns")
    print(table.shape)
    print()

    print("numbers of samples ")
    print(len(table))
    print()

    print("numbers of entries in the data frame")
    print(table.size)
    print()

    print("Columns Names")
    print(table.columns)
    print()

    print("Columns dtypes")
    print(table.dtypes)
    print()

    print("Dataframe info")
    # info() writes its report itself and returns None, so it must not be
    # wrapped in print() -- that would add a stray "None" line to the output.
    table.info()
    print()

    print()
    print("check the missing value in each column")
    print(table.isnull().sum())

    # isna() is shown as well for reference; it reports the same counts.
    print()
    print("check the missing value in each column")
    print(table.isna().sum())

    print()
    print("table summary ")
    print(table.describe())

# Print the inspection report for the working copy of the data.
basic_inspection_dataset(table=df)

top 5 rows - using head
   sepalLength  sepalWidth  petalLength  petalWidth species
0          5.1         3.5          1.4         0.2  setosa
1          4.9         3.0          1.4         0.2  setosa
2          4.7         3.2          1.3         0.2  setosa
3          4.6         3.1          1.5         0.2  setosa
4          5.0         3.6          1.4         0.2  setosa

bottom 5 rows using tail
     sepalLength  sepalWidth  petalLength  petalWidth    species
145          6.7         3.0          5.2         2.3  virginica
146          6.3         2.5          5.0         1.9  virginica
147          6.5         3.0          5.2         2.0  virginica
148          6.2         3.4          5.4         2.3  virginica
149          5.9         3.0          5.1         1.8  virginica

numbers of samples and columns
(150, 5)

numbers of samples 
150

numbers of entries in the data frame
750

Columns Names
Index(['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth', 'species'], 

### 3.1 Non-significant columns - drop them here
 - These are columns from which no insights can be drawn


# 4. Handling Missing Values - Categorical Variables
 - Replace with the mode - one of several methods

In [12]:
# Report the number of missing values in every object-dtype column and
# remember the columns that have at least one gap.
cat_columns = df.select_dtypes(include='object').columns
missing_vals_cat_columns = []
for cat_var in cat_columns:
    n_missing = df[cat_var].isna().sum()
    print(f"{cat_var} missing values")
    print(n_missing)
    if n_missing == 0:
        continue
    missing_vals_cat_columns.append(cat_var)

species missing values
0


### 4.1 fillna

In [13]:
# Fill the gaps of each flagged categorical column with its most frequent value.
for cat_var in missing_vals_cat_columns:
    print(f"{cat_var} fill missing values - mode")
    modes = df[cat_var].mode()
    if modes.empty:
        # Every value in the column is missing, so there is no mode to use.
        continue
    # Assign the result back instead of calling fillna(inplace=True) on
    # df[cat_var]: that chained in-place call works on an intermediate object
    # and does not update df when pandas Copy-on-Write is active.
    df[cat_var] = df[cat_var].fillna(modes.iloc[0])

### 4.2 Simple Imputer

#### mean, median, mode

In [14]:
# Same mode-based fill as above, done with scikit-learn's SimpleImputer.
mode_imputer = SimpleImputer(strategy='most_frequent')
for cat_var in missing_vals_cat_columns:
    # The imputer defined above is mode_imputer (the cell used to reference an
    # undefined mean_imputer). fit_transform expects 2-D input and returns an
    # (n_samples, 1) array, so it is flattened before being written back.
    result_imputer = mode_imputer.fit_transform(df[[cat_var]])
    df[cat_var] = result_imputer.ravel()

# 5. Categorical - Univariate Analysis - Using Pipeline
   - Frequency Table 
   - Bar Graph
   - Pie Graph

In [None]:
class BarPieChartTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that reports and plots every categorical column.

    For each object-dtype column of the input it prints a frequency table and
    a relative-frequency table, then shows a bar chart and a pie chart side by
    side. The data is passed through unchanged.
    """

    def fit(self, X, y=None):
        """Do nothing (there is nothing to learn) and return ``self``."""
        return self

    def transform(self, X):
        """Print the tables, show the charts and return ``X`` unchanged.

        Parameters
        ----------
        X : pandas.DataFrame
            Data whose object-dtype columns are summarised. It is only read.

        Returns
        -------
        pandas.DataFrame
            The same ``X`` that was passed in, so that the pipeline (and any
            later step) receives data instead of ``None``.
        """
        cat_cols = X.select_dtypes(include='object').columns
        for cat_name in cat_cols:
            # Count once; the tables and both charts are built from this.
            counts = X[cat_name].value_counts()

            freq_table = counts.reset_index()
            # Rename the columns
            freq_table.columns = ['Class', 'Frequency']

            # Print the result as a table
            print(f"{cat_name} frequency table")
            print(tabulate(freq_table, headers='keys', tablefmt='pretty'))

            # Calculate relative frequency
            total_count = freq_table['Frequency'].sum()
            freq_table['Relative Frequency %'] = round(
                (freq_table['Frequency'] / total_count) * 100, 2
            )

            # Print the result as a table
            print(f"{cat_name} Relative frequency table")
            print(tabulate(freq_table, headers='keys', tablefmt='pretty'))

            values = counts.values
            labels = counts.index

            fig, axs = plt.subplots(1, 2, figsize=(18, 6))  # 1 row, 2 columns
            # Bar chart of absolute counts on the left
            axs[0].bar(labels, values)
            axs[0].set_title(f'Frequency of {cat_name}')
            axs[0].set_xlabel('Categories')  # Set x-label
            axs[0].set_ylabel('Count')       # Set y-label

            # Pie chart of shares on the right
            axs[1].pie(values, labels=labels, autopct='%0.2f%%', startangle=40)
            axs[1].set_title(f'Relative Frequency of {cat_name}')
            plt.tight_layout()
            # Show the plot
            plt.show()

        return X
        
        

In [None]:
# One-step pipeline that wraps the categorical report transformer.
cat_steps = [('bar_pie_chart', BarPieChartTransformer())]
pipeline_cat_var = Pipeline(steps=cat_steps)

# Running fit_transform on df fits the step and then calls its transform,
# which prints the tables and draws the charts.
processed_data = pipeline_cat_var.fit_transform(df)

# 6. Handling Missing Values in Numerical Columns
- Replace with the median - one of several methods

# 7. Numerical - Univariate Analysis - Using Pipeline
- Frequency Interval - Data Distribution Table
- Histogram Graph
- Hist with KDE Graph
- Box Plots

In [None]:
class HistBoxChartTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that plots the distribution of every numeric column.

    For each numeric column of the input it shows two panels side by side: a
    plain matplotlib histogram and a seaborn histogram with a KDE curve.
    The data is passed through unchanged.
    """

    def fit(self, X, y=None):
        """Do nothing (there is nothing to learn) and return ``self``."""
        return self

    def transform(self, X):
        """Show the histograms and return ``X`` unchanged.

        Parameters
        ----------
        X : pandas.DataFrame
            Data whose numeric columns are plotted. It is only read.

        Returns
        -------
        pandas.DataFrame
            The same ``X`` that was passed in, so that the pipeline (and any
            later step) receives data instead of ``None``.
        """
        # Select numeric dtypes explicitly; merely excluding 'object' would
        # also let non-numeric dtypes such as datetimes through.
        num_cols = X.select_dtypes(include='number').columns
        for con_var in num_cols:
            # Missing values are dropped first: matplotlib's hist cannot
            # derive a bin range from data that contains NaN.
            values = X[con_var].dropna()

            # Create a figure and axes object
            fig, axes = plt.subplots(1, 2, figsize=(14, 6))

            # Plot histogram without KDE on the left
            axes[0].hist(values, color='skyblue', edgecolor='black')
            axes[0].set_xlabel('Value')
            axes[0].set_ylabel('Frequency')
            axes[0].set_title(f'Histogram {con_var}')

            # Plot histogram with KDE on the right. histplot's default
            # statistic is the count per bin, so the axis is labelled
            # 'Count' rather than 'Density'.
            sns.histplot(x=values, kde=True, color='orange', edgecolor='black', ax=axes[1])
            axes[1].set_xlabel('Value')
            axes[1].set_ylabel('Count')
            axes[1].set_title(f'Histogram with KDE {con_var}')

            # Adjust layout
            plt.tight_layout()

            # Show the combined plot
            plt.show()

        return X
        


In [None]:
# One-step pipeline that wraps the numerical distribution plots.
pipeline_num_var = Pipeline([
    ('hist_box_chart', HistBoxChartTransformer())
])

# Fit and transform the working DataFrame. The cell used to pass `sales_df`,
# a name that is never defined in this notebook; the data lives in `df`.
processed_data = pipeline_num_var.fit_transform(df)

# 8. Numerical - Variables -Outliers Analysis
- IQR Method 
- -3Sigma to +3 Sigma Method

# 9. Bivariate Analysis

## Cat Vs Cat
 - Crosstab - Contingency Table
 - Bar Graph with hue


## Cat Vs Num
- Box Plot with hue
- Violin Plot with hue

## Num Vs Num
- Scatter Plot with Hue
- Correlation (covariance, correlation coefficient)
- heatmap

In [None]:
# Compute the correlation matrix of the numeric columns, print it and draw
# it as an annotated heatmap.
corr_matrix = df.corr(numeric_only=True)
print(corr_matrix)
sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True)
plt.show()

# 10. Data Transformation

### 10.1 left - skewed  to Normal

### 10.2 Right - Skewed  to Normal 

### 10.3 Different distributions to normal distribution

In [None]:
# Columns to log-transform (fill in the column names to transform)
columns_to_scale = []

for var in columns_to_scale:
    # np.log1p computes log(1 + x), so zeros are handled without producing
    # -inf. (The cell used to call the non-existent `np.lop1p`.)
    df[var + "_log"] = np.log1p(df[var])
print(df)

In [None]:
# Names of the columns that should get a square-root transformed companion
columns_to_scale = []

for var in columns_to_scale:
    # Store the element-wise square root next to the original column
    df[var + "_sqrt"] = np.sqrt(df[[var]])
print(df)

# 11. Standardization - Normalization
- Min Max Scaler
- Standard Scaler

In [None]:
# MinMaxScaler comes from sklearn.preprocessing (see the imports at the top).
scaler = MinMaxScaler()

# Columns to scale (fill in the column names to rescale)
columns_to_scale = []

for var in columns_to_scale:
    # The scaler is refit for every column, so each one is rescaled on its
    # own range. fit_transform expects 2-D input and returns an
    # (n_samples, 1) array, which is flattened before it is stored.
    df[var + "_MinMax"] = scaler.fit_transform(df[[var]]).ravel()
print(df)

In [None]:
# StandardScaler comes from sklearn.preprocessing (see the imports at the top).
scaler = StandardScaler()

# Columns to scale (fill in the column names to standardise). This cell used
# to hard-code 'ApplicantIncome_sqrt', 'CoapplicantIncome_sqrt' and
# 'LoanAmount_sqrt' and to write into `loan_df`; neither those columns nor
# that frame exist in this notebook, and the call was missing its closing
# parenthesis. It now follows the same pattern as the Min-Max cell.
columns_to_scale = []

for var in columns_to_scale:
    # Standardisation is column-wise, so fitting per column gives the same
    # result as fitting all columns at once. The (n_samples, 1) result is
    # flattened and stored next to the original column.
    df[var + "_stand"] = scaler.fit_transform(df[[var]]).ravel()

print(df)

# 12. Convert Cat - to - Numerical Columns
- One-Hot Encoding
- Label Encoding

### One-Hot Encoding

In [None]:
# Replace df with its one-hot encoded version.
df = pd.get_dummies(data=df)

### Label Encoding

In [None]:
# Columns to label-encode: every column that still holds text. (`var_list`
# was never defined before; override it to encode a specific subset.)
var_list = df.select_dtypes(include='object').columns

# One fitted encoder is kept per column so the integer codes can later be
# mapped back with label_encoders[col].inverse_transform(...). A single
# shared encoder would only remember the classes of the last column.
label_encoders = {}
for var in var_list:
    le = LabelEncoder()
    df[var] = le.fit_transform(df[var])
    label_encoders[var] = le

# 13. Reduce the Number of Columns
- PCA
- t-SNE

# 14. Inferential statistics test
- chi-square test
- ANOVA Test
- t-test (one-tailed and two-tailed)
- biserial correlation coefficient