In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

**Step-2: Read Dataset**

In [None]:
# Load the dataset from a CSV file; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv("Life Expectancy Data.csv")

In [None]:
# Preview the first rows to confirm the file was parsed into the expected columns
df.head()

**Step-3: Sanity Check of Data**

In [None]:
# Size of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Per-column dtype and non-null count, used to spot wrongly typed or sparse columns
df.info()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Share of missing values per column, as a percentage of all rows
# (the mean of the boolean null mask is the fraction of nulls)
missing_percentage = df.isnull().mean() * 100
missing_percentage

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns equal)
df.duplicated().sum()
# To inspect the duplicated rows themselves, uncomment the two lines below:
# duplicate_rows = df[df.duplicated()]
# duplicate_rows

In [None]:
# Look for garbage values (typos, placeholder strings, inconsistent labels) by
# listing how often each distinct value occurs in every object-dtype column.
# Numeric columns are not covered by this check.
for column_name in df.select_dtypes(include="object"):
    print(df[column_name].value_counts(), "***" * 15, sep="\n")

**Step-4: Exploratory Data Analysis (EDA)**

In [None]:
# Descriptive statistics of the numeric columns, transposed so that each
# column of the dataset becomes one row of the summary
df.describe().T

In [None]:
# Descriptive statistics of the object-dtype (text) columns, transposed so
# that each column of the dataset becomes one row of the summary
df.describe(include="object").T

In [None]:
# Distribution of every numeric column: one histogram per figure
for numeric_col in df.select_dtypes(include="number"):
    sns.histplot(data=df, x=numeric_col)
    plt.show()

In [None]:
# One boxplot per numeric column; points beyond the whiskers are outlier candidates
for numeric_col in df.select_dtypes(include="number"):
    sns.boxplot(data=df, x=numeric_col)
    plt.show()

In [None]:
# Scatter plots to understand each feature's relationship with the target:
# collect all numeric columns, then take the target column out of the list
numeric_cols = list(df.select_dtypes(include="number").columns)
numeric_cols.remove("Life expectancy ")  # the target name is referenced with its trailing space
numeric_cols

In [None]:
# One scatter plot per numeric feature (x) against the target column (y)
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(data=df, x=col, y="Life expectancy ", ax=ax)
    ax.set_title(f"Scatter plot between {col} and Life expectancy")
    plt.show()

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap
# to read feature/target relationships and to spot multicollinearity
plt.figure(figsize=(16, 16))
s = df.select_dtypes(include="number").corr()
sns.heatmap(data=s, annot=True)

**Step-5: Handling Missing Values**

In [None]:
# use mean, median, mode or KNNImputer to fill missing values
# Here: median imputation for two selected columns.
# The filled column is assigned back to df rather than calling
# fillna(inplace=True) on df[i]. The latter is chained assignment: it emits a
# FutureWarning in pandas 2.x and leaves df unchanged under Copy-on-Write.
for i in ["Polio", "Income composition of resources"]:
    df[i] = df[i].fillna(df[i].median())

In [None]:
# Fill the remaining numeric gaps with KNNImputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)

# Fit on ALL numeric columns in a single call. KNNImputer finds a row's
# neighbours from the features that are present in that row; when it is fitted
# on one column at a time there is nothing to measure distance on, so every gap
# just receives the column mean instead of a neighbour-based estimate.
# NOTE(review): distances are computed on unscaled values, so wide-range
# columns dominate the neighbour search — consider scaling before imputing.
# NOTE(review): KNNImputer drops columns that are entirely NaN, which would
# make this assignment fail on a shape mismatch — confirm no such column exists.
num_cols = df.select_dtypes(include="number").columns
df[num_cols] = imputer.fit_transform(df[num_cols])

In [None]:
# Re-count missing values per column to verify the result of the imputation steps
df.isnull().sum()

**Step-6: Outlier Treatment**

In [None]:
def wisker(col):
    """Return the lower and upper IQR whisker bounds of a numeric column.

    The bounds are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, where Q1 and Q3 are the
    25th and 75th percentiles and IQR = Q3 - Q1.

    Parameters
    ----------
    col : array-like of numbers
        Values to compute the bounds for (e.g. a DataFrame column).

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``.

    Notes
    -----
    Missing values are ignored. With ``np.percentile`` a single NaN would turn
    both bounds into NaN, and comparisons against NaN are always False, so any
    clipping based on those bounds would silently do nothing.
    """
    Q1, Q3 = np.nanpercentile(col, [25, 75])
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return lower_bound, upper_bound

In [None]:
# Outlier treatment is applied to continuous numeric features only; the target
# column, categorical columns and discrete columns are left untouched.
# Values beyond an IQR whisker are capped at that whisker.
for feature in ['GDP','Total expenditure',' thinness  1-19 years', ' thinness 5-9 years']:
    lower_cap, upper_cap = wisker(df[feature])
    df[feature] = df[feature].clip(lower=lower_cap, upper=upper_cap)

In [None]:
# Redraw the boxplots of the treated columns to confirm the outliers were capped
for treated_col in ['GDP','Total expenditure',' thinness  1-19 years', ' thinness 5-9 years']:
    sns.boxplot(data=df, x=treated_col)
    plt.show()

**Step-7: Duplicates and Garbage Values**

In [None]:
# Remove rows that exactly duplicate an earlier row, keeping the first occurrence
df = df.drop_duplicates()

**Step-8: Encoding of Data**

In [None]:
# Label Encoding (alternative to one-hot encoding; intentionally left disabled)
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# for i in df.select_dtypes(include="object").columns:
#     df[i] = le.fit_transform(df[i])

# df


In [None]:
# One-hot encode the two categorical columns. drop_first=True omits the first
# level of each column so the resulting dummy columns are not perfectly collinear.
dummy = pd.get_dummies(
    data=df,
    columns=["Country", "Status"],
    drop_first=True,
)
dummy

**Step-9: Normalization**

In [None]:
# Normalization: rescale every column of the encoded frame with min-max scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(dummy)
# fit_transform returns a bare NumPy array. Carry over dummy's index as well as
# its column names so normalized_df stays row-aligned with dummy/df; without
# it the frame gets a fresh 0..n-1 index, which no longer matches once
# drop_duplicates has removed any row.
# NOTE(review): dummy still contains the target column "Life expectancy ", so
# the target is rescaled here too — confirm that is intended.
normalized_df = pd.DataFrame(scaled_data, columns=dummy.columns, index=dummy.index)
normalized_df