In [1]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [5]:
def examine_data(df):
    """Print a quick diagnostic report for a DataFrame and return it.

    The report is written to stdout and contains, in order: the
    ``DataFrame.info()`` summary, the per-column null counts, the number of
    fully duplicated rows, and the ``describe()`` summary statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    print("\nBasic Information:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it is called directly; wrapping it in print() would emit an extra
    # "None" line after the report.
    df.info()

    print("\nMissing Values:")
    print(df.isnull().sum())

    print("\nDuplicate Rows:", df.duplicated().sum())

    print("\nSummary Statistics:")
    print(df.describe())

    return df

# Load the country-level aggregated COVID-19 CSV straight from its raw GitHub
# URL (this cell needs network access to run).
df = pd.read_csv('https://raw.githubusercontent.com/datasets/covid-19/master/data/countries-aggregated.csv')
# examine_data prints the diagnostic report and returns the frame; because the
# call is the last expression in the cell, the returned frame is also rendered
# as the cell's output.
examine_data(df)


Basic Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161568 entries, 0 to 161567
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Date       161568 non-null  object
 1   Country    161568 non-null  object
 2   Confirmed  161568 non-null  int64 
 3   Recovered  161568 non-null  int64 
 4   Deaths     161568 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 6.2+ MB
None

Missing Values:
Date         0
Country      0
Confirmed    0
Recovered    0
Deaths       0
dtype: int64

Duplicate Rows: 0

Summary Statistics:
          Confirmed     Recovered         Deaths
count  1.615680e+05  1.615680e+05  161568.000000
mean   7.361569e+05  1.453967e+05   13999.436089
std    3.578884e+06  9.748275e+05   59113.581271
min    0.000000e+00  0.000000e+00       0.000000
25%    1.220000e+03  0.000000e+00      17.000000
50%    2.369200e+04  1.260000e+02     365.000000
75%    2.558420e+05  1.797225e+04    4509.00000

Unnamed: 0,Date,Country,Confirmed,Recovered,Deaths
0,2020-01-22,Afghanistan,0,0,0
1,2020-01-23,Afghanistan,0,0,0
2,2020-01-24,Afghanistan,0,0,0
3,2020-01-25,Afghanistan,0,0,0
4,2020-01-26,Afghanistan,0,0,0
...,...,...,...,...,...
161563,2022-04-12,Zimbabwe,247094,0,5460
161564,2022-04-13,Zimbabwe,247160,0,5460
161565,2022-04-14,Zimbabwe,247208,0,5462
161566,2022-04-15,Zimbabwe,247237,0,5462


In [6]:
def clean_data(df):
    """Return a copy of ``df`` with missing values imputed column by column.

    Numeric columns (as selected by ``np.number``) have their nulls replaced
    by the column median. All other columns have their nulls replaced by the
    column's most frequent value (the first one returned by ``mode()``).

    A non-numeric column that contains no non-null values has no mode, so it
    is left untouched instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        The imputed copy.
    """
    df_clean = df.copy()

    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
    categorical_cols = df_clean.select_dtypes(exclude=[np.number]).columns

    for col in numeric_cols:
        # For an all-null numeric column the median is NaN, so this is a no-op.
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

    for col in categorical_cols:
        modes = df_clean[col].mode()
        if modes.empty:
            # Column is entirely null: there is no value to impute with.
            continue
        df_clean[col] = df_clean[col].fillna(modes.iloc[0])

    return df_clean

# Apply the cleaning step to the loaded frame, then report the per-column null
# counts once more: header and counts go out in a single print call, each on
# its own line.
df_cleaned = clean_data(df)
print("\nMissing values after cleaning:", df_cleaned.isna().sum(), sep="\n")


Missing values after cleaning:
Date         0
Country      0
Confirmed    0
Recovered    0
Deaths       0
dtype: int64


In [None]:
def normalize_data(df, columns_to_normalize):
    df_normalized = df.copy()
    
    scaler = StandardScaler()
    df_normalized[f'{columns_to_normalize}']