In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [2]:
# Read covid_vaccination_eu.csv (resolved relative to the notebook's working
# directory) into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer="covid_vaccination_eu.csv")

In [3]:
# Preview the loaded frame; the notebook display elides the middle rows.
df

Unnamed: 0,YearWeekISO,ReportingCountry,Denominator,NumberDosesReceived,NumberDosesExported,FirstDose,FirstDoseRefused,SecondDose,DoseAdditional1,DoseAdditional2,DoseAdditional3,DoseAdditional4,DoseAdditional5,UnknownDose,Region,TargetGroup,Vaccine,Population
0,2022-W30,AT,7427239.0,0.0,0.0,7,,1,4,13,0,0,0,0,AT,ALL,JANSS,8978929
1,2022-W22,AT,1551690.0,0.0,0.0,0,,0,0,0,0,0,0,0,AT,Age<18,JANSS,8978929
2,2022-W22,AT,7427239.0,0.0,0.0,7,,1,10,30,0,0,0,0,AT,ALL,JANSS,8978929
3,2022-W22,AT,431785.0,0.0,0.0,0,,0,0,0,0,0,0,0,AT,Age0_4,MOD,8978929
4,2022-W22,AT,424628.0,0.0,0.0,0,,0,1,0,0,0,0,0,AT,Age10_14,MOD,8978929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
815592,2021-W34,SK,183656.0,0.0,0.0,332,,430,0,0,0,0,0,0,SK,Age80+,COM,5459781
815593,2021-W34,SK,422889.0,0.0,0.0,768,,1070,0,0,0,0,0,0,SK,Age70_79,COM,5459781
815594,2021-W34,SK,686528.0,0.0,0.0,1507,,2496,0,0,0,0,0,0,SK,Age60_69,COM,5459781
815595,2021-W16,SK,422889.0,0.0,0.0,10,,2,0,0,0,0,0,0,SK,Age70_79,AZ,5459781


### a) Identify which variables are categorical, discrete and continuous in the chosen data set and show using some visualization or plot. Explore whether there are missing values for any of the variables.

In [4]:
# Identify variables: list every column with its dtype and non-null count.
# `object` columns (e.g. ReportingCountry) are the candidates for categorical
# variables; int64 / float64 columns are the numeric ones.
# NOTE(review): task (a) also asks for a visualization of the variable types;
# no plot is produced in the cells visible here - confirm whether one is needed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 815597 entries, 0 to 815596
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   YearWeekISO          815597 non-null  object 
 1   ReportingCountry     815597 non-null  object 
 2   Denominator          439052 non-null  float64
 3   NumberDosesReceived  108520 non-null  float64
 4   NumberDosesExported  106322 non-null  float64
 5   FirstDose            815597 non-null  int64  
 6   FirstDoseRefused     1447 non-null    float64
 7   SecondDose           815597 non-null  int64  
 8   DoseAdditional1      815597 non-null  int64  
 9   DoseAdditional2      815597 non-null  int64  
 10  DoseAdditional3      815597 non-null  int64  
 11  DoseAdditional4      815597 non-null  int64  
 12  DoseAdditional5      815597 non-null  int64  
 13  UnknownDose          815597 non-null  int64  
 14  Region               815597 non-null  object 
 15  TargetGroup      

In [5]:
# Count the missing (NaN) entries in each column.
print(df.isna().sum())

YearWeekISO                 0
ReportingCountry            0
Denominator            376545
NumberDosesReceived    707077
NumberDosesExported    709275
FirstDose                   0
FirstDoseRefused       814150
SecondDose                  0
DoseAdditional1             0
DoseAdditional2             0
DoseAdditional3             0
DoseAdditional4             0
DoseAdditional5             0
UnknownDose                 0
Region                      0
TargetGroup                 0
Vaccine                     0
Population                  0
dtype: int64


In [6]:
# Drop columns that are not used in this analysis.
#
# - 'Denominator' and 'Vaccine': not relevant to the specific analysis.
# - 'FirstDoseRefused': almost entirely null (814150 of 815597 rows), so it
#   would not contribute to the analysis.
# - 'Region': irrelevant for our analysis; 'ReportingCountry' is used instead.
#
# errors="ignore" keeps the cell idempotent: re-running it after the columns
# have already been removed is a no-op instead of raising KeyError.
df = df.drop(
    columns=["Denominator", "FirstDoseRefused", "Region", "Vaccine"],
    errors="ignore",
)

In [7]:
# Re-check the per-column missing-value counts after dropping columns.
print(df.isna().sum())

YearWeekISO                 0
ReportingCountry            0
NumberDosesReceived    707077
NumberDosesExported    709275
FirstDose                   0
SecondDose                  0
DoseAdditional1             0
DoseAdditional2             0
DoseAdditional3             0
DoseAdditional4             0
DoseAdditional5             0
UnknownDose                 0
TargetGroup                 0
Population                  0
dtype: int64


In [8]:
# Replace the missing dose counts with zeros in both columns.
# NOTE(review): roughly 87% of the values in these two columns were missing
# (707077 and 709275 of 815597 rows), so the imputed zeros dominate any
# statistic computed on them afterwards - confirm that 0 is the intended fill.
for dose_col in ("NumberDosesReceived", "NumberDosesExported"):
    df[dose_col] = df[dose_col].fillna(0)

In [9]:
# Confirm that no column has missing values left after the zero fill.
print(df.isna().sum())

YearWeekISO            0
ReportingCountry       0
NumberDosesReceived    0
NumberDosesExported    0
FirstDose              0
SecondDose             0
DoseAdditional1        0
DoseAdditional2        0
DoseAdditional3        0
DoseAdditional4        0
DoseAdditional5        0
UnknownDose            0
TargetGroup            0
Population             0
dtype: int64


In [10]:
# After cleaning and preparing the dataset, it has 815597 rows and 14 columns
df.shape

(815597, 14)

### b) Calculate the statistical parameters (mean, median, minimum, maximum, and standard deviation) for each of the numerical variables.

In [11]:
# Summary statistics for every numeric column: count, mean, std, min, max and
# the quartiles (the 50% row is the median requested in task b).
print(df.describe())

       NumberDosesReceived  NumberDosesExported     FirstDose    SecondDose  \
count         8.155970e+05         8.155970e+05  8.155970e+05  8.155970e+05   
mean          1.842307e+03         1.110302e+02  1.144593e+03  1.041615e+03   
std           6.466648e+04         1.446742e+04  2.318690e+04  2.232044e+04   
min           0.000000e+00         0.000000e+00  0.000000e+00  0.000000e+00   
25%           0.000000e+00         0.000000e+00  0.000000e+00  0.000000e+00   
50%           0.000000e+00         0.000000e+00  0.000000e+00  0.000000e+00   
75%           0.000000e+00         0.000000e+00  1.000000e+00  0.000000e+00   
max           1.889280e+07         6.488820e+06  4.021725e+06  4.149209e+06   

       DoseAdditional1  DoseAdditional2  DoseAdditional3  DoseAdditional4  \
count     8.155970e+05     8.155970e+05    815597.000000    815597.000000   
mean      8.260828e+02     2.211480e+02        33.021870         0.753810   
std       2.529863e+04     5.452854e+03      1321.952954 

### c) Apply Min-Max Normalization, Z-score Standardization and Robust Scaler to the numerical data variables.

In [12]:
# Min-Max normalization: rescale every numeric column to the [0, 1] range.
numeric_part = df.select_dtypes(include=[np.number])
scaler = MinMaxScaler()
df_minmax = pd.DataFrame(scaler.fit_transform(numeric_part), columns=numeric_part.columns)

# Every column should now report min 0 and max 1.
print(df_minmax.describe())

       NumberDosesReceived  NumberDosesExported     FirstDose     SecondDose  \
count        815597.000000        815597.000000  8.155970e+05  815597.000000   
mean              0.000098             0.000017  2.846026e-04       0.000251   
std               0.003423             0.002230  5.765412e-03       0.005379   
min               0.000000             0.000000  0.000000e+00       0.000000   
25%               0.000000             0.000000  0.000000e+00       0.000000   
50%               0.000000             0.000000  0.000000e+00       0.000000   
75%               0.000000             0.000000  2.486495e-07       0.000000   
max               1.000000             1.000000  1.000000e+00       1.000000   

       DoseAdditional1  DoseAdditional2  DoseAdditional3  DoseAdditional4  \
count    815597.000000    815597.000000    815597.000000    815597.000000   
mean          0.000124         0.000177         0.000098         0.000022   
std           0.003806         0.004371         

In [13]:
# Z-score standardization: centre every numeric column on its mean and divide
# by its standard deviation.
numeric_part = df.select_dtypes(include=[np.number])
scaler = StandardScaler()
df_zscore = pd.DataFrame(scaler.fit_transform(numeric_part), columns=numeric_part.columns)

# StandardScaler divides by the population std (ddof=0) while describe()
# reports the sample std (ddof=1), hence std prints as 1.000001 rather than 1.
print(df_zscore.describe())

       NumberDosesReceived  NumberDosesExported     FirstDose    SecondDose  \
count         8.155970e+05         8.155970e+05  8.155970e+05  8.155970e+05   
mean         -2.520412e-14         4.184000e-14  3.103197e-14  6.341251e-14   
std           1.000001e+00         1.000001e+00  1.000001e+00  1.000001e+00   
min          -2.848938e-02        -7.674501e-03 -4.936383e-02 -4.666644e-02   
25%          -2.848938e-02        -7.674501e-03 -4.936383e-02 -4.666644e-02   
50%          -2.848938e-02        -7.674501e-03 -4.936383e-02 -4.666644e-02   
75%          -2.848938e-02        -7.674501e-03 -4.932070e-02 -4.666644e-02   
max           2.921293e+02         4.485052e+02  1.733989e+02  1.858462e+02   

       DoseAdditional1  DoseAdditional2  DoseAdditional3  DoseAdditional4  \
count     8.155970e+05     8.155970e+05     8.155970e+05     8.155970e+05   
mean     -3.327864e-14     5.638211e-14     9.029824e-16     2.462484e-14   
std       1.000001e+00     1.000001e+00     1.000001e+00 

In [14]:
# Robust scaler: centre every numeric column on its median and scale by its
# interquartile range (RobustScaler defaults).
numeric_part = df.select_dtypes(include=[np.number])
scaler = RobustScaler()
df_robust = pd.DataFrame(scaler.fit_transform(numeric_part), columns=numeric_part.columns)

# NOTE(review): for the columns visible in the printed output the result is
# identical to the unscaled describe() - their median is 0 and their IQR is
# 0 or 1, so the transform appears to leave them unchanged. Confirm this is
# understood before relying on df_robust.
print(df_robust.describe())

       NumberDosesReceived  NumberDosesExported     FirstDose    SecondDose  \
count         8.155970e+05         8.155970e+05  8.155970e+05  8.155970e+05   
mean          1.842307e+03         1.110302e+02  1.144593e+03  1.041615e+03   
std           6.466648e+04         1.446742e+04  2.318690e+04  2.232044e+04   
min           0.000000e+00         0.000000e+00  0.000000e+00  0.000000e+00   
25%           0.000000e+00         0.000000e+00  0.000000e+00  0.000000e+00   
50%           0.000000e+00         0.000000e+00  0.000000e+00  0.000000e+00   
75%           0.000000e+00         0.000000e+00  1.000000e+00  0.000000e+00   
max           1.889280e+07         6.488820e+06  4.021725e+06  4.149209e+06   

       DoseAdditional1  DoseAdditional2  DoseAdditional3  DoseAdditional4  \
count     8.155970e+05     8.155970e+05    815597.000000    815597.000000   
mean      8.260828e+02     2.211480e+02        33.021870         0.753810   
std       2.529863e+04     5.452854e+03      1321.952954 