# Анализ вакцинирования от COVID-19

## В этом ноутбуке:
* Обработка данных (в частности: фильтрация, обработка пропусков)
* Визуализация (в том числе с анимацией)
* Анализ прогресса в вакцинации
* Анализ использования вакцин в странах мира
* Рейтиг стран по уровню вакцинации
* Связь вакцинации и политических, экономических и демографических показателей
* Кто следующий начнёт вакцинацию?

## Подготовка всего необходимого для анализа

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os
import plotly.express as px
import plotly.graph_objs as go

In [2]:
vaccination_data = pd.read_csv('../input/covid-world-vaccination-progress/country_vaccinations.csv')
covid_data = pd.read_csv('../input/corona-virus-report/country_wise_latest.csv')
population_data = pd.read_csv('../input/population-by-country-2020/population_by_country_2020.csv')
country_feature_data = pd.read_csv('../input/undata-country-profiles/country_profile_variables.csv')

Как мы можем заметить, датасет имеет много пропущенных значений. Для некоторх стран данные указываются не ежедневно.

In [3]:
vaccination_data = vaccination_data.drop(['iso_code', 'source_name', 'source_website', 'people_vaccinated', 
                              'people_fully_vaccinated', 'daily_vaccinations_raw', 'people_vaccinated_per_hundred', 
                              'total_vaccinations_per_hundred', 'people_fully_vaccinated_per_hundred', 
                                'daily_vaccinations_per_million'], axis='columns')

In [4]:
vaccination_data = vaccination_data.loc[(vaccination_data['country'] !='England') & 
                                        (vaccination_data['country'] != 'Scotland') & 
                                        (vaccination_data['country'] != 'Wales') & 
                                        (vaccination_data['country'] != 'Northern Ireland')]

# Какие вакцины самые распространённые?

In [5]:
vaccine_list = list(vaccination_data['vaccines'].unique())
simple_vaccine_list = []
for vaccine in vaccine_list:
    if ', ' in vaccine:
        simple_vaccine_list.extend(vaccine.split(', '))
    else:
        simple_vaccine_list.append(vaccine)
simple_vaccine_list = list(set(simple_vaccine_list))

In [6]:
def dict_sort_val(d):
    list_d = list(d.items())
    list_d.sort(key=lambda i: i[1])
    return dict(list_d)

In [7]:
country_vaccine_list = [[] for i in range(len(simple_vaccine_list))]
for index, row in vaccination_data.iterrows():
    for num, vaccine in enumerate(simple_vaccine_list):
        if vaccine in row['vaccines']:
            if row['country'] not in country_vaccine_list[num]:
                country_vaccine_list[num].append(row['country'])
count_country_vaccine_list = [len(x) for x in country_vaccine_list]
vaccine_dict = {simple_vaccine_list[i]: count_country_vaccine_list[i] for i in range(len(simple_vaccine_list))}
vaccine_dict = dict_sort_val(vaccine_dict)

In [8]:
px.histogram(y=vaccine_dict.keys(), x=vaccine_dict.values(), orientation='h', 
             labels={'x': 'country', 'y': 'vaccine'}, title='Distribution of vaccines by country')

In [9]:
start_date = vaccination_data['date'].min()
last_date = vaccination_data['date'].max()
date_list = []
for i in pd.date_range(start_date, last_date):
    date_list.append(str(i)[0:10])

In [10]:
svp, dl, cc,= [], [], []
for vac in simple_vaccine_list:
    c = set()
    for date in date_list:
        svp.append(vac)
        dl.append(date)
        for index, row in vaccination_data.iterrows():
            if vac in row['vaccines'] and row['date'] == date:
                c.add(row['country'])
        cc.append(len(set(c)))
simple_vaccine_progress = pd.DataFrame({'vaccine': svp, 'date': dl, 'count_country': cc})

In [11]:
px.line(simple_vaccine_progress, x='date', y='count_country', color='vaccine', title='Progress of disribution of vaccines')

In [12]:
country_set = set()
vac_country_dict = {}
d, c, v =[], [], []
for date in date_list:
    for index, row in vaccination_data.iterrows():
        if row['date'] == date:
            country_set.add(row['country'])
        if vac_country_dict.get(row['country']) is None:
            vac_country_dict.update([(row['country'], row['vaccines'])])
    for i in range(len(country_set)):
        d.append(date)
        c.append(list(country_set)[i])
        v.append(vac_country_dict[list(country_set)[i]])
vaccine_country = pd.DataFrame({'date': d, 'country': c, 'vaccines': v})

In [13]:
count_country_vaccinated = {date: len(vaccine_country.query('date == @date'))for date in date_list}
last_day_vaccine_country = vaccine_country.query('date == @last_date')
count_vaccine_country = {vaccines: len(last_day_vaccine_country.query('vaccines == @vaccines'))for vaccines in vaccine_list}

In [14]:
def Insert_row(row_number, df, row_value):
    start_upper = 0
    end_upper = row_number
    start_lower = row_number
    end_lower = df.shape[0]
    upper_half = [*range(start_upper, end_upper, 1)]
    lower_half = [*range(start_lower, end_lower, 1)]
    lower_half = [x.__add__(1) for x in lower_half]
    index_ = upper_half + lower_half
    df.index = index_
    df.loc[row_number] = row_value
    df = df.sort_index()
    return df

In [15]:
lvl = set(vaccine_list)
for index, row in vaccine_country.iterrows():
    if row['date'] != start_date:
        break
    lvl.discard(row['vaccines'])
for i in list(lvl):
    Insert_row(0, vaccine_country, [start_date, 'Null', i])

In [16]:
px.choropleth(vaccine_country, locations='country', locationmode='country names', color='vaccines', animation_frame='date', 
              animation_group='vaccines', title='Vaccination in the world')

In [17]:
px.line(x=count_vaccine_country.keys(), y=count_vaccine_country.values(), 
        labels={'x': 'Date', 'y': 'Number of countries'}, title='Number of countries that have started vaccination')

In [18]:
px.pie(names=count_vaccine_country.keys(), values=count_vaccine_country.values(), title='Prevalence of the vaccine combination in different countries', hole=.5)

* График количества привитых конкретными вакцинами (после восстановления зависимости)
* График ежедневного вакцинирования конкретными вакцинами (после восстановления зависимости)

In [19]:
vaccination_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1480 entries, 0 to 1607
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   country             1480 non-null   object 
 1   date                1480 non-null   object 
 2   total_vaccinations  991 non-null    float64
 3   daily_vaccinations  1421 non-null   float64
 4   vaccines            1480 non-null   object 
dtypes: float64(2), object(3)
memory usage: 69.4+ KB


In [20]:
vaccination_data = vaccination_data.interpolate(method ='linear', limit_direction ='forward')
vaccination_data = vaccination_data.dropna(axis=0)
vaccination_data

Unnamed: 0,country,date,total_vaccinations,daily_vaccinations,vaccines
1,Argentina,2020-12-30,16356.5,15656.0,Sputnik V
2,Argentina,2020-12-31,32013.0,15656.0,Sputnik V
3,Argentina,2021-01-01,33909.5,11070.0,Sputnik V
4,Argentina,2021-01-02,35806.0,8776.0,Sputnik V
5,Argentina,2021-01-03,37702.5,7400.0,Sputnik V
...,...,...,...,...,...
1603,United States,2021-01-22,19107959.0,975540.0,"Moderna, Pfizer/BioNTech"
1604,United States,2021-01-23,20537990.0,1057387.0,"Moderna, Pfizer/BioNTech"
1605,United States,2021-01-24,21848655.0,1122182.0,"Moderna, Pfizer/BioNTech"
1606,United States,2021-01-25,22734243.0,1126251.0,"Moderna, Pfizer/BioNTech"


In [21]:
vaccinated_country_list = []
end = False
while not end:
    for global_index, row in vaccination_data.iterrows():
        if row['country'] not in vaccinated_country_list:
            first_country_date = row['date']
            vaccinated_country_list.append(row['country'])
            country_vaccine = row['vaccines']
            daterange = pd.date_range(start_date, first_country_date)[:-1]
            inserted_list = []
            for date in date_list:
                inserted_list.append([row['country'], date, 0, 0, country_vaccine])
            for index, value in enumerate(inserted_list):
                vaccination_data = Insert_row(index + global_index - 1, vaccination_data, value)
            break   
        if global_index == vaccination_data.index[-1]:
            end = True

In [22]:
vaccination_data.head(50)

Unnamed: 0,country,date,total_vaccinations,daily_vaccinations,vaccines
0,Argentina,2020-12-15,0.0,0.0,Sputnik V
1,Argentina,2020-12-16,0.0,0.0,Sputnik V
2,Argentina,2020-12-17,0.0,0.0,Sputnik V
3,Argentina,2020-12-18,0.0,0.0,Sputnik V
4,Argentina,2020-12-19,0.0,0.0,Sputnik V
5,Argentina,2020-12-20,0.0,0.0,Sputnik V
6,Argentina,2020-12-21,0.0,0.0,Sputnik V
7,Argentina,2020-12-22,0.0,0.0,Sputnik V
8,Argentina,2020-12-23,0.0,0.0,Sputnik V
9,Argentina,2020-12-24,0.0,0.0,Sputnik V


In [23]:
px.line(vaccination_data, x='date', y='total_vaccinations', color='country', title='Progress of disribution of vaccines')

In [24]:
px.line(vaccination_data, x='date', y='daily_vaccinations', color='country', title='Progress of disribution of vaccines')

In [25]:
px.scatter(vaccination_data, x="daily_vaccinations", y="daily_vaccinations_per_million", animation_frame='date', animation_group="country",
           size="total_vaccinations", color="vaccines", hover_name="country",
           log_x=True, log_y=True, size_max=55, range_x=[1000,3000000], range_y=[1, 35000])

ValueError: Value of 'y' is not the name of a column in 'data_frame'. Expected one of ['country', 'date', 'total_vaccinations', 'daily_vaccinations', 'vaccines'] but received: daily_vaccinations_per_million