In [1]:
# Import necessary libraries
import gpxpy.geo #Get the haversine distance
import math
import pickle
import os
import pandas as pd
import folium 
import datetime #Convert to unix time
import time #Convert to unix time
import numpy as np#Do aritmetic operations on arrays
# matplotlib: used to plot graphs
import matplotlib
# matplotlib.use('nbagg') : matplotlib uses this protocall which makes plots more user intractive like zoom in and zoom out
matplotlib.use('nbagg')
import matplotlib.pylab as plt
import seaborn as sns#Plots
from matplotlib import rcParams#Size of plots 
import plotly as py
import cufflinks

### Reading Data

In [2]:
# Reading Data
covid_master=pd.read_csv('covid_19_data.csv')
covid_open=pd.read_csv('COVID19_open_line_list.csv')
covid_confirmed=pd.read_csv('time_series_covid_19_confirmed.csv')
covid_death= pd.read_csv('time_series_covid_19_deaths.csv')
covid_recovered=pd.read_csv('time_series_covid_19_recovered.csv')

## Covid 19 Master Data

The raw data is structured such a way that the last updated row by each country and state contains the latest value for that region and state. Suppose on '02/02/2020' the value of confirmed cases in China/Hubei was 'X' then on '03/02/2020' the value is 'X+y'. So the total confirmed cases will be the sum of all the latest values of each state of each region/country.

### Preprocessing

In [3]:
covid_master.head(3)
#data[data['ObservationDate']=='03/04/2020']

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0


In [4]:
# data columns
#covid_master=covid_master.drop(columns=['SNo'])
covid_master.columns

Index(['SNo', 'ObservationDate', 'Province/State', 'Country/Region',
       'Last Update', 'Confirmed', 'Deaths', 'Recovered'],
      dtype='object')

Checking for Null values

In [5]:
covid_master.isna().sum()

SNo                   0
ObservationDate       0
Province/State     1398
Country/Region        0
Last Update           0
Confirmed             0
Deaths                0
Recovered             0
dtype: int64

In [6]:
# We will replace Null states to a value 'NoState'
covid_master=covid_master.fillna('NoState')
covid_master.isna().sum()

SNo                0
ObservationDate    0
Province/State     0
Country/Region     0
Last Update        0
Confirmed          0
Deaths             0
Recovered          0
dtype: int64

In [7]:
# changing the data type
num_cols=['Confirmed', 'Deaths', 'Recovered']
for col in num_cols:
    temp=[int(i) for i in covid_master[col]]
    covid_master[col]=temp

In [8]:
covid_master.groupby(['Country/Region','Confirmed']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,SNo,Deaths,Recovered
Country/Region,Confirmed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Azerbaijan,1,2664,0,0
Afghanistan,1,38707,0,0
Algeria,1,15727,0,0
Algeria,3,3023,0,0
Algeria,5,3161,0,0
...,...,...,...,...
Vietnam,13,2110,0,2
Vietnam,14,1163,0,1
Vietnam,15,2543,0,12
Vietnam,16,53661,0,288


In [9]:
# Consolidating unique affected regions till date
unique_regions=[country for country in list(set(covid_master['Country/Region']))]

### Creating country wise cumulative data

In [10]:
# creating list of all affected states of the countries
states_per_regions=[]
for reg in unique_regions:
    states=[]
    for i in  range(len(covid_master)):
        if(covid_master.iloc[i]['Country/Region']==reg):
            states.append(covid_master.iloc[i]['Province/State'])
    states_per_regions.append(set(states))        

In [11]:
# Total Confirmed cases per conutry
total_confirmed=[]
for i in range(len(unique_regions)):
    count=0
    covid_temp=covid_master[covid_master['Country/Region']==unique_regions[i]]
    for state in states_per_regions[i]:
        #print(state)
        count+=covid_temp[covid_temp['Province/State']==state].sort_values('ObservationDate').iloc[-1]['Confirmed']
    total_confirmed.append(count)

In [12]:
# Total Deaths cases per conutry
total_deaths=[]
for i in range(len(unique_regions)):
    count=0
    covid_temp=covid_master[covid_master['Country/Region']==unique_regions[i]]
    for state in states_per_regions[i]:
        #print(state)
        count+=covid_temp[covid_temp['Province/State']==state].sort_values('ObservationDate').iloc[-1]['Deaths']
    total_deaths.append(count)

In [13]:
# Total Recovered cases per conutry
total_recovered=[]
for i in range(len(unique_regions)):
    count=0
    covid_temp=covid_master[covid_master['Country/Region']==unique_regions[i]]
    for state in states_per_regions[i]:
        #print(state)
        count+=covid_temp[covid_temp['Province/State']==state].sort_values('ObservationDate').iloc[-1]['Recovered']
    total_recovered.append(count)

In [14]:
covid_countrywise=pd.DataFrame(columns=['country','confirmed','deaths','recovered'],index=None)
unique_regions[53]='unconfirmed/Diamond princes Cruise'
total_confirmed[53]=1
covid_countrywise.country=unique_regions
covid_countrywise.confirmed=total_confirmed
covid_countrywise.deaths=total_deaths
covid_countrywise.recovered=total_recovered
covid_countrywise.to_csv('covid_countrywise.csv')
covid_countrywise.head()

Unnamed: 0,country,confirmed,deaths,recovered
0,Sweden,161,0,0
1,Greece,46,0,0
2,Lebanon,23,0,1
3,Vietnam,18,0,16
4,Monaco,1,0,0


### Exploratory Data Analysis

Here we will seek answers of 11 key questions to analyze the outbreak closely.
1. Which countries are mostly affected by the outbreak?                                     
2. As we know China is the source of the incident, What is the comparative situation between china and rest of the world?   
3. How the confirmed cases are distributed globally over various regions?                                        
4. How confirmed and death cases are distributed over worstly effected countries?                                     
5. Which countries have the worst death rates?                                                                   
6. Which countries have the best recovery rates?                                                                                                                                      
7. How the virus is spread across various regions?                                                                  
8. How many new incidents (confirmed/ recovered/ Death) are reported on a daily basis?                                    
9. How the virus is spreading outside ot china?                                                                                        
10. Which countries are completely recovered tilll date (All confirmed cases are recovred)?                                
11. Which are the regious where death cases are reported?                                                              


In [25]:
covid_countrywise.iloc[53]['Country']='unconfirmed/Diamond princes Cruise'
covid_countrywise.sort_values('confirmed',ascending=False).head(6)

Unnamed: 0,country,confirmed,deaths,recovered
46,Mainland China,80652,3070,55478
44,South Korea,7041,44,135
32,Italy,5883,233,589
47,Iran,5823,145,1669
85,France,949,11,12
13,Germany,804,0,18


#### Q1. Which countries are mostly hit by COVID?

In [26]:
import plotly.express as px
data=covid_countrywise.sort_values('confirmed',ascending=False)[0:5]

fig = px.bar(data, x='country', y='confirmed',
             hover_data=['country','confirmed'], color='confirmed',
             labels={'pop':'Confirmed Cases'}, height=400,title='Five worstly hit countries')
fig.update_layout(template='plotly_dark')
fig.show()

#### Q2. What is the comparative situation between china and rest of the world?

In [27]:
data=covid_countrywise.sort_values('confirmed',ascending=False)
row_confirmed=0
row_deaths=0
row_recovered=0
for i in range(1,len(covid_countrywise)):
    row_confirmed+=data.iloc[i]['confirmed']
    row_deaths+=data.iloc[i]['deaths']
    row_recovered+=data.iloc[i]['recovered']

In [28]:
china=[data.iloc[0]['confirmed'],data.iloc[0]['recovered'],data.iloc[0]['deaths']]
rest_of_the_world=[row_confirmed,row_recovered,row_deaths]

In [29]:
#https://plot.ly/python/bar-charts/
import plotly.graph_objects as go
data=covid_countrywise.sort_values('confirmed',ascending=False)[0:10]
country=data['country']

fig = go.Figure()
fig.update_layout(template='plotly_dark')
fig.add_trace(go.Bar(x=['china','Rest of the world'],
                y=[china[0],rest_of_the_world[0]],
                name='Confirmed',
                marker_color='rgb(102, 102, 255)'
                ))
fig.add_trace(go.Bar(x=['china','Rest of the world'],
                y=[china[1],rest_of_the_world[1]],
                name='Recovered',
                marker_color='rgb(0,255,153)'
                ))
fig.add_trace(go.Bar(x=['china','Rest of the world'],
                y=[china[2],rest_of_the_world[2]],
                name='Deaths',
                marker_color='rgb(255, 102, 102)'
                ))
fig.update_layout(
    title='Confirmed/Recovered/Deaths in China and Rest of the World',
    xaxis_tickfont_size=15,
    yaxis=dict(
        title='count',
        titlefont_size=12,
        tickfont_size=15,
       
    ),
    legend=dict(
        x=1,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='group',
    bargap=0.10, # gap between bars of adjacent location coordinates.
    bargroupgap=0.2 # gap between bars of the same location coordinate.
)
fig.show()

In [30]:
print('The death rate in china is:',str((china[2]/china[0])*100)+' %')
print('The rate of people already recovered in china is(till date):',str((china[1]/china[0])*100)+' %')
print('The death rate in rest of the world is:',str((rest_of_the_world[2]/rest_of_the_world[0])*100)+' %')
print('The rate of people already recovered in rest of the world is(till date):',str((rest_of_the_world[1]/rest_of_the_world[0])*100)+' %')
print('The current overall death rate is:',str(((china[2]+rest_of_the_world[2])/(china[0]+rest_of_the_world[0]))*100)+' %')

The death rate in china is: 3.8064772107325298 %
The rate of people already recovered in china is(till date): 68.78688687199326 %
The death rate in rest of the world is: 1.93517749020539 %
The rate of people already recovered in rest of the world is(till date): 11.425066286754522 %
The current overall death rate is: 3.3600513590317314 %


In [31]:
fig.update_layout(barmode='relative', title_text='China vs Rest of the World Relative Stats',bargap=0.2)
fig.show()

#### Q3. How the confirmed cases are distributed globally over various regions?

In [32]:
fig = px.pie(data, values='confirmed', names='country', title='Distribution of confirmed cases globally')
fig.show()

#### Q4. How confirmed and death cases are distributed over worstly effected countries?

In [33]:
#https://plot.ly/python/pie-charts/
from plotly.subplots import make_subplots
data=covid_countrywise.sort_values('confirmed',ascending=False)[0:5]
labels=list(data['country'])
# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=labels, values=list(data['confirmed']), name='country'),
              1, 1)
fig.add_trace(go.Pie(labels=labels, values=list(data['deaths']), name='country'),
              1, 2)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4, hoverinfo="label+percent+name")

fig.update_layout(
    title_text="Distribution of confirmed and death cases across worstly hit countries",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='confirm', x=0.18, y=0.5, font_size=20, showarrow=False),
                 dict(text='Deaths', x=0.82, y=0.5, font_size=20, showarrow=False)])
fig.show()

#### Q5.  Which countries have the worst death rates?

In [34]:
data=covid_countrywise.sort_values('confirmed',ascending=False)
death_confirm_ratio=((data.deaths)/(data.confirmed))*100
data['death_rates']=death_confirm_ratio
data.sort_values('death_rates',ascending=False).head(5)

Unnamed: 0,country,confirmed,deaths,recovered,death_rates
53,unconfirmed/Diamond princes Cruise,1,2,21,200.0
96,Philippines,6,1,1,16.666667
37,Iraq,54,4,0,7.407407
15,San Marino,23,1,0,4.347826
32,Italy,5883,233,589,3.960564


In [75]:
fig = px.bar(data.sort_values('death_rates',ascending=False).head(10), x='country', y='death_rates',
             hover_data=['country','death_rates'], color='country',
             labels={'pop':'Confirmed Cases'}, height=400,title='Countries with worst death rates')
fig.update_layout(template='plotly_dark')
fig.show()

#### Q6. Which countries have the best recovery rates?

In [76]:
data=covid_countrywise.sort_values('confirmed',ascending=False)
recovered_confirm_ratio=((data.recovered)/(data.confirmed))*100
data['recovery_rates']=recovered_confirm_ratio
data.sort_values('recovery_rates',ascending=False).head(5)

Unnamed: 0,country,confirmed,deaths,recovered,recovery_rates
50,Cambodia,1,0,1,100.0
35,Macau,10,0,10,100.0
19,Nepal,1,0,1,100.0
59,Sri Lanka,1,0,1,100.0
85,Vietnam,18,0,16,88.888889


In [80]:
fig = px.bar(data.sort_values('recovery_rates',ascending=False).head(20), x='country', y='recovery_rates',
             hover_data=['country','recovery_rates'], color='country',
             labels={'pop':'Confirmed Cases'}, height=400,title='Countries with best recovery rates')
fig.update_layout(template='plotly_dark')
fig.show()

## Analysis of the outbreak over time

#### Q6. How the virus is spreading over time?

In [260]:
covid_timeseries = covid_master.groupby('ObservationDate')['Confirmed', 'Deaths', 'Recovered'].sum()
covid_timeseries=covid_timeseries.reset_index().sort_values('ObservationDate')
covid_timeseries.head()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,ObservationDate,Confirmed,Deaths,Recovered
0,01/22/2020,555,17,28
1,01/23/2020,653,18,30
2,01/24/2020,941,26,36
3,01/25/2020,1438,42,39
4,01/26/2020,2118,56,52


In [44]:
#https://plot.ly/python
x=[i.split('/')[0]+'/'+i.split('/')[1] for i in covid_timeseries.ObservationDate ]
fig = go.Figure()
fig.update_layout(template='plotly_dark')
fig.add_trace(go.Scatter(x=x, 
                         y=covid_timeseries['Confirmed'],
                         mode='lines+markers',
                         name='Confirmed',
                         line=dict(color='rgb(102, 102, 255)', width=2)))
fig.add_trace(go.Scatter(x=x, 
                         y=covid_timeseries['Deaths'],
                         mode='lines+markers',
                         name='Deaths',
                         line=dict(color='rgb(255, 102, 102)', width=2)))
fig.add_trace(go.Scatter(x=x, 
                         y=covid_timeseries['Recovered'],
                         mode='lines+markers',
                         name='Recovered',
                         line=dict(color='rgb(0,255,153)', width=2)))
fig.update_layout(
    title = 'Spread of COVID-19 over time',
    xaxis_tickformat = '%d %B (%a)<br>%Y'
)
fig.show()

#### Q7. How the virus is spreading outside ot china?

In [177]:
covid_timeseries = covid_master.groupby(['ObservationDate','Country/Region'])['Confirmed', 'Deaths', 'Recovered'].sum()
covid_timeseries=covid_timeseries.reset_index().sort_values('ObservationDate')
data= covid_timeseries[covid_timeseries['Country/Region']!='Mainland China']
data.head()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,ObservationDate,Country/Region,Confirmed,Deaths,Recovered
0,01/22/2020,Hong Kong,0,0,0
1,01/22/2020,Japan,2,0,0
2,01/22/2020,Macau,1,0,0
4,01/22/2020,South Korea,1,0,0
5,01/22/2020,Taiwan,1,0,0


In [178]:
fig=go.Figure()
x=[i.split('/')[1]+'/'+i.split('/')[0] for i in data.ObservationDate ]
data['ObservationDate']=x
fig.update_layout(template='plotly_dark')
fig = px.line(data, x="ObservationDate", y="Confirmed", color="Country/Region",
              line_group="Country/Region", hover_name="Country/Region")
fig.update_layout(template='plotly_dark',title_text='spreading of COVID-19 outside China')
fig.show()

#### Q8. How many new incidents (confirmed/ recovered/ Death) are reported on a daily basis?

In [286]:
Newly_reported=[555]
New_deaths=[17]
New_recovered=[28]
covid_confirmed=covid_timeseries.Confirmed
covid_death=covid_timeseries.Deaths
covid_recovered=covid_timeseries.Recovered
for i in range(1,len(covid_confirmed)):
    Newly_reported.append(covid_confirmed[i]-covid_confirmed[i-1])
    New_deaths.append(covid_death[i]-covid_death[i-1])
    New_recovered.append(covid_recovered[i]-covid_recovered[i-1])
covid_timeseries['Newly Confirmed']=Newly_reported 
covid_timeseries['New Death']=New_deaths
covid_timeseries['New Recovered']=New_recovered
covid_timeseries.head()

Unnamed: 0,ObservationDate,Confirmed,Deaths,Recovered,New,Newly Confirmed,New Death,New Recovered
0,22/01,555,17,28,555,555,17,28
1,23/01,653,18,30,98,98,1,2
2,24/01,941,26,36,288,288,8,6
3,25/01,1438,42,39,497,497,16,3
4,26/01,2118,56,52,680,680,14,13


In [288]:
fig=go.Figure()
x=[i.split('/')[0]+'/'+i.split('/')[1] for i in covid_timeseries.ObservationDate ]
fig.add_trace(go.Scatter(x=x, 
                         y=covid_timeseries['Newly Confirmed'],
                         mode='lines',
                         name='New Confirmed Incident',
                         line=dict(color='rgb(102, 102, 255)', width=2)))
fig.add_trace(go.Scatter(x=x,y=covid_timeseries['New Death'],name='New Death Incident',
                        mode='lines',line=dict(color='rgb(255, 102, 102)', width=2)))

fig.add_trace(go.Scatter(x=x,y=covid_timeseries['New Recovered'],name='New Recovery Incident',
             mode='lines',line=dict(color='rgb(0,255,153)', width=2)))
fig.update_layout(
    title = 'New Incident Reported/Recovered/Death per Day',
    xaxis_tickformat = '%d %B (%a)<br>%Y',template='plotly_dark'
)
fig.show()

### Analysis of spread over regions

In [216]:
covid_country=[con.lower() for con in covid_countrywise.country]
covid_country[23]='china'
covid_countrywise.country=covid_country

In [220]:
coordinates=pd.read_csv('world_coordinates.csv')
coordinates=coordinates.rename(columns={'Country':'country'})
cords_country=[con.lower() for con in coordinates.country]
coordinates.country=cords_country
world_data=pd.merge(covid_countrywise,coordinates,on='country')

#### Q9. How the virus is spread across various regions?


In [232]:
# We will create a world map with circles in affected regions
# We will choose the circle radius based on the confirmed ratio
total_confirmed=sum(i for i in covid_countrywise['confirmed'])
total_confirmed_countrywise=world_data.confirmed
percentage_confirmed_per_country=(total_confirmed_countrywise/total_confirmed)*100
for i in range(len(percentage_confirmed_per_country)):
    if(percentage_confirmed_per_country[i]<5):
        percentage_confirmed_per_country[i]=5
    elif(percentage_confirmed_per_country[i]>5 and percentage_confirmed_per_country[i]<25):
        percentage_confirmed_per_country[i]=10
    elif(percentage_confirmed_per_country[i]>25 and percentage_confirmed_per_country[i]<50):
        percentage_confirmed_per_country[i]=15
    else:
         percentage_confirmed_per_country[i]=20
world_data['radius']=percentage_confirmed_per_country

In [233]:
# create map and display it
# How to create map using Folium
#https://python-visualization.github.io/folium/modules.html
# credit: https://www.kaggle.com/parulpandey/wuhan-coronavirus-a-geographical-analysis/data
world_map = folium.Map(location=[30, 0], zoom_start=1.5,tiles='Stamen Toner')
for lat, lon, value, name,confirm_ratio in zip(world_data['latitude'], world_data['longitude'], world_data['confirmed'], world_data['country'],world_data['radius']):
    folium.CircleMarker([lat, lon],
                        radius=confirm_ratio,
                        popup = ('<strong>Country</strong>: ' + str(name).capitalize() + '<br>'
                                '<strong>Confirmed Cases</strong>: ' + str(value) + '<br>'),
                        color='rgb(102, 102, 255)',
                        
                        fill_color='rgb(102, 102, 255)',
                        fill_opacity=0.7 ).add_to(world_map)

In [234]:
world_map

#### Q10. Which are the regious where death cases are reported?

In [256]:
world_map_deaths = folium.Map(location=[30, 0], zoom_start=1.5,tiles='Stamen Toner')
world_data_deaths=world_data[world_data['deaths']>0]
death_confirm_ratio=((world_data.deaths)/(world_data.confirmed))*100
world_data_deaths['death_rates']=death_confirm_ratio
for lat, lon, value, name,rad in zip(world_data_deaths['latitude'], world_data_deaths['longitude'], world_data_deaths['deaths'], world_data_deaths['country'],world_data_deaths['death_rates']):
    folium.CircleMarker([lat, lon],
                        radius=10,
                        popup = ('<strong>Country</strong>: ' + str(name).capitalize() + '<br>'
                                '<strong>Death Cases</strong>: ' + str(value) + '<br>'),
                        color='red',
                        
                        fill_color='red',
                        fill_opacity=0.7 ).add_to(world_map_deaths)

In [258]:
world_map_deaths

#### Q11. Which countries are completely recovered tilll date (All confirmed cases are recovred)?


In [253]:
world_map_recovered = folium.Map(location=[30, 0], zoom_start=1.5,tiles='Stamen Toner')
world_data_totaly_recovered=world_data[world_data['confirmed']==world_data['recovered']]
for lat, lon, value, name in zip(world_data_totaly_recovered['latitude'], world_data_totaly_recovered['longitude'], world_data_totaly_recovered['recovered'], world_data_totaly_recovered['country']):
    folium.CircleMarker([lat, lon],
                        radius=10,
                        popup = ('<strong>Country</strong>: ' + str(name).capitalize() + '<br>'
                                '<strong>Recovered</strong>: ' + str(value) + '<br>'),
                        color='green',
                        
                        fill_color='green',
                        fill_opacity=0.7 ).add_to(world_map_recovered)

In [254]:
world_map_recovered