<a href="https://colab.research.google.com/github/afrokyss/dvp-u3-constellations/blob/master/spread_covid_in_africa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Covid-19 spread predictions in Africa **Project**

![Texte alternatif…](https://www.iapb.org/wp-content/uploads/covid-19-french-900x511-1.jpg)

#### In this project we will try to create a model to predict the diffusion of `Covid-19 in Africa`. The collected data and predictions will be integrated into a web application.

### The data used will come from the `Oxford University` website [ourworldindata.org](https://github.com/owid/covid-19-data/blob/master/public/data/owid-covid-data.csv) as the `main dataframe` and from [Johns Hopkins University](https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv) for the `recovered cases`.

### We plan to eventually include data sets from other sources in order to add features. 


### Preprocessing and datasets transformation

In [None]:
# import the usual libraries

import pandas as pd
import datetime 


# import visualization libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import folium 
from folium import plugins



# manipulating the default plot size
plt.rcParams['figure.figsize']=10, 12

# disable warning
import warnings
warnings.filterwarnings('ignore')



### Load datasets

In [None]:
# Remote CSV sources: the OWID file is the main Covid-19 dataframe, the
# CSSE file holds the global "recovered" time series.
url = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'
url_recovered = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

# Download both files (main dataset first, then recovered cases).
corona_word_df, df_recovered = (pd.read_csv(source) for source in (url, url_recovered))



I have already created a dataset listing the 5 regions of Africa, built from the `Oxford` dataset:
'/content/data/african_regions.csv'

In [None]:
# Load the African regions table prepared beforehand.
# Presumably it has one row per country with at least 'location' and 'region'
# columns (both are used further down) — TODO confirm against the file.
df_regions = pd.read_csv('/content/data/african_regions.csv')

## preprocess 1

In [None]:
# preview the first rows of the raw worldwide dataset
corona_word_df.head()

In [None]:
# Keep only the rows whose continent is Africa, then attach the region of each
# country. The join is an inner join on 'location', so a country that is
# missing from df_regions is dropped.
is_african = corona_word_df['continent'].eq('Africa')
africa_covid_df = corona_word_df.loc[is_african].merge(df_regions, how='inner', on=['location'])

#reorder new column function
def reorder_col(col_1, col_2, df):
    """Return ``df`` with column ``col_1`` placed immediately after ``col_2``.

    The relative order of all other columns is preserved and the input
    dataframe itself is not modified.

    Args:
        col_1: Name of the column to move.
        col_2: Name of the column that ``col_1`` must follow.
        df: Dataframe containing both columns.

    Returns:
        A dataframe with the same columns as ``df`` in the new order.

    Raises:
        ValueError: If ``col_2`` is not one of the remaining columns.
        KeyError: If ``col_1`` is not a column of ``df``.
    """
    # Every column except the one being moved, in their current order.
    remaining = [name for name in df.columns if name != col_1]
    # Position right after the anchor column.
    split_at = remaining.index(col_2) + 1
    new_order = [*remaining[:split_at], col_1, *remaining[split_at:]]
    return df[new_order]

# Put 'region' right after 'continent', then remove 'continent' itself:
# 'region' ends up in the slot 'continent' used to occupy.
africa_covid_df = (
    reorder_col('region', 'continent', africa_covid_df)
    .drop(columns='continent')
)




In [None]:
# order the rows by country, then by date within each country
africa_covid_df = africa_covid_df.sort_values(['location', 'date'])

In [None]:

#drop useless date rows to start just before the first case in africa
# The excluded days are every calendar day from 2019-12-31 through 2020-02-10,
# written as 'YYYY-MM-DD' strings to match the format of the 'date' column.
pre_outbreak_days = pd.date_range('2019-12-31', '2020-02-10', freq='D').strftime('%Y-%m-%d').tolist()
africa_covid_df = africa_covid_df[~africa_covid_df['date'].isin(pre_outbreak_days)]


#### I add the positive rate based on new_tests and new cases. For the moment this column is not very useful because a lot of data on testing in Africa is missing. It will be necessary to enrich the dataset as we go along.

In [None]:
#calculate the contamination rate with new_cases and new_tests
# positive_rate = new cases per new test, as a percentage rounded to 3 decimals
africa_covid_df['positive_rate'] = (
    africa_covid_df['new_cases']
    .div(africa_covid_df['new_tests'])
    .mul(100)
    .round(3)
)

In [None]:
# move the new 'positive_rate' column so that it sits right after 'new_deaths'
africa_covid_df = reorder_col('positive_rate', 'new_deaths', africa_covid_df)

In [None]:
#create csv file
# NOTE(review): per the pandas docs, index_label=False only omits the header
# field for the index; the index values are presumably still written as the
# first column — confirm whether index=False was intended.
africa_covid_df.to_csv('africa_covid_01.csv', index_label=False)

# Preprocess II

In [None]:
# let's inspect the recovered cases from the other dataframe
# (preview of the first rows, before any reshaping)

df_recovered.head()


In [None]:
# Tidy the recovered-cases table in one pass:
#   1. drop the useless 'Province/State' column,
#   2. rename 'Country/Region' to 'location' (the name used in the main dataset),
#   3. reshape so that every column other than location/Lat/Long becomes a
#      (date, recovered) row,
#   4. parse the date labels (month/day/2-digit-year) into datetimes.
df_recovered = (
    df_recovered
    .drop(columns=['Province/State'])
    .rename(columns={'Country/Region': 'location'})
    .melt(id_vars=['location', 'Lat', 'Long'], var_name='date', value_name='recovered')
    .assign(date=lambda long_df: pd.to_datetime(long_df['date'], format='%m/%d/%y'))
)



In [None]:
# preview the reshaped table (one row per location and date)
df_recovered.head()

##### Drop the useless date rows so the data starts just before the first case in Africa

In [None]:
#drop useless date rows to start just before the first case in africa
# 'date' holds real datetimes at this point, so the excluded days are built as
# datetimes too (every day from 2020-01-22 through 2020-02-10). Matching a
# datetime column against string literals relies on isin() implicitly casting
# the strings, which recent pandas releases deprecate.
early_days = pd.date_range('2020-01-22', '2020-02-10', freq='D')
df_recovered = df_recovered[~df_recovered['date'].isin(early_days)]

In [None]:
#remove non african countries
# Explicit list of the locations to discard; everything not named here is kept.
non_african_locations = [
    'Afghanistan', 'Albania', 'Andorra', 'Antigua and Barbuda', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
    'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Bhutan',
    'Bolivia', 'Bosnia and Herzegovina', 'Brazil', 'Brunei', 'Bulgaria',
    'Burma', 'Cambodia', 'Canada', 'Chile', 'China', 'Colombia', 'Costa Rica',
    'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Denmark', 'Diamond Princess',
    'Dominica', 'Dominican Republic', 'Ecuador', 'El Salvador', 'Estonia',
    'Fiji', 'Finland', 'France', 'Georgia', 'Germany', 'Greece', 'Grenada',
    'Guatemala', 'Guyana', 'Haiti', 'Holy See', 'Honduras', 'Hungary',
    'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel',
    'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kosovo', 'Kuwait',
    'Kyrgyzstan', 'Laos', 'Latvia', 'Lebanon', 'Liechtenstein', 'Lithuania',
    'Luxembourg', 'MS Zaandam', 'Malaysia', 'Maldives', 'Malta', 'Mexico',
    'Moldova', 'Monaco', 'Mongolia', 'Montenegro', 'Nepal', 'Netherlands',
    'New Zealand', 'Nicaragua', 'North Macedonia', 'Norway', 'Oman',
    'Pakistan', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru',
    'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia',
    'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Vincent and the Grenadines',
    'San Marino', 'Saudi Arabia', 'Serbia', 'Singapore', 'Slovakia',
    'Slovenia', 'South Korea', 'Spain', 'Sri Lanka', 'Suriname', 'Sweden',
    'Switzerland', 'Syria', 'Taiwan', 'Tajikistan', 'Thailand', 'Timor-Leste',
    'Trinidad and Tobago', 'Turkey', 'US', 'Ukraine', 'United Arab Emirates',
    'United Kingdom', 'Uruguay', 'Uzbekistan', 'Venezuela', 'Vietnam',
    'West Bank and Gaza', 'Yemen',
]

# Boolean mask: True for the rows whose location is NOT in the list above.
keep_row = ~df_recovered['location'].isin(non_african_locations)
df_recovered = df_recovered.loc[keep_row]


In [None]:
# preview the table after removing the listed non-African locations
df_recovered.head()

In [None]:
#rename the countries whose name differs from the one used in the other dataset
# key: name found in the recovered table -> value: name to use instead
country_aliases = {
    'Cabo Verde': 'Cape Verde',
    'Congo (Brazzaville)': 'Congo',
    'Congo (Kinshasa)': 'Democratic Republic of Congo',
    'Eswatini': 'Swaziland',
}
df_recovered['location'] = df_recovered['location'].replace(country_aliases)

In [None]:
#merge with african regions
# Inner join on 'location': rows whose location has no entry in df_regions
# are dropped, and the columns of df_regions are added to the others.
df_recovered = pd.merge(df_recovered, df_regions, how='inner', on=['location'])

In [None]:
#create new_recovered column
# new_recovered = change of 'recovered' from one row to the next within each
# location, with rows ordered by date (the first row of a location gets NaN).
# The values are aligned back onto df_recovered through the index.
#
# The former .filter([... 'Long' 'date' ...]) step was removed: the missing
# comma concatenated the two names into 'Longdate', so the filter silently
# dropped both columns. It had no effect on the result, since the diff only
# needs 'location' and 'recovered'.
df_recovered_all_new = (df_recovered
                        .sort_values(by=['date', 'location'])
                        .groupby('location')['recovered']
                        .diff()
                    )
df_recovered = df_recovered.assign(new_recovered = df_recovered_all_new)

#reorder col: put 'region' right after 'location'
df_recovered = reorder_col('region', 'location', df_recovered)

In [None]:
# preview the table with the 'region' and 'new_recovered' columns in place
df_recovered.head()

In [None]:
#create a csv
# NOTE(review): index_label=False only omits the header field for the index
# (pandas docs); the index values are presumably still written — confirm
# whether index=False was intended.
df_recovered.to_csv('africa_recovered.csv', index_label=False)

# Merge the datasets

In [None]:
# convert date to object format
# 'date' is used as a merge key with africa_covid_df below, so both sides need
# the same type (africa_covid_df['date'] is presumably still the plain string
# read from the CSV — confirm).
df_recovered['date'] = df_recovered['date'].astype(str)

In [None]:
#merge the datasets
# Left join on (location, region, date): every row of africa_covid_df is kept;
# rows without a match in df_recovered get NaN in the columns that come from it.
df_covid_merged = (africa_covid_df.merge(df_recovered, on = ['location','region','date'], how = 'left'))


In [None]:
#reorder cols
# Each entry is (columns to move, column they must follow). The moves are
# applied one after the other: Lat/Long go right after iso_code, then
# recovered/new_recovered go right after new_deaths.
for moved, anchor in (
    (['Lat', 'Long'], 'iso_code'),
    (['recovered', 'new_recovered'], 'new_deaths'),
):
    old_columns = [x for x in df_covid_merged.columns if x not in moved]
    insert_index = old_columns.index(anchor) + 1
    df_covid_merged = df_covid_merged[old_columns[:insert_index] + moved + old_columns[insert_index:]]

#### Let's add an active cases col

In [None]:
# let's finish this first part by adding active cases
# active cases = total cases - total deaths - recovered
# (the result is NaN wherever one of the three inputs is missing)

df_covid_merged['active_cases'] = (
    df_covid_merged['total_cases']
    .sub(df_covid_merged['total_deaths'])
    .sub(df_covid_merged['recovered'])
)

In [None]:
# reorder col: move 'active_cases' so that it sits right after 'date'
df_covid_merged = reorder_col('active_cases', 'date', df_covid_merged)

In [None]:
# preview the merged dataset with the reordered columns
df_covid_merged.head()

In [None]:
#convert date
# parse the 'date' column into datetimes; assign() returns a new dataframe
# with the converted column
df_covid_merged = df_covid_merged.assign(date = pd.to_datetime(df_covid_merged['date']))


In [None]:
# preview the merged dataset after the date conversion
df_covid_merged.head()

In [None]:
# write the final merged dataset to disk
# NOTE(review): index_label=False only omits the header field for the index
# (pandas docs); the index values are presumably still written — confirm
# whether index=False was intended.
df_covid_merged.to_csv('african_covid_spread.csv', index_label=False)