# **Covid Data Processing**
#### Kelompok 9 Rekayasa Data
---

# **Data Extracting**


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Download the CSSEGISandData COVID-19 daily reports for 22-29 November 2022
# and stack them into a single DataFrame named `df`.
REPORTS_BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/'

# Collect the daily frames in a list and concatenate once: DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0, and appending inside the loop
# re-copied the accumulated rows on every iteration.
daily_frames = []
for i in range(22, 30):
  # Report files are named MM-DD-YYYY.csv; :02d zero-pads the day number.
  url = f'{REPORTS_BASE_URL}11-{i:02d}-2022.csv'
  daily_frames.append(pd.read_csv(url))

# As with the former append chain, every file keeps its own row index, so
# index labels repeat across days in the combined frame.
df = pd.concat(daily_frames)

print('Append success')

Append success


In [2]:
# Order the stacked reports by country, then province, then Last_Update,
# so that the rows of each country/province sit next to each other.
df = df.sort_values(
    by=['Country_Region', 'Province_State', 'Last_Update'],
)
df

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
0,,,,Afghanistan,2022-11-23 04:21:06,33.939110,67.709953,205324,7833,,,Afghanistan,527.440920,3.814946
0,,,,Afghanistan,2022-11-24 04:21:09,33.939110,67.709953,205391,7833,,,Afghanistan,527.613031,3.813702
0,,,,Afghanistan,2022-11-25 04:21:06,33.939110,67.709953,205506,7833,,,Afghanistan,527.908446,3.811568
0,,,,Afghanistan,2022-11-26 04:21:11,33.939110,67.709953,205541,7833,,,Afghanistan,527.998355,3.810919
0,,,,Afghanistan,2022-11-27 04:21:05,33.939110,67.709953,205612,7833,,,Afghanistan,528.180741,3.809603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4015,,,,Zimbabwe,2022-11-26 04:21:11,-19.015438,29.154857,257893,5606,,,Zimbabwe,1735.142748,2.173770
4015,,,,Zimbabwe,2022-11-27 04:21:05,-19.015438,29.154857,257893,5606,,,Zimbabwe,1735.142748,2.173770
4015,,,,Zimbabwe,2022-11-28 04:21:02,-19.015438,29.154857,257893,5606,,,Zimbabwe,1735.142748,2.173770
4015,,,,Zimbabwe,2022-11-29 04:21:05,-19.015438,29.154857,257893,5606,,,Zimbabwe,1735.142748,2.173770


In [3]:
# We only need five attributes: Last_Update, Province_State, Country_Region,
# Confirmed and Deaths, so we build a new dataframe (df2) that holds just those.

# Reduce the Last_Update timestamp to a 'YYYY-MM-DD' date string.
# Note that this overwrites the column in `df` itself, not only in df2.
df["Last_Update"] = (
    pd.to_datetime(df["Last_Update"])
    .dt.strftime('%Y-%m-%d')
)

# .copy() makes df2 an independent frame, so later column assignments on df2
# do not write back into `df`.
df2 = df.loc[:, ['Last_Update', 'Province_State', 'Country_Region', 'Confirmed', 'Deaths']].copy()
df2

Unnamed: 0,Last_Update,Province_State,Country_Region,Confirmed,Deaths
0,2022-11-23,,Afghanistan,205324,7833
0,2022-11-24,,Afghanistan,205391,7833
0,2022-11-25,,Afghanistan,205506,7833
0,2022-11-26,,Afghanistan,205541,7833
0,2022-11-27,,Afghanistan,205612,7833
...,...,...,...,...,...
4015,2022-11-26,,Zimbabwe,257893,5606
4015,2022-11-27,,Zimbabwe,257893,5606
4015,2022-11-28,,Zimbabwe,257893,5606
4015,2022-11-29,,Zimbabwe,257893,5606


# **Data Cleaning**

### **Handle missing values**

In [4]:
# Summarise df2 (row count, per-column non-null counts and dtypes) to spot
# columns with missing values before cleaning.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32128 entries, 0 to 4015
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Last_Update     32128 non-null  object
 1   Province_State  30696 non-null  object
 2   Country_Region  32128 non-null  object
 3   Confirmed       32128 non-null  int64 
 4   Deaths          32128 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.5+ MB


In [5]:
# Count the missing values in each column of df2.
df2.isnull().sum()

Last_Update          0
Province_State    1432
Country_Region       0
Confirmed            0
Deaths               0
dtype: int64

In [6]:
# Province_State is the attribute that has missing values.
# It is a text column, so the gaps are filled with the placeholder
# label 'Unknown'; no other column is touched.

df2 = df2.fillna({'Province_State': 'Unknown'})

In [7]:
# Re-check df2 after filling: every column should now report the full
# non-null count.
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32128 entries, 0 to 4015
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Last_Update     32128 non-null  object
 1   Province_State  32128 non-null  object
 2   Country_Region  32128 non-null  object
 3   Confirmed       32128 non-null  int64 
 4   Deaths          32128 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 1.5+ MB


# **Data Transform**

In [8]:
# New columns holding the confirmed cases and deaths reported per day.
# Each row is differenced against the previous row of the SAME
# Country_Region / Province_State, so the first row of every region is NaN
# instead of the gap to the preceding region's cumulative total.
# This is one vectorized pass; the former `for` loop recomputed the very same
# columns len(df2) times without ever using its loop variable.
# NOTE(review): relies on df2 being ordered by Last_Update within each region.
# df2 carries no Admin2 column, so rows that share a country/province on the
# same date are still differenced against each other - TODO confirm the
# intended granularity.
region_groups = df2.groupby(['Country_Region', 'Province_State'], sort=False)
df2['Confirmed_perday'] = region_groups['Confirmed'].diff()
df2['Deaths_perday'] = region_groups['Deaths'].diff()

In [9]:
df2

Unnamed: 0,Last_Update,Province_State,Country_Region,Confirmed,Deaths,Confirmed_perday,Deaths_perday
0,2022-11-23,Unknown,Afghanistan,205324,7833,,
0,2022-11-24,Unknown,Afghanistan,205391,7833,67.0,0.0
0,2022-11-25,Unknown,Afghanistan,205506,7833,115.0,0.0
0,2022-11-26,Unknown,Afghanistan,205541,7833,35.0,0.0
0,2022-11-27,Unknown,Afghanistan,205612,7833,71.0,0.0
...,...,...,...,...,...,...,...
4015,2022-11-26,Unknown,Zimbabwe,257893,5606,0.0,0.0
4015,2022-11-27,Unknown,Zimbabwe,257893,5606,0.0,0.0
4015,2022-11-28,Unknown,Zimbabwe,257893,5606,0.0,0.0
4015,2022-11-29,Unknown,Zimbabwe,257893,5606,0.0,0.0


# **Download Data**

In [10]:
from google.colab import files

# Write the processed table to a CSV file, then hand that file to the
# Colab browser download helper.
# NOTE(review): to_csv is called with its defaults, so df2's repeated row
# index is written as the first, unnamed CSV column - confirm that is wanted.
output_name = 'covid-19_cases_global.csv'
df2.to_csv(output_name)
files.download(output_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>