In [1]:
import numpy as np
import pandas as pd

### Importing Datasets

In [2]:
# Load the two raw CSV files from the notebook's working directory.
# NOTE(review): the summary file name has a space before ".csv"; it is kept
# exactly as written here -- confirm it matches the file on disk.
summary_data = pd.read_csv(
    filepath_or_buffer='worldometer_coronavirus_summary_data .csv'
)
daily_data = pd.read_csv(
    filepath_or_buffer='worldometer_coronavirus_daily_data.csv'
)

In [3]:
summary_data.head(5)

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,serious_or_critical,total_cases_per_1m_population,total_deaths_per_1m_population,total_tests,total_tests_per_1m_population,population
0,Afghanistan,Asia,179267,7690.0,162202.0,9375.0,1124.0,4420,190.0,951337.0,23455.0,40560636
1,Albania,Europe,275574,3497.0,271826.0,251.0,2.0,95954,1218.0,1817530.0,632857.0,2871945
2,Algeria,Africa,265816,6875.0,178371.0,80570.0,6.0,5865,152.0,230861.0,5093.0,45325517
3,Andorra,Europe,42156,153.0,41021.0,982.0,14.0,543983,1974.0,249838.0,3223924.0,77495
4,Angola,Africa,99194,1900.0,97149.0,145.0,,2853,55.0,1499795.0,43136.0,34769277


In [4]:
daily_data.head(5)

Unnamed: 0,date,country,cumulative_total_cases,daily_new_cases,active_cases,cumulative_total_deaths,daily_new_deaths
0,2020-2-15,Afghanistan,0.0,,0.0,0.0,
1,2020-2-16,Afghanistan,0.0,,0.0,0.0,
2,2020-2-17,Afghanistan,0.0,,0.0,0.0,
3,2020-2-18,Afghanistan,0.0,,0.0,0.0,
4,2020-2-19,Afghanistan,0.0,,0.0,0.0,


## Cleaning summary_data

### Checking and dropping unwanted columns

In [5]:
summary_data.columns

Index(['country', 'continent', 'total_confirmed', 'total_deaths',
       'total_recovered', 'active_cases', 'serious_or_critical',
       'total_cases_per_1m_population', 'total_deaths_per_1m_population',
       'total_tests', 'total_tests_per_1m_population', 'population'],
      dtype='object')

In [6]:
# Columns that are not carried forward into the cleaned summary table.
unused_summary_columns = [
    'serious_or_critical',
    'total_cases_per_1m_population',
    'total_deaths_per_1m_population',
    'total_tests',
    'total_tests_per_1m_population',
]
# Remove them from the frame in place (columns= is the explicit form of axis=1).
summary_data.drop(columns=unused_summary_columns, inplace=True)

In [7]:
summary_data.head(3)

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,population
0,Afghanistan,Asia,179267,7690.0,162202.0,9375.0,40560636
1,Albania,Europe,275574,3497.0,271826.0,251.0,2871945
2,Algeria,Africa,265816,6875.0,178371.0,80570.0,45325517


### Handling Null Values

In [8]:
summary_data.isnull().sum()

country             0
continent           0
total_confirmed     0
total_deaths        8
total_recovered    22
active_cases       22
population          0
dtype: int64

In [9]:
summary_data.shape

(226, 7)

In [10]:
# Discard, in place, every row that has a missing value in at least one column.
summary_data.dropna(axis=0, how='any', inplace=True)

### Unique values of columns 

In [11]:
# For each column, print its name, its distinct values, and a blank separator.
for column_name, column_values in summary_data.items():
    print(column_name, column_values.unique(), '\n', sep='\n')

country
['Afghanistan' 'Albania' 'Algeria' 'Andorra' 'Angola' 'Anguilla'
 'Antigua And Barbuda' 'Argentina' 'Armenia' 'Aruba' 'Australia' 'Austria'
 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh' 'Barbados' 'Belgium'
 'Belize' 'Benin' 'Bermuda' 'Bhutan' 'Bolivia' 'Botswana' 'Brazil'
 'Brunei Darussalam' 'Bulgaria' 'Burkina Faso' 'Cabo Verde' 'Cambodia'
 'Cameroon' 'Canada' 'Caribbean Netherlands' 'Cayman Islands'
 'Central African Republic' 'Chad' 'Channel Islands' 'Chile' 'China'
 'Colombia' 'Comoros' 'Congo' 'Cook Islands' 'Costa Rica' 'Cote D Ivoire'
 'Croatia' 'Cuba' 'Curacao' 'Cyprus' 'Czech Republic'
 'Democratic Republic Of The Congo' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Egypt' 'El Salvador' 'Equatorial Guinea' 'Eritrea'
 'Estonia' 'Ethiopia' 'Faeroe Islands' 'Fiji' 'France' 'French Guiana'
 'Gabon' 'Gambia' 'Georgia' 'Germany' 'Ghana' 'Gibraltar' 'Greece'
 'Greenland' 'Grenada' 'Guadeloupe' 'Guatemala' 'Guinea Bissau' 'Guinea'
 'Guyana' 'Haiti' 'Honduras' 'Hung

### Handling Data types of Columns

In [12]:
summary_data.dtypes

country             object
continent           object
total_confirmed      int64
total_deaths       float64
total_recovered    float64
active_cases       float64
population           int64
dtype: object

In [13]:
# Cast the float column to 64-bit integers with one vectorized conversion.
# astype converts the whole column at once and raises if a NaN is present.
summary_data['total_deaths'] = summary_data['total_deaths'].astype(np.int64)

In [14]:
# Cast the float column to 64-bit integers with one vectorized conversion.
# astype converts the whole column at once and raises if a NaN is present.
summary_data['total_recovered'] = summary_data['total_recovered'].astype(np.int64)

In [15]:
# Cast the float column to 64-bit integers with one vectorized conversion.
# astype converts the whole column at once and raises if a NaN is present.
summary_data['active_cases'] = summary_data['active_cases'].astype(np.int64)

In [16]:
summary_data.dtypes

country            object
continent          object
total_confirmed     int64
total_deaths        int64
total_recovered     int64
active_cases        int64
population          int64
dtype: object

In [17]:
summary_data.head(3)

Unnamed: 0,country,continent,total_confirmed,total_deaths,total_recovered,active_cases,population
0,Afghanistan,Asia,179267,7690,162202,9375,40560636
1,Albania,Europe,275574,3497,271826,251,2871945
2,Algeria,Africa,265816,6875,178371,80570,45325517


## Cleaning daily_data

In [18]:
daily_data.head(3)

Unnamed: 0,date,country,cumulative_total_cases,daily_new_cases,active_cases,cumulative_total_deaths,daily_new_deaths
0,2020-2-15,Afghanistan,0.0,,0.0,0.0,
1,2020-2-16,Afghanistan,0.0,,0.0,0.0,
2,2020-2-17,Afghanistan,0.0,,0.0,0.0,


### Checking and dropping unwanted columns

In [19]:
daily_data.columns

Index(['date', 'country', 'cumulative_total_cases', 'daily_new_cases',
       'active_cases', 'cumulative_total_deaths', 'daily_new_deaths'],
      dtype='object')

In [20]:
# Remove the cumulative death count from the daily table in place
# (columns= is the explicit form of axis=1).
daily_data.drop(columns=['cumulative_total_deaths'], inplace=True)

In [21]:
daily_data.head(5)

Unnamed: 0,date,country,cumulative_total_cases,daily_new_cases,active_cases,daily_new_deaths
0,2020-2-15,Afghanistan,0.0,,0.0,
1,2020-2-16,Afghanistan,0.0,,0.0,
2,2020-2-17,Afghanistan,0.0,,0.0,
3,2020-2-18,Afghanistan,0.0,,0.0,
4,2020-2-19,Afghanistan,0.0,,0.0,


### Checking and handling null values

In [22]:
daily_data.isnull().sum()

date                          0
country                       0
cumulative_total_cases        0
daily_new_cases           10458
active_cases              18040
daily_new_deaths          26937
dtype: int64

In [23]:
daily_data.shape

(184787, 6)

In [24]:
# Discard, in place, every row that has a missing value in at least one column.
daily_data.dropna(axis=0, how='any', inplace=True)

In [25]:
daily_data.isnull().sum()

date                      0
country                   0
cumulative_total_cases    0
daily_new_cases           0
active_cases              0
daily_new_deaths          0
dtype: int64

In [38]:
daily_data.shape

(142034, 6)

### Unique Values of columns

In [26]:
# For each column, print its name, its distinct values, and a blank separator.
for column_name, column_values in daily_data.items():
    print(column_name, column_values.unique(), '\n', sep='\n')

date
['2020-2-25' '2020-2-26' '2020-2-27' '2020-2-28' '2020-2-29' '2020-3-01'
 '2020-3-02' '2020-3-03' '2020-3-04' '2020-3-05' '2020-3-06' '2020-3-07'
 '2020-3-08' '2020-3-09' '2020-3-10' '2020-3-11' '2020-3-12' '2020-3-13'
 '2020-3-14' '2020-3-15' '2020-3-16' '2020-3-17' '2020-3-18' '2020-3-19'
 '2020-3-20' '2020-3-21' '2020-3-22' '2020-3-23' '2020-3-24' '2020-3-25'
 '2020-3-26' '2020-3-27' '2020-3-28' '2020-3-29' '2020-3-30' '2020-3-31'
 '2020-4-01' '2020-4-02' '2020-4-03' '2020-4-04' '2020-4-05' '2020-4-06'
 '2020-4-07' '2020-4-08' '2020-4-09' '2020-4-10' '2020-4-11' '2020-4-12'
 '2020-4-13' '2020-4-14' '2020-4-15' '2020-4-16' '2020-4-17' '2020-4-18'
 '2020-4-19' '2020-4-20' '2020-4-21' '2020-4-22' '2020-4-23' '2020-4-24'
 '2020-4-25' '2020-4-26' '2020-4-27' '2020-4-28' '2020-4-29' '2020-4-30'
 '2020-5-01' '2020-5-02' '2020-5-03' '2020-5-04' '2020-5-05' '2020-5-06'
 '2020-5-07' '2020-5-08' '2020-5-09' '2020-5-10' '2020-5-11' '2020-5-12'
 '2020-5-13' '2020-5-14' '2020-5-15' '2020-5-1

### Handling Datatypes of columns

In [27]:
daily_data.dtypes

date                       object
country                    object
cumulative_total_cases    float64
daily_new_cases           float64
active_cases              float64
daily_new_deaths          float64
dtype: object

In [28]:
# Cast the float column to 64-bit integers with one vectorized conversion.
# astype converts the whole column at once and raises if a NaN is present.
daily_data['cumulative_total_cases'] = daily_data['cumulative_total_cases'].astype(np.int64)

In [29]:
# Cast the float column to 64-bit integers with one vectorized conversion.
# astype converts the whole column at once and raises if a NaN is present.
daily_data['active_cases'] = daily_data['active_cases'].astype(np.int64)

In [34]:
# Cast the float column to 64-bit integers with one vectorized conversion.
# astype converts the whole column at once and raises if a NaN is present.
daily_data['daily_new_cases'] = daily_data['daily_new_cases'].astype(np.int64)

In [35]:
# Cast the float column to 64-bit integers with one vectorized conversion.
# astype converts the whole column at once and raises if a NaN is present.
daily_data['daily_new_deaths'] = daily_data['daily_new_deaths'].astype(np.int64)

In [36]:
daily_data.dtypes

date                      object
country                   object
cumulative_total_cases     int64
daily_new_cases            int64
active_cases               int64
daily_new_deaths           int64
dtype: object

In [31]:
daily_data.head(5)

Unnamed: 0,date,country,cumulative_total_cases,daily_new_cases,active_cases,daily_new_deaths
10,2020-2-25,Afghanistan,1,0.0,1,0.0
11,2020-2-26,Afghanistan,1,0.0,1,0.0
12,2020-2-27,Afghanistan,1,0.0,1,0.0
13,2020-2-28,Afghanistan,1,0.0,1,0.0
14,2020-2-29,Afghanistan,1,0.0,1,0.0


### Saving the cleaned data into csv

In [32]:
# Write the cleaned summary table to CSV; index=False leaves the row index out.
summary_data.to_csv(
    path_or_buf="clean_summary_data.csv",
    index=False,
)

In [37]:
# Write the cleaned daily table to CSV; index=False leaves the row index out.
daily_data.to_csv(
    path_or_buf="clean_daily_data2.csv",
    index=False,
)