In [2]:
#Import Libraries 
import numpy as np
import pandas as pd

### 1. Loading the Dataset

In [3]:
df = pd.read_csv('owid-covid-data.csv')
df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-05,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
1,AFG,Asia,Afghanistan,2020-01-06,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
2,AFG,Asia,Afghanistan,2020-01-07,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
3,AFG,Asia,Afghanistan,2020-01-08,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
4,AFG,Asia,Afghanistan,2020-01-09,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,


In [6]:
columns_to_keep = [
    'location', 'date', 'total_cases', 'new_cases',
    'total_deaths', 'people_vaccinated', 'population'
]
df = df[columns_to_keep]

In [7]:
df.head()

Unnamed: 0,location,date,total_cases,new_cases,total_deaths,people_vaccinated,population
0,Afghanistan,2020-01-05,0.0,0.0,0.0,,41128772
1,Afghanistan,2020-01-06,0.0,0.0,0.0,,41128772
2,Afghanistan,2020-01-07,0.0,0.0,0.0,,41128772
3,Afghanistan,2020-01-08,0.0,0.0,0.0,,41128772
4,Afghanistan,2020-01-09,0.0,0.0,0.0,,41128772


### 2. Data Preprocessing

In [8]:
df.isna().sum()

location                  0
date                      0
total_cases           17631
new_cases             19276
total_deaths          17631
people_vaccinated    348303
population                0
dtype: int64

In [10]:
# Remove data for continents and world (keep countries only)
exclude = ['Africa', 'Asia', 'Europe', 'European Union', 'High income', 'World', 'North America', 'South America', 'Oceania']
df = df[~df['location'].isin(exclude)]

In [11]:
# Handle missing values
df.fillna(0, inplace=True)

# Convert date to datetime
df['date'] = pd.to_datetime(df['date'])

In [13]:
df.isna().sum()

location             0
date                 0
total_cases          0
new_cases            0
total_deaths         0
people_vaccinated    0
population           0
dtype: int64

### 3. Saving Cleaned Data

In [14]:
# Export cleaned data
df.to_csv('cleaned_covid_data.csv', index=False)

In [15]:
df.shape

(417680, 7)

In [2]:
import pandas as pd

# Load all datasets
national_df = pd.read_csv("case_time_series.csv")
states_df = pd.read_csv("states.csv")
state_daily_df = pd.read_csv("state_wise_daily.csv")
vaccine_df = pd.read_csv("cowin_vaccine_data_statewise.csv")
district_df = pd.read_csv("districts.csv")

# Merge by concatenating columns side by side
# (reset index to avoid misalignment)

# Step 1: Reset index for all
national_df = national_df.reset_index(drop=True)
states_df = states_df.reset_index(drop=True)
state_daily_df = state_daily_df.reset_index(drop=True)
vaccine_df = vaccine_df.reset_index(drop=True)
district_df = district_df.reset_index(drop=True)

# Step 2: Concatenate all columns horizontally
merged_df = pd.concat([national_df, states_df, state_daily_df, vaccine_df, district_df], axis=1)

# Preview
print(merged_df.head())

# Save merged file
merged_df.to_csv("covid_india.csv", index=False)

              Date    Date_YMD  Daily Confirmed  Total Confirmed  \
0  30 January 2020  2020-01-30              1.0              1.0   
1  31 January 2020  2020-01-31              0.0              1.0   
2  1 February 2020  2020-02-01              0.0              1.0   
3  2 February 2020  2020-02-02              1.0              2.0   
4  3 February 2020  2020-02-03              1.0              3.0   

   Daily Recovered  Total Recovered  Daily Deceased  Total Deceased  \
0              0.0              0.0             0.0             0.0   
1              0.0              0.0             0.0             0.0   
2              0.0              0.0             0.0             0.0   
3              0.0              0.0             0.0             0.0   
4              0.0              0.0             0.0             0.0   

         Date   State  ...  Female (Individuals Vaccinated)  \
0  2020-01-30   India  ...                              0.0   
1  2020-01-30  Kerala  ...            