In [4]:
import pandas as pd

In [12]:
df = pd.read_csv("covid_19_country_wise_latest.csv")
df

Unnamed: 0.1,Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,0,Afghanistan,36263,1269,25198,9796,106.0,10.0,18,3.50,69.49,5.04,35526,737,2.07,Eastern Mediterranean
1,1,Albania,4880,144,2745,1991,117.0,6.0,63,2.95,56.25,5.25,4171,709,17.00,Europe
2,2,Algeria,27973,1163,18837,7973,616.0,8.0,749,4.16,67.34,6.17,23691,4282,18.07,Africa
3,3,Andorra,907,52,803,52,10.0,0.0,0,5.73,88.53,6.48,884,23,2.60,Europe
4,4,Angola,950,41,242,667,18.0,1.0,0,4.32,25.47,16.94,749,201,26.84,Africa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,182,West Bank and Gaza,10621,78,3752,6791,152.0,2.0,0,0.73,35.33,2.08,8916,1705,19.12,Eastern Mediterranean
183,183,Western Sahara,10,1,8,1,0.0,0.0,0,10.00,80.00,12.50,10,0,0.00,Africa
184,184,Yemen,1691,483,833,375,10.0,4.0,36,28.56,49.26,57.98,1619,72,4.45,Eastern Mediterranean
185,185,Zambia,4552,140,2815,1597,71.0,1.0,465,3.08,61.84,4.97,3326,1226,36.86,Africa


In [13]:
# Display first 5 rows
print("First 5 rows:")
print(df.head())

First 5 rows:
   Unnamed: 0 Country/Region  Confirmed Deaths  Recovered  Active  New cases  \
0           0    Afghanistan      36263   1269      25198    9796      106.0   
1           1        Albania       4880    144       2745    1991      117.0   
2           2        Algeria      27973   1163      18837    7973      616.0   
3           3        Andorra        907     52        803      52       10.0   
4           4         Angola        950     41        242     667       18.0   

   New deaths  New recovered  Deaths / 100 Cases  Recovered / 100 Cases  \
0        10.0             18                3.50                  69.49   
1         6.0             63                2.95                  56.25   
2         8.0            749                4.16                  67.34   
3         0.0              0                5.73                  88.53   
4         1.0              0                4.32                  25.47   

   Deaths / 100 Recovered  Confirmed last week  1 week

In [14]:
# Set index
df = df.set_index(df.columns[0])
df.index.name = "index"

In [18]:
# Function to replace whitespace with underscore in column names
def clean_column_names(dataframe):
    dataframe.columns = dataframe.columns.str.replace(" ", "_")
    return dataframe

df = clean_column_names(df)

In [19]:
# Check basic information
print("\nDataset Info:")
print(df.info())
print("\nMissing Values:")
print(df.isnull().sum())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 187 entries, 0 to 186
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Country/Region          187 non-null    object 
 1   Confirmed               187 non-null    int64  
 2   Deaths                  187 non-null    object 
 3   Recovered               187 non-null    int64  
 4   Active                  187 non-null    int64  
 5   New_cases               177 non-null    float64
 6   New_deaths              177 non-null    float64
 7   New_recovered           187 non-null    int64  
 8   Deaths_/_100_Cases      187 non-null    float64
 9   Recovered_/_100_Cases   187 non-null    float64
 10  Deaths_/_100_Recovered  187 non-null    float64
 11  Confirmed_last_week     187 non-null    int64  
 12  1_week_change           187 non-null    int64  
 13  1_week_%_increase       187 non-null    float64
 14  WHO_Region              187 non-

In [44]:
df.fillna(df.select_dtypes(include=[np.number]).mean(), inplace=True)


In [22]:

# Count unique countries
num_countries = df.shape[0]
print(f"\nNumber of unique countries: {num_countries}")


Number of unique countries: 187


In [24]:
# Check for duplicate countries and remove if needed
df = df[~df.index.duplicated(keep='first')]

In [25]:
# Calculate mean, median, and standard deviation of total cases
print("\nStatistics for Total Cases:")
print(f"Mean: {df['Confirmed'].mean()}")
print(f"Median: {df['Confirmed'].median()}")
print(f"Standard Deviation: {df['Confirmed'].std()}")


Statistics for Total Cases:
Mean: 88130.935828877
Median: 5059.0
Standard Deviation: 383318.66383061546


In [26]:
# Replace strings in Deaths column with mean
df['Deaths'] = pd.to_numeric(df['Deaths'], errors='coerce')
df['Deaths'].fillna(df['Deaths'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Deaths'].fillna(df['Deaths'].mean(), inplace=True)


In [28]:
# Change datatype of Deaths column
df['Deaths'] = df['Deaths'].astype(int)

In [29]:
# Total Deaths and Recoveries worldwide
total_deaths = df['Deaths'].sum()
total_recovered = df['Recovered'].sum()
print(f"\nTotal Deaths: {total_deaths}")
print(f"Total Recovered: {total_recovered}")


Total Deaths: 657516
Total Recovered: 9468087


In [30]:
# Countries with more than 1 million cases
high_case_countries = df[df['Confirmed'] > 1_000_000]
print(f"\nCountries with more than 1M cases: {high_case_countries.index.tolist()}")



Countries with more than 1M cases: [23, 79, 173]


In [31]:
# Countries with recovery rate > 95%
df['Recovery_Rate'] = (df['Recovered'] / df['Confirmed']) * 100
high_recovery_countries = df[df['Recovery_Rate'] > 95]
print(f"\nCountries with recovery rate above 95%: {high_recovery_countries.index.tolist()}")


Countries with recovery rate above 95%: [24, 48, 49, 69, 75, 78, 105, 110, 121, 126, 136, 164]


In [32]:
# Drop WHO Region and Confirmed columns
df.drop(columns=['WHO_Region', 'Confirmed'], inplace=True)

In [33]:
# Country with max deaths
max_death_country = df['Deaths'].idxmax()
print(f"\nCountry with maximum deaths: {max_death_country}")



Country with maximum deaths: 173


In [34]:
# Sort countries by deaths
df_sorted = df.sort_values(by='Deaths', ascending=False)
print("\nTop 5 countries by Deaths:")
print(df_sorted.head())


Top 5 countries by Deaths:
       Country/Region  Deaths  Recovered   Active  New_cases  New_deaths  \
index                                                                      
173                US  148011    1325804  2816444    56336.0      1076.0   
23             Brazil   87618    1846641   508116    23284.0       614.0   
177    United Kingdom   45844       1437   254427      688.0         7.0   
111            Mexico   44022     303810    47657     4973.0       342.0   
85              Italy   35112     198593    12581      168.0         5.0   

       New_recovered  Deaths_/_100_Cases  Recovered_/_100_Cases  \
index                                                             
173            27941                3.45                  30.90   
23             33728                3.59                  75.61   
177                3               15.19                   0.48   
111             8588               11.13                  76.82   
85               147               14

In [35]:
# Create Total_cases column
df['Total_Cases'] = df['Deaths'] + df['Recovered'] + df['Active']


In [36]:
# Calculate Death_Rate
df['Death_Rate'] = (df['Deaths'] / df['Total_Cases']) * 100

In [43]:
low_death_rate_countries = df[(df['Total_Cases'] > 1_000_000) & (df['Death_Rate'] < 2)]
print("\nCountries with increasing cases but low death rate:")
print(low_death_rate_countries.index.tolist())




Countries with increasing cases but low death rate:
[]


In [39]:
# Print head of specific columns
print("\nHead of Country and Death Rate columns:")
print(df[['Country/Region', 'Death_Rate']].head())


Head of Country and Death Rate columns:
      Country/Region  Death_Rate
index                           
0        Afghanistan    3.499435
1            Albania    2.950820
2            Algeria    4.157581
3            Andorra    5.733186
4             Angola    4.315789
