In [1]:
import pandas as pd

In [2]:
import plotly.express as px

## Loading the data

In [3]:
# The file was saved together with its row index, which otherwise comes back as
# a spurious "Unnamed: 0" data column; read that first column as the index.
df = pd.read_csv("group10_macrotable.csv", index_col=0)


## Exploring the data

In [None]:
# Column names, dtypes and non-null counts of the loaded table
df.info()

In [21]:
# Peek at the first rows of the table
df.head()

Unnamed: 0,week,country_name,population,population_male,population_female,population_age_00_09,population_age_10_19,population_age_20_29,population_age_30_39,population_age_40_49,population_age_50_59,population_age_60_69,population_age_70_79,population_age_80_and_older,life_expectancy,new_confirmed,new_deceased,new_persons_fully_vaccinated,new_deceased_confirmed_ratio
0,2020-05-11/2020-05-17,Germany,82786787.0,39130978.0,40156503.0,7220827.0,7398601.0,9303309.0,10024505.0,9955575.0,12932635.0,9907511.0,7360047.0,5184471.0,,1.0,0.0,0.0,0.0
1,2020-05-11/2020-05-17,Germany,82786787.0,39130978.0,40156503.0,7220827.0,7398601.0,9303309.0,10024505.0,9955575.0,12932635.0,9907511.0,7360047.0,5184471.0,,1.0,0.0,0.0,0.0
2,2020-05-11/2020-05-17,Germany,82786787.0,39130978.0,40156503.0,7220827.0,7398601.0,9303309.0,10024505.0,9955575.0,12932635.0,9907511.0,7360047.0,5184471.0,,2.0,0.0,0.0,0.0
3,2020-05-11/2020-05-17,Germany,82786787.0,39130978.0,40156503.0,7220827.0,7398601.0,9303309.0,10024505.0,9955575.0,12932635.0,9907511.0,7360047.0,5184471.0,,10.0,0.0,0.0,0.0
4,2020-05-11/2020-05-17,Germany,82786787.0,39130978.0,40156503.0,7220827.0,7398601.0,9303309.0,10024505.0,9955575.0,12932635.0,9907511.0,7360047.0,5184471.0,,9.0,0.0,0.0,0.0


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the columns
df.describe()

The next cell shows the total number of COVID-19-related deaths per country over the whole time span of the dataset.

In [4]:
# Add up the reported deaths of all rows, separately for each country
df.groupby("country_name")["new_deceased"].sum()

country_name
Germany           120241.0
Italy                  0.0
Spain              33257.0
United States    1060040.0
Name: new_deceased, dtype: float64

## Graphs


1. What is the trend of new confirmed cases over time for each country?

In [5]:
# The table holds several rows per country and week (see df.head()), so add
# them up into one weekly total before plotting; otherwise the line passes
# through several points at the same x position.
weekly_confirmed = (
    df.groupby(["country_name", "week"], as_index=False)["new_confirmed"]
      .sum(min_count=1)  # a week without any reported value stays NaN, not 0
)

# The week labels are strings, so pin the axis to their sorted order
# (labels such as "2020-05-11/2020-05-17" sort chronologically) rather than
# relying on the order in which each country's weeks first appear.
week_order = sorted(weekly_confirmed["week"].unique())

fig = px.line(weekly_confirmed, x="week", y="new_confirmed", color="country_name",
              title="Weekly Trend of New Confirmed Cases by Country",
              labels={"week": "Week", "new_confirmed": "New Confirmed Cases"},
              category_orders={"week": week_order},
              template="plotly_white")

# Update x-axis to only show the first and last tick
fig.update_xaxes(
    tickangle=0,   # Align ticks horizontally
    tickvals=[df['week'].min(), df['week'].max()],  # Show only first and last week
    ticktext=[df['week'].min(), df['week'].max()]   # Labels for the ticks
)

fig.update_yaxes(title_text="New Confirmed Cases")

# Show the graph
fig.show()

2. What is the trend of new deceased cases over time for each country?

In [6]:
# The table holds several rows per country and week (see df.head()), so add
# them up into one weekly total before plotting; otherwise the line passes
# through several points at the same x position.
weekly_deceased = (
    df.groupby(["country_name", "week"], as_index=False)["new_deceased"]
      .sum(min_count=1)  # a week without any reported value stays NaN, not 0
)

# The week labels are strings, so pin the axis to their sorted order
# (labels such as "2020-05-11/2020-05-17" sort chronologically) rather than
# relying on the order in which each country's weeks first appear.
week_order = sorted(weekly_deceased["week"].unique())

fig = px.line(weekly_deceased, x="week", y="new_deceased", color="country_name",
              title="Total Deceased per week and Country",
              labels={"week": "Week", "new_deceased": "New Deceased Cases"},
              category_orders={"week": week_order},
              template="plotly_white")

# Only the first and last week are labelled. (The former tickangle=45 call was
# dropped: it was overridden by tickangle=0 right away and had no effect.)
fig.update_xaxes(
    tickangle=0,   # Align ticks horizontally
    tickvals=[df['week'].min(), df['week'].max()],  # Show only first and last week
    ticktext=[df['week'].min(), df['week'].max()]   # Labels for the ticks
)

fig.update_yaxes(title_text="Total Deceased")
fig.show()

3. What is the trend of the ratio of new deceased to new confirmed cases over time for each country?

In [7]:
# Calculate new deceased-confirmed ratio.
# A row with zero confirmed cases has no meaningful ratio: mask the zero
# denominator so the result is NaN rather than +/-inf, which would stretch the
# plot axis and distort the quantiles used for outlier removal.
confirmed_nonzero = df["new_confirmed"].where(df["new_confirmed"] != 0)
df["new_deceased_confirmed_ratio"] = df["new_deceased"] / confirmed_nonzero

In [9]:
# plotting the graph
# The table holds several rows per country and week (see df.head()). Build the
# weekly ratio from weekly totals (deaths of the week / cases of the week) so
# that each week contributes exactly one point per country.
weekly_totals = (
    df.groupby(["country_name", "week"], as_index=False)[["new_deceased", "new_confirmed"]]
      .sum(min_count=1)  # a week without any reported value stays NaN, not 0
)
weekly_cases = weekly_totals["new_confirmed"]
# Weeks without confirmed cases get NaN instead of a division by zero (inf).
weekly_totals["new_deceased_confirmed_ratio"] = (
    weekly_totals["new_deceased"] / weekly_cases.where(weekly_cases != 0)
)

# The week labels are strings, so pin the axis to their sorted order.
week_order = sorted(weekly_totals["week"].unique())

fig = px.line(weekly_totals, x="week", y="new_deceased_confirmed_ratio", color="country_name",
              title="Weekly Trend of New Deceased-Confirmed Ratio Cases by Country",
              labels={"week": "Week", "new_deceased_confirmed_ratio": "Deceased/Confirmed Cases"},
              category_orders={"week": week_order},
              template="plotly_white")

# Only the first and last week are labelled. (The former tickangle=45 call was
# dropped: it was overridden by tickangle=0 right away and had no effect.)
fig.update_xaxes(
    tickangle=0,   # Align ticks horizontally
    tickvals=[df['week'].min(), df['week'].max()],  # Show only first and last week
    ticktext=[df['week'].min(), df['week'].max()]   # Labels for the ticks
)

fig.show()



4. What is the trend of newly fully vaccinated people over time for the US?

In [10]:
# Keep only the rows of the United States
is_us = df['country_name'] == 'United States'
filtered_df = df[is_us]

# The table holds several rows per country and week (see df.head()), so add
# them up into one weekly total before plotting. groupby also returns the
# weeks in sorted order.
weekly_vaccinated = (
    filtered_df.groupby(["country_name", "week"], as_index=False)["new_persons_fully_vaccinated"]
               .sum(min_count=1)  # a week without any reported value stays NaN, not 0
)

# Plot the filtered data
fig = px.line(weekly_vaccinated,
              x="week",
              y="new_persons_fully_vaccinated",
              color="country_name",
              title="Weekly Trend of New People Vaccinated in the US ",
              labels={"week": "", "new_persons_fully_vaccinated": "New Vaccinated Cases"},
              template="plotly_white")


# Update x-axis to show only the first and last weeks
first_week = filtered_df['week'].min()
last_week = filtered_df['week'].max()
# Number the last tick from the data instead of a hard-coded "115".
# NOTE(review): this counts the distinct week labels present for the US; if
# weeks can be missing from the data, confirm this is the intended numbering.
week_count = filtered_df['week'].nunique()

fig.update_xaxes(
    tickangle=0,
    tickvals=[first_week, last_week],
    ticktext=[f'{first_week} - Week 1', f'{last_week} - Week {week_count}']
)

fig.show()

	

5. How does the ratio of new deceased to new confirmed cases vary across countries and weeks?

In [None]:
import plotly.express as px

# Step 1: Define a function to remove outliers using IQR
def remove_outliers_iqr(df, column):
    """Return the rows of ``df`` whose ``column`` value is not an IQR outlier.

    A value is kept when it lies within ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``
    (both ends inclusive), where Q1 and Q3 are the 25th and 75th percentiles
    of ``column``. Rows whose value is NaN are dropped as well, because NaN
    never falls inside the interval. The original index is preserved.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile  # interquartile range
    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread
    within_fences = values.between(lower_bound, upper_bound)
    return df[within_fences]

# Step 2: Remove outliers for 'new_deceased_confirmed_ratio' for each country.
# The groups are iterated explicitly instead of using groupby(...).apply(...):
# apply() on a frame that includes the grouping column is deprecated in recent
# pandas and newer versions no longer pass 'country_name' to the function,
# which would break the country filter and the plot below. Iterating keeps
# every column, the original index and the same (sorted-by-country) row order.
filtered_df = pd.concat(
    [
        remove_outliers_iqr(country_rows, 'new_deceased_confirmed_ratio')
        for _, country_rows in df.groupby('country_name')
    ]
)

# Step 3: Exclude 'Italy' from the DataFrame
filtered_df = filtered_df[filtered_df['country_name'] != 'Italy']

# Step 4: Re-plot the data
fig = px.box(filtered_df, x="country_name", y="new_deceased_confirmed_ratio",
             title="Distribution of Deceased-Confirmed Ratio by Country (Outliers Removed)",
             labels={"country_name": "Country", "new_deceased_confirmed_ratio": "Deceased-Confirmed Ratio"},
             template="plotly_white")
fig.update_traces(marker_color="blue")

fig.show()






6. What is the age distribution for each country?

In [25]:
#Age group by country

age_columns = ["population_age_00_09", "population_age_10_19", "population_age_20_29", 
               "population_age_30_39", "population_age_40_49", "population_age_50_59", 
               "population_age_60_69", "population_age_70_79", "population_age_80_and_older"]

# The age-group populations are repeated on every row of a country (see
# df.head()). Collapse them to a single row per country first, so each stacked
# bar consists of one segment per age group instead of one segment per data
# row. Averaging leaves the resulting shares unchanged.
age_by_country = df.groupby("country_name", as_index=False)[age_columns].mean()

df_melted = age_by_country.melt(id_vars=["country_name"], value_vars=age_columns,
                                var_name="age_group", value_name="population_value")

# Calculate percentage share of each age group within its country
country_total = df_melted.groupby("country_name")["population_value"].transform("sum")
df_melted["population_share"] = df_melted["population_value"] / country_total

# Plot age distribution by country
fig = px.bar(df_melted, x="country_name", y="population_share", color="age_group",
             title="Age Group Distribution by Country",
             labels={"country_name":"Country","population_share": "Population Share", "age_group": "Age Group"},
             template="plotly_white", barmode="stack")
fig.show()

7. How does an older population impact COVID-19 deaths?

In [51]:
import plotly.express as px
import pandas as pd

# Share of the population aged 60 and above, computed row by row
population_60_plus = (
    df['population_age_60_69']
    .add(df['population_age_70_79'])
    .add(df['population_age_80_and_older'])
)
df['old_age_share'] = population_60_plus.div(df['population'])

# Deaths of each row relative to the country's population
df['death_rate'] = df['new_deceased'].div(df['population'])

# One row per country: the mean of both per-row quantities
country_summary = (
    df.groupby('country_name', as_index=False)[['death_rate', 'old_age_share']]
      .mean()
)

# Bar height is the mean death rate; bar colour encodes the old-age share
axis_labels = {
    'death_rate': 'Deaths per Population',
    'country_name': 'Country',
    'old_age_share': 'Share of Older Population',
}
fig = px.bar(
    country_summary,
    x='country_name',
    y='death_rate',
    color='old_age_share',
    title='Deaths per Population vs Share of Older Population by Country',
    labels=axis_labels,
    hover_data=['old_age_share'],
)

# Tilt the country names and set the axis / colour bar titles
fig.update_layout(
    xaxis_tickangle=-45,
    xaxis_title='Country',
    yaxis_title='Deaths per Population',
    coloraxis_colorbar=dict(title="Old Age Share"),
)

# Render the figure
fig.show()


8. What is the correlation between Vaccination Rate and New Confirmed Cases?

In [None]:
# US rows only, ordered by week so the running total below accumulates in
# week order; the two derived columns are added in a single chained step.
df_us = (
    df[df['country_name'] == 'United States']
    .sort_values(by=["country_name", "week"])
    .assign(
        # Running total of fully vaccinated persons (per country_name group)
        cumulative_vaccinated=lambda rows: rows.groupby("country_name")["new_persons_fully_vaccinated"].cumsum(),
        # Running total expressed as a fraction of the population
        vaccination_rate=lambda rows: rows["cumulative_vaccinated"] / rows["population"],
    )
)

# One marker per row: vaccination rate reached so far vs. new confirmed cases
scatter_labels = {
    "vaccination_rate": "Cumulative Vaccination Rate",
    "new_confirmed": "Daily New Confirmed Cases",
    "week": "Week",
}
fig = px.scatter(
    df_us,
    x="vaccination_rate",
    y="new_confirmed",
    color="country_name",
    title="Daily Vaccination Rate vs New Confirmed Cases",
    labels=scatter_labels,
    template="plotly_white",
    hover_data=["country_name", "week", "cumulative_vaccinated", "population"],
)

# Uniform marker size with a thin black outline
marker_style = dict(size=6, line=dict(width=0.5, color='black'))
fig.update_traces(marker=marker_style)

# Render the figure
fig.show()

9. What is the Correlation between Weekly Cases and Deaths per country?

In [43]:
# Express new confirmed and new deceased cases as a percentage of the
# population; the results are stored on df as "<column>_pct".
for count_column in ("new_confirmed", "new_deceased"):
    df[f"{count_column}_pct"] = df[count_column].div(df["population"]).mul(100)

# Scatter of the two percentages, one colour per country
pct_labels = {
    "new_confirmed_pct": "New Confirmed Cases (% of Population)",
    "new_deceased_pct": "New Deceased Cases (% of Population)",
}
fig = px.scatter(
    df,
    x="new_confirmed_pct",
    y="new_deceased_pct",
    color="country_name",
    title="New Confirmed vs New Deceased Cases (Population Percentage)",
    labels=pct_labels,
    template="plotly_white",
    hover_data=["week", "country_name"],
)

# Uniform marker size with a thin black outline
marker_style = dict(size=8, line=dict(width=0.5, color='black'))
fig.update_traces(marker=marker_style)

# Render the figure
fig.show()