## EDA

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load data
data_path = '../data/plane_data_results/'
df_start = pd.read_csv(data_path + 'start_country_us_flight_count.csv')
df_end = pd.read_csv(data_path + 'end_country_us_flight_count.csv')
df_enter = pd.read_csv(data_path + 'enter_country_us_flight_count.csv')
covid_us = pd.read_csv('../data/covid_data/time_series_covid19_confirmed_US.csv')
covid_global = pd.read_csv('../data/covid_data/time_series_covid19_confirmed_global.csv')
with pd.option_context('display.max_columns', 1000, 'display.width', 1000, ):
    print(df_start.head())
    print(df_end.head())
    print(df_enter.head())
    print(f'{"="*30}')
    print(covid_us.head())
    print(covid_global.head())


  start_country  flights   month
0            US  1483858  202203
1            AU    86304  202203
2            GB    85760  202203
3            DE    81437  202203
4            CA    68039  202203
  end_country  flights   month
0          US  1486925  202203
1          GB    86927  202203
2          AU    86509  202203
3          DE    82075  202203
4          CA    68770  202203
  enter_country  flights   month
0            US    70347  202203
1            GB    48284  202203
2            DE    46315  202203
3            FR    35275  202203
4            ES    34631  202203
        UID iso2 iso3  code3    FIPS   Admin2 Province_State Country_Region        Lat      Long_          Combined_Key  1/22/20  1/23/20  1/24/20  1/25/20  1/26/20  1/27/20  1/28/20  1/29/20  1/30/20  1/31/20  2/1/20  2/2/20  2/3/20  2/4/20  2/5/20  2/6/20  2/7/20  2/8/20  2/9/20  2/10/20  2/11/20  2/12/20  2/13/20  2/14/20  2/15/20  2/16/20  2/17/20  2/18/20  2/19/20  2/20/20  2/21/20  2/22/20  2/23/20  2/24/20  

In [6]:
df_covid_day = covid_global.drop(columns=['Province/State', 'Lat', 'Long']).melt(id_vars=['Country/Region'], var_name='Date', value_name='Confirmed')
df_covid_day['Date'] = pd.to_datetime(df_covid_day['Date'], format='%m/%d/%y')
df_covid_month = df_covid_day.groupby(['Country/Region', pd.Grouper(key='Date', freq='ME')]).sum().reset_index().rename(columns={'Date': 'month'})
df_covid_month['month'] = df_covid_month['month'].dt.strftime('%Y%m')
df_covid_month.head()

# df_covid_month.to_csv('../data/covid_data/covid_global_monthly.csv', index=False)

Unnamed: 0,Country/Region,month,Confirmed
0,Afghanistan,202001,0
1,Afghanistan,202002,30
2,Afghanistan,202003,1141
3,Afghanistan,202004,25152
4,Afghanistan,202005,222720


In [16]:
# create a streamlit app to visualize the data using df_end and df_covid_month
# 1. time series graph showing the COVID case and flight volume within the US
# 2. a map showing the flight volume from each country

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

# prepare data for the streamlit app
df_US = df_covid_month[df_covid_month['Country/Region'] == 'US'].merge(df_end[df_end['end_country'] == 'US'], on='month', how='left').rename(columns={'Confirmed': 'cases', 'Country/Region': 'country'}).drop(columns=['end_country'])
df_US.head()


# Set up the Streamlit app
st.title('COVID-19 Cases and Flight Volume Analysis')

# Create the main visualization
st.header('US COVID Cases and Flight Volume Over Time')

# Create a line plot with shared x-axis
fig = go.Figure()

# Add COVID cases line with improved formatting
fig.add_trace(
    go.Scatter(
        x=df_US['month'], 
        y=df_US['cases'], 
        name='COVID Cases',
        line=dict(color='#FF4B4B', width=3)
    )
)

# Add flight volume line with improved formatting
fig.add_trace(
    go.Scatter(
        x=df_US['month'], 
        y=df_US['flights'], 
        name='Flight Volume',
        line=dict(color='#1F77B4', width=3)
    )
)

# Update layout with improved styling
fig.update_layout(
    xaxis=dict(
        title='Month',
        tickangle=45
    ),
    yaxis=dict(title='Count'),
    title='US COVID Cases and Flight Volume Trends',
    hovermode='x unified',
    plot_bgcolor='white',
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01
    )
)

st.plotly_chart(fig, use_container_width=True)

# Add correlation analysis
correlation = df_US['cases'].corr(df_US['flights'])
st.write(f"Correlation coefficient between cases and flights: {correlation:.2f}")

# Add explanatory text
st.markdown("""
### About this Visualization
This chart shows the relationship between COVID-19 cases and flight volumes in the United States over time:
- The red line represents the number of COVID-19 cases
- The blue line represents the flight volume
- Both metrics are plotted on the same scale to show relative changes

**Data Sources:**
- COVID-19 case data from Johns Hopkins CSSE
- Flight volume data from international aviation records
""")



DeltaGenerator()