# Introduction

## Importing required libraries

In [7]:
# Core libraries for data manipulation
import numpy as np            # For numerical operations
import pandas as pd           # For structured data (DataFrame) manipulation

# Regular expressions for pattern matching
import re

# Visualization libraries
import matplotlib.pyplot as plt   # For basic plotting
import seaborn as sns             # For statistical plots and visual styles

# Dask for out-of-core and parallel data processing (large datasets)
import dask.dataframe as dd

# Glob for file path matching (e.g., loading multiple CSV files at once)
import glob

## Code to check and review each CSV

In [8]:
def eval_df(dataframe):
    """
    Evaluate the structure and quality of a pandas DataFrame.

    This function prints:
    - Data types and memory usage
    - Columns with missing values
    - Count of duplicate rows
    - Summary statistics (numeric and categorical)

    Parameters:
    -----------
    dataframe : pd.DataFrame
        The DataFrame to evaluate.

    Returns:
    --------
    None
        Every report section is written to standard output.
    """
    # Display data types and basic memory usage.
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly: wrapping it in print() would add a stray "None" line.
    print("\n🔍 DATA TYPES & MEMORY USAGE")
    print("-" * 40)
    dataframe.info()

    # Check and display missing values per column (only columns with gaps)
    print("\n🧩 MISSING VALUES PER COLUMN")
    print("-" * 40)
    missing_values = dataframe.isnull().sum()
    print(missing_values[missing_values > 0])

    # Check for duplicate rows
    duplicates = dataframe.duplicated().sum()
    print("\n🗃️ DUPLICATE ROWS FOUND")
    print("-" * 40)
    print(f"{duplicates} duplicate rows found.")

    # Display summary statistics. include="all" is required to cover
    # object/categorical columns; the default describes numeric columns only.
    print("\n📊 SUMMARY STATISTICS")
    print("-" * 40)
    print(dataframe.describe(include="all"))

## Get the sample countries  
- This includes only the countries that have mobility data, policy data, COVID data, and country statistics data.

In [9]:
# ISO 3166-1 alpha-3 is used as the shared country identifier across datasets.
# Read the country / country-code lookup table from disk.
COUNTRY_CODE = pd.read_csv(filepath_or_buffer="data/country_codes.csv")

# Show the top five rows as a quick sanity check
COUNTRY_CODE.head(n=5)

Unnamed: 0,id,alpha2,alpha3,en
0,4,af,afg,Afghanistan
1,8,al,alb,Albania
2,12,dz,dza,Algeria
3,20,ad,and,Andorra
4,24,ao,ago,Angola


In [10]:
# Upper-case the codes stored in the alpha2 and alpha3 columns
for code_col in ('alpha2', 'alpha3'):
    COUNTRY_CODE[code_col] = COUNTRY_CODE[code_col].str.upper()

In [None]:
# Rename the code and name columns: alpha2 -> code2, alpha3 -> Code, en -> Country
COUNTRY_CODE = COUNTRY_CODE.rename(
    columns={'alpha2': 'code2', 'alpha3': 'Code', 'en': 'Country'}
)

### Get the mobility data

In [11]:
# Columns to include (mobility trends + region info + date)
usecols = [
    "country_region_code", 
    "country_region", 
    "sub_region_1", 
    "sub_region_2", 
    "metro_area",
    "date", 
    "retail_and_recreation_percent_change_from_baseline",
    "grocery_and_pharmacy_percent_change_from_baseline",
    "parks_percent_change_from_baseline",
    "transit_stations_percent_change_from_baseline",
    "workplaces_percent_change_from_baseline",
    "residential_percent_change_from_baseline"
]

# Explicit dtypes to avoid Dask inference issues
dtype_fix = {
    'sub_region_1': 'object',
    'sub_region_2': 'object',
    'metro_area': 'object',
}

# Load the dataset with parsing and type fixes.
# Namibia's alpha-2 code is the literal string "NA", which pandas' default
# NA markers would silently turn into a missing value. Restrict NA parsing to
# empty fields only so that the code survives for later joins on the code.
df = dd.read_csv(
    'data/global_mobility_report.csv',
    usecols=usecols,
    dtype=dtype_fix,
    parse_dates=['date'],
    assume_missing=True,
    keep_default_na=False,
    na_values=[''],
)

# Filter: only country-level data + dates up to the end of 2022.
# A row is country-level when every sub-region / metro column is empty.
missing_cols = ['sub_region_1', 'sub_region_2', 'metro_area']
filtered = df[
    df[missing_cols].isnull().all(axis=1) &
    (df['date'] <= '2022-12-31')
]

# Safely preview the filtered result: compute only a small seeded sample
# instead of materialising the whole frame.
print(filtered.sample(frac = 0.0001, random_state = 1).compute())

       country_region_code       country_region sub_region_1 sub_region_2  \
222368                  BF         Burkina Faso         <NA>         <NA>   
29194                   EC              Ecuador         <NA>         <NA>   
46188                   HN             Honduras         <NA>         <NA>   
276182                  MU            Mauritius         <NA>         <NA>   
7811                    JP                Japan         <NA>         <NA>   
266334                  MD              Moldova         <NA>         <NA>   
172365                  PG     Papua New Guinea         <NA>         <NA>   
308455                  TH             Thailand         <NA>         <NA>   
155735                  TT  Trinidad and Tobago         <NA>         <NA>   
303624                  ZW             Zimbabwe         <NA>         <NA>   

       metro_area       date  \
222368       <NA> 2021-01-04   
29194        <NA> 2020-05-30   
46188        <NA> 2022-02-17   
276182       <NA> 2022-0

In [12]:


# --- Latest available date per country ---
# Lazily build the per-country maximum of the date column
latest_dates = (
    filtered
    .groupby('country_region')['date']
    .max()
)

# Materialise the result (this is where Dask actually runs the computation)
latest_dates_computed = latest_dates.compute()

# Show the countries ordered from most recent to oldest final date
print(
    "\n--- Latest Date per Country (Sorted) ---",
    latest_dates_computed.sort_values(ascending=False),
    sep="\n",
)



--- Latest Date per Country (Sorted) ---
country_region
Yemen        2022-10-15
Thailand     2022-10-15
Barbados     2022-10-15
Aruba        2022-10-15
Australia    2022-10-15
                ...    
Kyrgyzstan   2022-10-15
Japan        2022-10-15
Venezuela    2022-10-15
Uruguay      2022-10-15
Ukraine      2022-02-23
Name: date, Length: 135, dtype: datetime64[ns]


In [None]:

# Rename country_region_code and country_region, then materialise the Dask
# frame; from here on `filtered` is an in-memory pandas DataFrame.
filtered = filtered.rename(columns = {
    'country_region_code' : 'code2',
    'country_region' : 'Country'
}).compute()

# Get a dataframe for the unique countries in the mobility data.
# The columns were renamed just above, so select them by their new names;
# `filtered` is already a pandas DataFrame, so no further .compute() is needed.
countries_in_mob = filtered[['code2', 'Country']].drop_duplicates()