# Introduction

## Importing required libraries

In [1]:
# Core libraries for data manipulation
import numpy as np            # For numerical operations
import pandas as pd           # For structured data (DataFrame) manipulation

# Regular expressions for pattern matching
import re

# Visualization libraries
import matplotlib.pyplot as plt   # For basic plotting
import seaborn as sns             # For statistical plots and visual styles

# Dask for out-of-core and parallel data processing (large datasets)
import dask.dataframe as dd

# Glob for file path matching (e.g., loading multiple CSV files at once)
import glob

## Code to check and review each CSV

In [2]:
def eval_df(dataframe):
    """
    Evaluate the structure and quality of a pandas DataFrame.

    This function prints:
    - Data types and memory usage
    - Columns with missing values
    - Count of duplicate rows
    - Summary statistics (numeric and categorical)

    Parameters:
    -----------
    dataframe : pd.DataFrame
        The DataFrame to evaluate.

    Returns:
    --------
    None
        The report is written to standard output only.
    """
    # Display data types and basic memory usage.
    # DataFrame.info() writes its own report and returns None, so it must not
    # be wrapped in print() (that would append a stray "None" line).
    print("\n🔍 DATA TYPES & MEMORY USAGE")
    print("-" * 40)
    dataframe.info()

    # Check and display missing values per column (only columns with at least one)
    print("\n🧩 MISSING VALUES PER COLUMN")
    print("-" * 40)
    missing_values = dataframe.isnull().sum()
    print(missing_values[missing_values > 0])

    # Check for duplicate rows
    duplicates = dataframe.duplicated().sum()
    print("\n🗃️ DUPLICATE ROWS FOUND")
    print("-" * 40)
    print(f"{duplicates} duplicate rows found.")

    # Display summary statistics (for both numeric and object types).
    # include="all" is required: a bare describe() silently drops non-numeric
    # columns whenever at least one numeric column is present.
    print("\n📊 SUMMARY STATISTICS")
    print("-" * 40)
    if len(dataframe.columns) == 0:
        # describe() raises ValueError on a frame without columns
        print("No columns to summarise.")
    else:
        print(dataframe.describe(include="all"))

## Get the sample countries  
- This includes only the countries that have mobility data, policy data, COVID-19 data, and country statistics data.

In [5]:
# ISO 3166-1 alpha-3 is used as the common country identifier across datasets.
# Read the country / country-code reference table from disk.
COUNTRY_CODE = pd.read_csv(filepath_or_buffer="data/country_codes.csv")

# Show the first five rows as a quick sanity check
COUNTRY_CODE.head(5)

Unnamed: 0,id,alpha2,alpha3,en
0,4,af,afg,Afghanistan
1,8,al,alb,Albania
2,12,dz,dza,Algeria
3,20,ad,and,Andorra
4,24,ao,ago,Angola


In [6]:
# Upper-case both ISO code columns (alpha2 and alpha3)
for code_col in ('alpha2', 'alpha3'):
    COUNTRY_CODE[code_col] = COUNTRY_CODE[code_col].str.upper()

### Get the mobility data

In [7]:
# Columns to exclude from loading.
# NOTE: the FIPS column in the mobility report is named `census_fips_code`
# (the previous spelling `census_flips_code` matched nothing, so the column
# was never actually excluded).
excluded_cols = ['iso_3166_2_code', 'census_fips_code', 'place_id']

# Read only the header row to get the full column list, then exclude unwanted ones
all_cols = pd.read_csv('data/global_mobility_report.csv', nrows=0).columns.tolist()
usecols = [col for col in all_cols if col not in excluded_cols]

# Load the dataset lazily using only the desired columns.
# - dtype: the region columns are empty for country-level rows, so pin them to
#   'object' rather than relying on dtype inference from a sample.
# - keep_default_na / na_values: treat only empty fields as missing; with the
#   pandas defaults the literal string "NA" (Namibia's ISO alpha-2 code) would
#   be parsed as NaN.
# - assume_missing=True: integer-looking columns that were not given an
#   explicit dtype are read as floats so they can hold missing values.
mobility_data = dd.read_csv(
    'data/global_mobility_report.csv',
    usecols=usecols,
    dtype={
        'sub_region_1': 'object',
        'sub_region_2': 'object',
        'metro_area': 'object',
    },
    keep_default_na=False,
    na_values=[''],
    assume_missing=True,
)

In [8]:
# Columns to include (mobility trends + region info + date)
usecols = [
    "country_region_code", "country_region", "sub_region_1", "sub_region_2", "metro_area",
    "date", "retail_and_recreation_percent_change_from_baseline",
    "grocery_and_pharmacy_percent_change_from_baseline",
    "parks_percent_change_from_baseline",
    "transit_stations_percent_change_from_baseline",
    "workplaces_percent_change_from_baseline",
    "residential_percent_change_from_baseline"
]

# Explicit dtypes to avoid Dask inference issues
dtype_fix = {
    'sub_region_1': 'object',
    'sub_region_2': 'object',
    'metro_area': 'object'
}

# Last date (inclusive) kept by the filter below.
# NOTE(review): an earlier comment described this cutoff as "Jan 31, 2021",
# but the code has always filtered on 2022-12-31 — confirm the intended window.
DATE_CUTOFF = '2022-12-31'

# Load the dataset with parsing and type fixes.
# keep_default_na=False + na_values=[''] treats only empty fields as missing;
# with the pandas defaults the literal string "NA" (Namibia's ISO alpha-2
# code in country_region_code) would be parsed as NaN.
df = dd.read_csv(
    'data/global_mobility_report.csv',
    usecols=usecols,
    dtype=dtype_fix,
    parse_dates=['date'],
    keep_default_na=False,
    na_values=[''],
    assume_missing=True
)

# Filter: only country-level data (all sub-region / metro fields empty)
# + dates up to and including DATE_CUTOFF
missing_cols = ['sub_region_1', 'sub_region_2', 'metro_area']
filtered = df[
    df[missing_cols].isnull().all(axis=1) &
    (df['date'] <= DATE_CUTOFF)
]

# Safely preview the filtered result (head() only computes the first rows)
print(filtered.head(10))

  country_region_code        country_region sub_region_1 sub_region_2  \
0                  AE  United Arab Emirates         <NA>         <NA>   
1                  AE  United Arab Emirates         <NA>         <NA>   
2                  AE  United Arab Emirates         <NA>         <NA>   
3                  AE  United Arab Emirates         <NA>         <NA>   
4                  AE  United Arab Emirates         <NA>         <NA>   
5                  AE  United Arab Emirates         <NA>         <NA>   
6                  AE  United Arab Emirates         <NA>         <NA>   
7                  AE  United Arab Emirates         <NA>         <NA>   
8                  AE  United Arab Emirates         <NA>         <NA>   
9                  AE  United Arab Emirates         <NA>         <NA>   

  metro_area       date  retail_and_recreation_percent_change_from_baseline  \
0       <NA> 2020-02-15                                                0.0    
1       <NA> 2020-02-16               