In [None]:
# ==================================================
# Project Title: (do this as markdown above)
# ==================================================

# Created by: Your Name
# Date: Today's Date
# Description: This script automates the deployment of a web application.


In [None]:
# ==================================================
# Setup Basic DevOps
# ==================================================

# Create GitHub repo
#   via GitHub website 
# Create directory for experiment
#   mkdir folder_name
# Initialize Git repository
#   git init
# Create a virtual environment
#   python -m venv venv
#   source venv/bin/activate
# Install dependencies (pyyaml, ipykernel, )
#    pip install -r requirements.txt
# Set up experiment folder
# Set working directory


In [10]:
# ==================================================
# Experiment Module Setup 
# ==================================================

import os
import yaml 

def create_experiment_modules (yaml_path):

    """
    Creates experiment modules based on a YAML configuration file.

    Args:
        yaml_path (str): Path to the YAML configuration file.
    Returns: 
        None 
    """
    # Error handling
    if not os.path.exists(yaml_path):
        raise FileNotFoundError(f"YAML file not found: {yaml_path}")   
    
    with open(yaml_path, 'r') as file:
        module_config = yaml.safe_load(file)
    
    assert module_config is not None, "Failed to load YAML configuration." 
    assert "module_names" in module_config, "'module_names' key is missing in the YAML configuration."
    assert module_config["module_names"], "Module names cannot be empty."

    module_names = module_config["module_names"]
    for module_name, subnames in module_names.items():
        if module_name:
            os.makedirs(module_name, exist_ok=True)
            init_file_path = os.path.join(module_name, "__init__.py")
            if not os.path.exists(init_file_path):
                with open(init_file_path, 'w') as init_file:
                    init_file.write("# =============== Make this a module========================= \n")

        if subnames:
            for subname in subnames:
                py_file_path = os.path.join(module_name, f"{subname}.py")
                if not os.path.exists(py_file_path):
                    with open(py_file_path, 'w') as py_file:
                        py_file.write("# ======================================== \n")
                        py_file.write(f"# {subname} module in {module_name}\n")
                        py_file.write("# ======================================== \n")

    other_assets = ["README.md", "requirements.txt", ".env", "app.py", ".gitlab-ci.yml", "TODOs.md", "release_notes.md"]
    for asset in other_assets:
        asset_path = os.path.join(".", asset)
        if not os.path.exists(asset_path):
            with open(asset_path, 'w') as a:
                a.write(f"# {asset} file\n")

create_experiment_modules("experiment_modules.yaml")


In [None]:
# ==================================================
# Set working directory
# ==================================================


In [5]:
# ==================================================
# Dependencies
# ==================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import classification_report, confusion_matrix



In [6]:
# ===============================================
# Data Ingestion
# ===============================================

# Load the data from the CSV file
ukhsa_labs_ref_df = pd.read_csv('asset/lab_details.csv')
tests_df = pd.read_csv('asset/testing_data.csv')




In [7]:
ukhsa_labs_ref_df.head(10)


Unnamed: 0,lab_code,lab_lat_lng,lab_loc
0,BI4658,"52.4691353, -1.8797023",Birmingham
1,CA477T,"52.2208471, 0.1156700",Cambridge
2,BR46679,"53.8124860, -1.7592994",Bradford
3,LO4685,"51.3885652, -0.4435039\t",London
4,MA7896,"53.4740918, -2.2551303",Manchester


# Data Understanding & Quality Assessment


In [8]:
tests_df.dtypes



Unnamed: 0                 int64
name                      object
gender                    object
lab                       object
home_lat_lon              object
date of birth             object
swab_location             object
test_result               object
test_taken_date           object
lab_process_test_date     object
ct                       float64
dtype: object

In [9]:
tests_df.head()


Unnamed: 0.1,Unnamed: 0,name,gender,lab,home_lat_lon,date of birth,swab_location,test_result,test_taken_date,lab_process_test_date,ct
0,0,Elijah Meadows,m,MA7896,"(53.513882907154546, -2.3563769396738397)",2007-09-10,mouth,correct,2025-07-25,2025-08-03,7.279866
1,1,Jonathan Edwards,M,BI4658,"(52.424117496929135, -1.8253721338717945)",2006-01-12,hair,correct,2025-07-17,2025-07-20,10.422405
2,2,Dr. Seth Beck Jr.,M,LO4685,"(51.33262171316983, -0.5658403204135098)",2011-09-27,skin(,correct,2025-07-19,2025-07-24,10.338574
3,3,Mr. Todd Brown,M,MA7896,"(53.519623933090415, -2.0479240415460436)",2003-09-13,hair,correct,2025-07-17,2025-07-24,5.033647
4,4,Gabriel Kelly,m,LO4685,"(51.20198753593912, -0.4404987467629505)",1994-09-18,hair,correct,2025-07-27,2025-08-01,6.632064


In [10]:
tests_df = tests_df.rename(columns={'Unnamed: 0': 'sn'})

In [11]:
tests_df.head()


Unnamed: 0,sn,name,gender,lab,home_lat_lon,date of birth,swab_location,test_result,test_taken_date,lab_process_test_date,ct
0,0,Elijah Meadows,m,MA7896,"(53.513882907154546, -2.3563769396738397)",2007-09-10,mouth,correct,2025-07-25,2025-08-03,7.279866
1,1,Jonathan Edwards,M,BI4658,"(52.424117496929135, -1.8253721338717945)",2006-01-12,hair,correct,2025-07-17,2025-07-20,10.422405
2,2,Dr. Seth Beck Jr.,M,LO4685,"(51.33262171316983, -0.5658403204135098)",2011-09-27,skin(,correct,2025-07-19,2025-07-24,10.338574
3,3,Mr. Todd Brown,M,MA7896,"(53.519623933090415, -2.0479240415460436)",2003-09-13,hair,correct,2025-07-17,2025-07-24,5.033647
4,4,Gabriel Kelly,m,LO4685,"(51.20198753593912, -0.4404987467629505)",1994-09-18,hair,correct,2025-07-27,2025-08-01,6.632064


In [12]:
tests_df.columns.tolist()



['sn',
 'name',
 'gender',
 'lab',
 'home_lat_lon',
 'date of birth',
 'swab_location',
 'test_result',
 'test_taken_date',
 'lab_process_test_date',
 'ct']

In [13]:
def check_missing_values(df):
    """
    check for missing valus
    args:
        df (pd.DataFrame): input dataframe
    returns:
        xxx
    """
    missing_summary = pd.Dataframe({
        'missing_count': df.isnull.sum(),
        'missing_percentage': df.isnull().sum()/len(df) * 100,
        'data_type': df.dtypes
    })
    missing_summary = missing_summary[missing_summary['missing_count'] > 0].sort_values('missing_count', ascending=False)

    return  missing_summary



In [14]:
check_missing_values(tests_df)

AttributeError: module 'pandas' has no attribute 'Dataframe'