In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
# 1-2 layers hidden layers of 8-32 neurons

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

# CONSTANTS
TARGET = "cases_new_increase_tmr"
START_DATE = pd.to_datetime("2021-07-01")
END_DATE = START_DATE + pd.DateOffset(months=6)
NO_DAYS = (END_DATE - START_DATE).days + 1
dates = pd.date_range(start=START_DATE, end=END_DATE, freq='D')
print(f"No. days: {NO_DAYS}")
print(f"Date range: {START_DATE} to {END_DATE}")

# Base GitHub URL
BASE_URL = "https://raw.githubusercontent.com/MoH-Malaysia/covid19-public/main/"

# Relevant CSVs
files = {
    "cases_malaysia": BASE_URL + "epidemic/cases_malaysia.csv",
    "tests_malaysia": BASE_URL + "epidemic/tests_malaysia.csv",
    "checkin_malaysia": BASE_URL + "mysejahtera/checkin_malaysia.csv",
    # "deaths_malaysia": BASE_URL + "epidemic/deaths_malaysia.csv",
    # "hospital": BASE_URL + "epidemic/hospital.csv",
    # "icu": BASE_URL + "epidemic/icu.csv",
    # "vax_malaysia": BASE_URL + "vaccination/vax_malaysia.csv",
    # "trace_malaysia": BASE_URL + "mysejahtera/trace_malaysia.csv",
}


No. days: 185
Date range: 2021-07-01 00:00:00 to 2022-01-01 00:00:00


In [46]:
# Extract and laod all CSVs into DataFrames
dfs = {name: pd.read_csv(url) for name, url in files.items()}

# Clean data
for k, df in dfs.items():
    # Set date column as data
    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"])
        df = df[(df["date"] >= START_DATE) & (df["date"] <= END_DATE)]

    # Drop columns where ALL values are null
    df = df.dropna(axis=1, how="all")

    # Clean data
    if k == "trace_malaysia":
        # Handle duplicates
        df = df.groupby("date", as_index=False).mean()

        # Interpolate data for missing dates
        df["date"] = pd.to_datetime(df["date"])
        df.set_index("date", inplace=True)
        df.sort_index(inplace=True)
        df = df.reindex(dates)
        cols = ["casual_contacts", "hide_large", "hide_small"]
        df[cols] = df[cols].interpolate(method="linear")
        df = df.reset_index().rename(columns={"index": "date"})

        # Fix columns type
        df[cols] = df[cols].round().astype(int)

    # Remove columns with one value only
    df = df.loc[:, df.nunique(dropna=True) > 1]

    # Save back into dictionary
    dfs[k] = df

    # Check shape
    output_filename = f"{k}.csv"
    print(f"{k}: {df.shape}")

# Merge data
data = pd.DataFrame({'date': dates})
for k, df in dfs.items():
    if k == "population":
        continue
    if df.shape[0] != NO_DAYS:
        df = df.drop(columns=["state"])
        df = df.groupby("date").sum()
    data = pd.merge(data, df, on="date", how="left")

cases_malaysia: (185, 24)
tests_malaysia: (185, 3)
checkin_malaysia: (185, 4)


In [47]:
# Include target cases_new_increase_tmr
data['cases_new_increase_tmr'] = (data['cases_new'].shift(-1) > data['cases_new']).astype(int)

# Drop last row (no target)
data = data[:-1]


# Combine features
data['tests_total'] = data['rtk-ag'] + data['pcr']  # combine tests
data['mobility_density'] = data['checkins'] / data['unique_loc']  # density of people per location
data['mobility_density'] = data['mobility_density'].replace([float('inf'), -float('inf')], 0).fillna(0)

# Lag features
lag_cols = ['cases_new', 'cases_active', 'cases_cluster', 'tests_total', 'mobility_density']
for col in lag_cols:
    data[f'{col}_shift1'] = data[col].shift(1)  # previous day

# Rolling averages (7-day)
for col in lag_cols:
    data[f'{col}_7d_avg'] = data[col].rolling(window=7).mean()  # 7-day avg

# Percent change
for col in lag_cols:
    data[f'{col}_pct_change'] = data[col].pct_change()  # daily pct change

# Day of week
data['day_of_week'] = data['date'].dt.dayofweek  # 0=Monday, 6=Sunday

# Drop rows with NaN caused by lag/rolling
data = data.dropna().reset_index(drop=True)  # clean data

# Target
y = data['cases_new_increase_tmr']  # target

# Features selection
feature_cols = [
    'cases_new', 'cases_new_shift1', 'cases_new_7d_avg', 'cases_new_pct_change',
    'cases_active', 'cases_active_shift1', 'cases_active_7d_avg', 'cases_active_pct_change',
    'cases_cluster', 'cases_cluster_shift1', 'cases_cluster_7d_avg', 'cases_cluster_pct_change',
    'tests_total', 'tests_total_shift1', 'tests_total_7d_avg', 'tests_total_pct_change',
    'mobility_density', 'mobility_density_shift1', 'mobility_density_7d_avg', 'mobility_density_pct_change',
    'day_of_week'
]

X = data[feature_cols]  # features

# Output shapes
print("X shape:", X.shape)
print("y shape:", y.shape)
print("Selected features:", feature_cols)


X shape: (178, 21)
y shape: (178,)
Selected features: ['cases_new', 'cases_new_shift1', 'cases_new_7d_avg', 'cases_new_pct_change', 'cases_active', 'cases_active_shift1', 'cases_active_7d_avg', 'cases_active_pct_change', 'cases_cluster', 'cases_cluster_shift1', 'cases_cluster_7d_avg', 'cases_cluster_pct_change', 'tests_total', 'tests_total_shift1', 'tests_total_7d_avg', 'tests_total_pct_change', 'mobility_density', 'mobility_density_shift1', 'mobility_density_7d_avg', 'mobility_density_pct_change', 'day_of_week']


In [None]:
import pandas as pd

def train_evaluate_model_df(model, param_grid, X, y, scale_data=False, n_splits=5):
    """
    Train and evaluate a model using TimeSeriesSplit + GridSearchCV, returns results in DataFrame.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    rows = []

    for fold, (train_idx, val_idx) in enumerate(tscv.split(X), 1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Scaling if needed
        if scale_data:
            scaler = RobustScaler()
            X_train = scaler.fit_transform(X_train)
            X_val = scaler.transform(X_val)
        else:
            X_train = X_train.values
            X_val = X_val.values

        # GridSearchCV for hyperparameter tuning
        grid = GridSearchCV(model, param_grid, cv=3, scoring='f1', n_jobs=-1)
        grid.fit(X_train, y_train)

        best_model = grid.best_estimator_
        y_pred = best_model.predict(X_val)

        # Store fold metrics
        rows.append({
            'model': type(model).__name__,
            'fold': fold,
            'accuracy': accuracy_score(y_val, y_pred),
            'f1_score': f1_score(y_val, y_pred),
            'best_params': grid.best_params_,
            'confusion_matrix': confusion_matrix(y_val, y_pred)
        })

    df_results = pd.DataFrame(rows)
    df_avg = pd.DataFrame({
        'model': [type(model).__name__],
        'average_accuracy': [df_results['accuracy'].mean()],
        'average_f1': [df_results['f1_score'].mean()]
    })

    return df_results, df_avg

In [53]:
models = {
    "LogisticRegression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "SVC": SVC(),
    "RandomForest": RandomForestClassifier(random_state=42),
    "XGB": xgb.XGBClassifier(
        use_label_encoder=False, eval_metric="logloss", random_state=42
    ),
}

param_grids = {
    "LogisticRegression": {
        "C": [0.01, 0.1, 1, 10],
        "penalty": ["l2"],
        "solver": ["lbfgs", "liblinear"],
        "max_iter": [1000],
    },
    "KNN": {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"], "p": [1, 2]},
    "SVC": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"], "gamma": ["scale", "auto"]},
    "RandomForest": {
        "n_estimators": [50, 100],
        "max_depth": [None, 5, 10],
        "min_samples_split": [2, 5],
    },
    "XGB": {
        "n_estimators": [50, 100],
        "max_depth": [3, 5],
        "learning_rate": [0.01, 0.1],
    },
}

In [54]:
all_folds = []
all_avg = []

for name, model in models.items():
    scale_needed = name in ['LogisticRegression','KNN','SVC']
    df_folds, df_avg = train_evaluate_model_df(model, param_grids[name], X, y, scale_data=scale_needed)
    all_folds.append(df_folds)
    all_avg.append(df_avg)

# Combine all results
df_folds_combined = pd.concat(all_folds, ignore_index=True)
df_avg_combined = pd.concat(all_avg, ignore_index=True)

# Display
print("Fold-wise results:")
print(df_folds_combined)
print("\nAverage metrics per model:")
print(df_avg_combined)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold-wise results:
                     model  fold  accuracy  f1_score  \
0       LogisticRegression     1  0.793103  0.812500   
1       LogisticRegression     2  0.655172  0.666667   
2       LogisticRegression     3  0.862069  0.846154   
3       LogisticRegression     4  0.827586  0.838710   
4       LogisticRegression     5  0.862069  0.857143   
5     KNeighborsClassifier     1  0.655172  0.642857   
6     KNeighborsClassifier     2  0.724138  0.555556   
7     KNeighborsClassifier     3  0.896552  0.880000   
8     KNeighborsClassifier     4  0.827586  0.838710   
9     KNeighborsClassifier     5  0.896552  0.888889   
10                     SVC     1  0.482759  0.651163   
11                     SVC     2  0.344828  0.512821   
12                     SVC     3  0.758621  0.666667   
13                     SVC     4  0.689655  0.709677   
14                     SVC     5  0.896552  0.888889   
15  RandomForestClassifier     1  0.689655  0.666667   
16  RandomForestClassifier   

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
