In [15]:
#importing all necessary libraries
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [25]:
#Reading data into a DataFrame
df = pd.read_csv('GDSI_OpenDataset_Final.csv')



Data preparation

In [23]:
# Columns that need imputation of missing values
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# initializing SimpleImputers
numeric_imputer = SimpleImputer(strategy='median')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Applying imputation
df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])
df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])

# Encoding categorical variables using One-Hot Encoding
encoder = OneHotEncoder(sparse_output=False, drop='first')  # Drop first to avoid dummy variable trap
categorical_encoded = encoder.fit_transform(df[categorical_cols])

# Creating a new DataFrame from the encoded categories
encoded_features = encoder.get_feature_names_out(categorical_cols)
encoded_df = pd.DataFrame(categorical_encoded, columns=encoded_features)

# Combine the numeric and encoded categorical data
cleaned_df = pd.concat([df[numeric_cols], encoded_df], axis=1)

cleaned_df.head()

Unnamed: 0,age_in_cat,edss_in_cat2,year_onset,covid19_outcome_levels_2,duration_treatment_cat2,comorbidities_other,secret_name_C_1006,secret_name_C_1007,secret_name_C_1008,secret_name_C_1037,...,stop_or_end_date_combined_30/10/2019,stop_or_end_date_combined_30/11/2018,stop_or_end_date_combined_30/12/2018,stop_or_end_date_combined_30/12/2019,stop_or_end_date_combined_31/01/2020,stop_or_end_date_combined_31/03/2020,stop_or_end_date_combined_31/05/2020,stop_or_end_date_combined_31/07/2019,stop_or_end_date_combined_31/10/2018,stop_or_end_date_combined_31/12/2019
0,1.0,0.0,2011.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,2011.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,2011.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,2011.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,2007.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Standardizing the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(cleaned_df)

# Initializing PCA
pca = PCA(n_components=0.95)  # retain 95% of the variance
principal_components = pca.fit_transform(data_scaled)

# Create a DataFrame for the principal components
pca_df = pd.DataFrame(data=principal_components, columns=[f"PC{i+1}" for i in range(principal_components.shape[1])])

# Explained variance
explained_variance_ratio = pca.explained_variance_ratio_
explained_variance_ratio, pca_df.head()

(array([0.00498316, 0.00360121, 0.0030884 , ..., 0.00066371, 0.00066371,
        0.00066371]),
          PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
 0   2.145285 -0.020181  2.179180 -1.085207 -2.513914 -1.223392 -2.040321   
 1   4.474103  2.354968 -0.734709 -2.579584 -4.499734 -2.856085 -2.790246   
 2   1.724622 -0.056000  2.249355 -1.264034 -2.206684 -1.329908 -3.075933   
 3   1.520207 -0.762623  1.138026  0.140034 -1.803928 -0.723880 -0.980825   
 4  11.112264  5.948784  2.519901 -0.321223  2.154849 -1.017531 -4.724634   
 
         PC8       PC9      PC10  ...    PC1055    PC1056    PC1057    PC1058  \
 0 -0.298320 -0.026329  1.087753  ... -0.000000  0.000000 -0.000000  0.000000   
 1 -0.909323 -0.520713  1.089430  ... -0.236434 -0.057064  0.194976  0.275836   
 2 -0.417155  0.043366  0.834293  ... -1.458186  0.035011  0.668714 -0.719717   
 3  1.244246 -0.547243  0.943277  ...  0.354969  0.262245  0.211120 -0.186381   
 4  1.355134  3.281877 -2.260481  ..

Feature selection

In [20]:
# Extract the original target variable from the unmodified dataset and check its content
original_target = data['covid19_outcome_recovered']
original_target.value_counts()

# Filter out 'not_applicable' and encode the outcomes
filtered_target = original_target[original_target != 'not_applicable']
encoded_target = (filtered_target == 'yes').astype(int)

# Match the indices of the PCA DataFrame to the filtered target
filtered_pca_df = pca_df.loc[filtered_target.index]

# Split the data into training and testing sets with the filtered and encoded target
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(
    filtered_pca_df, encoded_target, test_size=0.2, random_state=42
)

Model Training

In [26]:


# Train the logistic regression model on the filtered data
model_filtered = LogisticRegression(max_iter=1000)
model_filtered.fit(X_train_filtered, y_train_filtered)

# Predict on the testing set with the new model
y_pred_filtered = model_filtered.predict(X_test_filtered)

# Calculate accuracy and classification report for the new target
accuracy_filtered = accuracy_score(y_test_filtered, y_pred_filtered)
class_report_filtered = classification_report(y_test_filtered, y_pred_filtered)

print(accuracy_filtered)
print(class_report_filtered)

0.98
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       0.97      1.00      0.99        36

    accuracy                           0.98        50
   macro avg       0.99      0.96      0.97        50
weighted avg       0.98      0.98      0.98        50



Cross-Validation

In [28]:
from sklearn.model_selection import cross_val_score, cross_validate

# Using cross_validate to get more detailed results including fit time and score time
scoring_metrics = ['accuracy', 'precision', 'recall', 'f1']
cv_results = cross_validate(model_filtered, filtered_pca_df, encoded_target, cv=5, scoring=scoring_metrics)

# Calculate average scores across all folds for each metric
average_scores = {metric: cv_results[f'test_{metric}'].mean() for metric in scoring_metrics}
average_scores

{'accuracy': 0.8674285714285714,
 'precision': 0.8292634740003161,
 'recall': 0.9935483870967742,
 'f1': 0.9039078993050822}

Forecast 

In [16]:
# Create a hypothetical new data point
# Using the mean of each principal component as a typical case
new_data_point = np.mean(filtered_pca_df, axis=0).values.reshape(1, -1)

# Predict the outcome using the logistic regression model
predicted_outcome = model_filtered.predict(new_data_point)
predicted_probability = model_filtered.predict_proba(new_data_point)

# Mapping prediction to the outcome
outcome_mapping = {0: "no", 1: "yes"}
predicted_outcome_str = outcome_mapping[predicted_outcome[0]]

predicted_outcome_str, predicted_probability



('yes', array([[0.16409794, 0.83590206]]))