In [85]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

In [86]:
# Feature table used for this run. Other tables under training_dataset/ that
# can be swapped in by changing the file name:
#   raw_dataset.csv, raw_scores_pvallog.csv, ionocyte_raw_dataset.csv
dataset_path = 'training_dataset/ionocyte_scores_pvallog.csv'
data = pd.read_csv(dataset_path)

In [87]:
# Binary target: 1 where the sample is labelled 'COVID-19', 0 for any other
# label. The target is derived without overwriting the column in `data`, so the
# cell can be re-run safely; encoding the column in place would make a second
# run compare the already-encoded integers against 'COVID-19' and silently
# turn every label into 0.
y = (data['disease_ontology_label'] == 'COVID-19').astype(int)

# Feature matrix: every column except 'NAME' and the target column.
X = data.drop(columns=['NAME', 'disease_ontology_label'])

In [88]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable across runs.
# NOTE(review): the split is not stratified, so the class ratio can differ
# between the two subsets -- consider stratify=y if that matters here.
split_options = {'test_size': 0.2, 'random_state': 30}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [89]:
# Feature selection: keep the features with the highest f_classif scores.
# The selector is fitted on the training split only, and the same column subset
# is then applied to the test split.
N_SELECTED_FEATURES = 60

# Cap k at the number of available columns so the cell also works on a table
# with fewer than N_SELECTED_FEATURES features (an oversized k otherwise raises
# or warns, depending on the scikit-learn version).
selector = SelectKBest(
    score_func=f_classif,
    k=min(N_SELECTED_FEATURES, X_train.shape[1]),
)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

In [90]:
# Standardise the selected features. The scaler is fitted on the training
# split alone and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [91]:
# Report how many samples of each class ended up in each split.
for heading, labels in (
    ("Class distribution in Training Set:", y_train),
    ("Class distribution in Testing Set:", y_test),
):
    print(heading)
    print(labels.value_counts())

Class distribution in Training Set:
disease_ontology_label
1    306
0    158
Name: count, dtype: int64
Class distribution in Testing Set:
disease_ontology_label
1    65
0    52
Name: count, dtype: int64


In [92]:
# Train a logistic-regression classifier (up to 1000 solver iterations) on the
# training split, then predict labels for the held-out test split.
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

In [93]:
# Score the held-out predictions. Precision and recall are computed with
# average='macro'.
accuracy = accuracy_score(y_test, y_pred)
precision, recall = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score)
)

# Banner, headline metrics, then the per-class breakdown.
banner = "--------------------------------------------------"
print(banner, "Logistic Regression Results", banner, sep="\n")

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

print(classification_report(y_test, y_pred))

--------------------------------------------------
Logistic Regression Results
--------------------------------------------------
Accuracy: 0.7692307692307693
Precision: 0.7686930091185411
Recall: 0.7615384615384615
              precision    recall  f1-score   support

           0       0.77      0.69      0.73        52
           1       0.77      0.83      0.80        65

    accuracy                           0.77       117
   macro avg       0.77      0.76      0.76       117
weighted avg       0.77      0.77      0.77       117

