In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
# Load the toy COVID dataset from CSV and preview its first rows.
df = pd.read_csv("covid_toy.csv")
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [4]:
# Column dtypes and non-null counts; a count below the row total marks missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [5]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,age,fever
count,100.0,90.0
mean,44.22,100.844444
std,24.878931,2.054926
min,5.0,98.0
25%,20.0,99.0
50%,45.0,101.0
75%,66.5,102.75
max,84.0,104.0


In [6]:
# Number of missing values in each column.
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [7]:
from sklearn.impute import SimpleImputer

# Fill the missing `fever` readings with the column mean (SimpleImputer's default strategy).
# NOTE(review): the imputer is fitted on the whole frame before the train/test split, so the
# fill value is computed from rows that later end up in the test set.
si = SimpleImputer(strategy="mean")
si.fit(df[["fever"]])
df["fever"] = si.transform(df[["fever"]])

In [8]:
# Re-count missing values per column after the `fever` imputation.
df.isnull().sum()

age          0
gender       0
fever        0
cough        0
city         0
has_covid    0
dtype: int64

In [9]:
from sklearn.preprocessing import LabelEncoder

# Encode the `has_covid` labels as integers.
# NOTE(review): `y` is reassigned to the raw `has_covid` column in a later cell, so this
# encoded array is not what the classifiers end up being trained on.
lb = LabelEncoder()
lb.fit(df["has_covid"])
y = lb.transform(df["has_covid"])

In [10]:
#print(y[:10])

In [11]:
# Separate the target column from the feature columns.
y = df["has_covid"]
x = df.drop(columns=["has_covid"])


In [12]:
# Class balance of the target: absolute counts first, then percentages.
target_counts = y.value_counts()
target_pct = y.value_counts(normalize=True) * 100
print(target_counts)
print(target_pct)


has_covid
No     55
Yes    45
Name: count, dtype: int64
has_covid
No     55.0
Yes    45.0
Name: proportion, dtype: float64


In [13]:
# Column dtypes; the `object` columns are the ones that need encoding before modelling.
df.dtypes

age            int64
gender        object
fever        float64
cough         object
city          object
has_covid     object
dtype: object

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Feature groups: numeric columns are standardised, categorical columns are one-hot encoded.
num_cols = ["age", "fever"]
cat_cols = ["gender", "cough", "city"]

preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), num_cols),
        # drop="first" removes one indicator column per categorical feature.
        ("cat", OneHotEncoder(drop="first"), cat_cols),
    ],
    # Force a dense result. By default ColumnTransformer returns a sparse matrix when the
    # stacked output is sparse enough, and GaussianNB (used later) rejects sparse input.
    sparse_threshold=0,
)


In [19]:
# Fit the preprocessing on the training rows only, then transform them.
# NOTE(review): this rebinds `x_train` to the transformed output, so running the cell a second
# time feeds already-transformed data back in — presumably an error, since the transformer
# selects columns by name. Re-run the train/test split cell before re-running this one.
x_train = preprocessor.fit_transform(x_train)


In [20]:
# Transform the test rows with the already-fitted preprocessor (transform only, no refit).
x_test = preprocessor.transform(x_test)


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression with default settings: fit on the training split, score on the test split.
lr = LogisticRegression().fit(x_train, y_train)  # fit() returns the estimator itself
y_pred = lr.predict(x_test)
lr_accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", lr_accuracy)


Logistic Regression Accuracy: 0.45


In [24]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using the 5 closest training rows.
knn = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)  # fit() returns the estimator
y_pred = knn.predict(x_test)
knn_accuracy = accuracy_score(y_test, y_pred)
print("KNN Accuracy:", knn_accuracy)


KNN Accuracy: 0.5


In [25]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default depth settings. The tree builder shuffles candidate features,
# so equally good splits are broken at random; seeding it makes the reported accuracy
# repeatable across kernel restarts.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)

y_pred = dt.predict(x_test)

print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred))


Decision Tree Accuracy: 0.4


In [26]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 200 trees. Bootstrap sampling and per-split feature sub-sampling are
# random, so the seed is fixed to make the reported accuracy repeatable.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))


Random Forest Accuracy: 0.55


In [27]:
from sklearn.svm import SVC

# Support-vector classifier with default settings.
svm = SVC().fit(x_train, y_train)  # fit() returns the estimator itself
y_pred = svm.predict(x_test)
svm_accuracy = accuracy_score(y_test, y_pred)
print("SVM Accuracy:", svm_accuracy)


SVM Accuracy: 0.6


In [30]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Gaussian naive Bayes: fit on the training split, then score predictions on the test split.
nb = GaussianNB().fit(x_train, y_train)  # fit() returns the estimator itself
y_pred = nb.predict(x_test)
nb_accuracy = accuracy_score(y_test, y_pred)
print("Naive Bayes Accuracy:", nb_accuracy)


Naive Bayes Accuracy: 0.4


In [41]:
# Fit every candidate classifier on the training split and compare accuracy on the test split.
# NOTE(review): the "best" model is picked on the same test split that is used to report its
# accuracy, so the winning score is an optimistic estimate; a validation split or
# cross-validation would give a fairer comparison.
models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    # The tree-based models are randomised; seed them so the comparison is repeatable.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    # 200 trees, matching the stand-alone random forest cell.
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "SVM": SVC(),
    "Naive Bayes": GaussianNB(),
}

# (model name, test accuracy) pairs, in the order the models are listed above.
results = []
for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    results.append((name, acc))

df_results = pd.DataFrame(results, columns=["Model", "Accuracy"])

# max() returns the first of any tied entries, and it still names a model when every
# accuracy is 0.0 (a running "acc > 0" comparison would leave the name unset).
best_model, best_accuracy = max(results, key=lambda item: item[1])

for model_name, model_acc in results:
    print(f"Model: {model_name}  Accuracy: {model_acc:.2f}")

print("Best Model:", best_model)
print("Best Accuracy:", best_accuracy)



Model: Logistic Regression  Accuracy: 0.45
Model: KNN  Accuracy: 0.50
Model: Decision Tree  Accuracy: 0.40
Model: Random Forest  Accuracy: 0.55
Model: SVM  Accuracy: 0.60
Model: Naive Bayes  Accuracy: 0.40
Best Model: SVM
Best Accuracy: 0.6
