# Covid-19 Dataset 

import Libraries

In [144]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix



Load The dataset 

In [145]:
df=pd.read_csv(r"D:\python ka chilla 2023\Day_19\PK COVID-19-10may.csv")
df.head(2)

Unnamed: 0,Date,Cases,Deaths,Recovered,Travel_history,Province,City
0,2/26/2020,1,0,0,China,Islamabad Capital Territory,Islamabad
1,2/26/2020,2,0,0,Iran/Taftan,Sindh,Karachi


Data Preprocess

In [146]:
df.shape

(1328, 7)

In [147]:
df.isnull().sum()

Date              0
Cases             0
Deaths            0
Recovered         0
Travel_history    0
Province          0
City              0
dtype: int64

In [148]:
df.duplicated().sum()


np.int64(1)

In [149]:
# remove duplicated
df = df.drop_duplicates()

In [150]:
df.duplicated().sum()

np.int64(0)

In [151]:
df.describe()

Unnamed: 0,Cases,Deaths,Recovered
count,1327.0,1327.0,1327.0
mean,22.664657,0.455916,2.468726
std,83.006295,1.805633,11.865907
min,0.0,0.0,0.0
25%,1.0,0.0,0.0
50%,3.0,0.0,0.0
75%,10.0,0.0,0.0
max,1080.0,31.0,208.0


In [152]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1327 entries, 0 to 1327
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Date            1327 non-null   object
 1   Cases           1327 non-null   int64 
 2   Deaths          1327 non-null   int64 
 3   Recovered       1327 non-null   int64 
 4   Travel_history  1327 non-null   object
 5   Province        1327 non-null   object
 6   City            1327 non-null   object
dtypes: int64(3), object(4)
memory usage: 82.9+ KB


In [153]:
# drop date column
df.drop('Date', axis=1, inplace=True)

In [154]:
df.head(2)

Unnamed: 0,Cases,Deaths,Recovered,Travel_history,Province,City
0,1,0,0,China,Islamabad Capital Territory,Islamabad
1,2,0,0,Iran/Taftan,Sindh,Karachi


In [155]:
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()
# Encoding categorical columns

cat_col=["Province","City"]

for col in cat_col:
    df[col] = label_encoder.fit_transform(df[col])
    df[col] = df[col].astype(int)
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    df[col] = scaler.fit_transform(df[[col]])
df.head(2)


Unnamed: 0,Cases,Deaths,Recovered,Travel_history,Province,City
0,1,0,0,China,-0.658888,-0.622814
1,2,0,0,Iran/Taftan,1.123064,-0.442223


In [156]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Scaling numerical columns
num_col=["Cases","Deaths","Recovered"]
for col in num_col:
    df[col] = scaler.fit_transform(df[[col]])
df.head(2)

Unnamed: 0,Cases,Deaths,Recovered,Travel_history,Province,City
0,-0.261099,-0.252591,-0.208131,China,-0.658888,-0.622814
1,-0.249047,-0.252591,-0.208131,Iran/Taftan,1.123064,-0.442223


In [157]:
df.describe()

Unnamed: 0,Cases,Deaths,Recovered,Province,City
count,1327.0,1327.0,1327.0,1327.0,1327.0
mean,1.070901e-17,1.070901e-17,1.070901e-17,4.283604e-17,-7.228581e-17
std,1.000377,1.000377,1.000377,1.000377,1.000377
min,-0.2731504,-0.2525915,-0.2081305,-3.034824,-1.766557
25%,-0.2610986,-0.2525915,-0.2081305,-0.06490406,-0.8636022
50%,-0.2369949,-0.2525915,-0.2081305,-0.06490406,0.03935259
75%,-0.1526322,-0.2525915,-0.2081305,0.52908,0.8821104
max,12.74282,16.92238,17.32769,2.311032,2.025853


In [158]:
# change the datatypes 
# df["Travel_history"]=df["Travel_history"].astype(int)   
df["Province"]=df["Province"].astype(int)
df["City"]=df["City"].astype(int)

In [159]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1327 entries, 0 to 1327
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Cases           1327 non-null   float64
 1   Deaths          1327 non-null   float64
 2   Recovered       1327 non-null   float64
 3   Travel_history  1327 non-null   object 
 4   Province        1327 non-null   int64  
 5   City            1327 non-null   int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 72.6+ KB


In [160]:
X=df.drop("Travel_history",axis=1)      

In [161]:
X.head(2)

Unnamed: 0,Cases,Deaths,Recovered,Province,City
0,-0.261099,-0.252591,-0.208131,0,0
1,-0.249047,-0.252591,-0.208131,1,0


In [162]:
# Define your sets
foreign_set = {"India", "USA", "China", "Dubai", "Syria", "KSA", "UK", "International Passenger"}
local_set   = {"Tableeghi Jamaat", "Afghanistan", "Iran/Taftan", "Jail",
               "Local - Social Contact", "Unknown", "Local - Covid Relative"}

# Create an empty list to store results
travel_type_list = []

# Loop through each row
for value in df["Travel_history"]:
    if value in foreign_set:
        travel_type_list.append("Foreign_Travel")
    elif value in local_set:
        travel_type_list.append("Local_Travel")
    else:
        travel_type_list.append("Local_Travel")  # Default for unexpected values

# Add list to DataFrame as new column
df["Travel_History"] = travel_type_list


In [163]:
df["Travel_History"].value_counts()


Travel_History
Local_Travel      1309
Foreign_Travel      18
Name: count, dtype: int64

In [164]:
y=df["Travel_History"]

In [165]:
y.head(2)

0    Foreign_Travel
1      Local_Travel
Name: Travel_History, dtype: object

In [166]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# Encoding the 'Travel_History' column
label_encoder.fit_transform(y)
# y["Travel_History"].astype(int)


array([0, 1, 0, ..., 1, 1, 1], shape=(1327,))

In [167]:
# maping the values
y = y.map({"Foreign_Travel": 1, "Local_Travel": 0})
y.head(2)

0    1
1    0
Name: Travel_History, dtype: int64

In [168]:

# X=df.drop("Travel_History",axis=1)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [169]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1061, 5), (266, 5), (1061,), (266,))

In [170]:
y_train.value_counts()

Travel_History
0    1047
1      14
Name: count, dtype: int64

In [171]:
models = [LogisticRegression(), SVC(), DecisionTreeClassifier(), RandomForestClassifier(), KNeighborsClassifier()]
models_name = ["LogisticRegression", "SVC", "DecisionTreeClassifier", "RandomForestClassifier", "KNeighborsClassifier"]
models_score = []
models_score=[]
for model,models_name in zip(models,models_name):
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    accuracy=accuracy_score(y_test,y_pred)
    f1=f1_score(y_test,y_pred)
    precision=precision_score(y_test,y_pred)
    recall=recall_score(y_test,y_pred)
    models_score.append([models_name,accuracy,f1,precision,recall])
models_score=pd.DataFrame(models_score,columns=["Model","Accuracy","F1","Precision","Recall"]) # convert to dataframe
models_score.sort_values(by="Accuracy",ascending=False,inplace=True) # sort by accuracy
models_score.reset_index(drop=True,inplace=True) # reset index
models_score



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Model,Accuracy,F1,Precision,Recall
0,LogisticRegression,0.984962,0.0,0.0,0.0
1,SVC,0.984962,0.0,0.0,0.0
2,DecisionTreeClassifier,0.984962,0.0,0.0,0.0
3,RandomForestClassifier,0.984962,0.0,0.0,0.0
4,KNeighborsClassifier,0.984962,0.0,0.0,0.0
