In [1]:
import pandas as pd

## Reading data from CSV

In [2]:
# Load the dataset; 'data.csv' is a relative path, so it is resolved against
# the notebook's current working directory.
df = pd.read_csv('data.csv')

In [3]:
# Preview the first rows to sanity-check column names and value formats.
df.head()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
0,100.132027,1,67,1,0,1
1,99.761447,1,10,0,0,0
2,98.020578,1,75,1,1,1
3,99.237362,0,71,1,-1,1
4,100.025546,1,11,1,0,1


In [4]:
# Preview the last rows to confirm the file was read through to the end.
df.tail()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
2512,99.391495,0,28,1,-1,0
2513,99.026513,1,9,1,0,1
2514,99.510565,1,24,0,0,1
2515,98.368854,1,9,1,1,1
2516,100.568328,1,42,0,0,0


In [5]:
# Report per-column dtypes and non-null counts (a quick missing-value check).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2517 entries, 0 to 2516
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fever          2517 non-null   float64
 1   bodyPain       2517 non-null   int64  
 2   age            2517 non-null   int64  
 3   runnyNose      2517 non-null   int64  
 4   diffBreath     2517 non-null   int64  
 5   infectionProb  2517 non-null   int64  
dtypes: float64(1), int64(5)
memory usage: 118.1 KB


In [6]:
# Count how often each distinct diffBreath value occurs.
df['diffBreath'].value_counts()

-1    862
 0    842
 1    813
Name: diffBreath, dtype: int64

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
count,2517.0,2517.0,2517.0,2517.0,2517.0,2517.0
mean,99.990178,0.495034,44.497815,0.518872,-0.019468,0.504966
std,1.167727,0.500075,23.342879,0.499743,0.815696,0.500075
min,98.001093,0.0,5.0,0.0,-1.0,0.0
25%,98.97623,0.0,24.0,0.0,-1.0,0.0
50%,99.982257,0.0,44.0,1.0,0.0,1.0
75%,101.019143,1.0,64.0,1.0,1.0,1.0
max,101.999494,1.0,85.0,1.0,1.0,1.0


## Train/test split

In [8]:
import numpy as np

In [9]:
def data_split(data, ratio, seed=42):
    """Shuffle ``data`` and split it into a train part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split. Any object supporting ``len()`` and positional
        ``.iloc[...]`` indexing with an integer array works.
    ratio : float
        Fraction of rows, within ``[0, 1]``, that goes to the test set. The
        test size is ``int(len(data) * ratio)``, i.e. it is rounded down.
    seed : int or None, optional
        Seed for the shuffle. The default of 42 reproduces the split that the
        previous ``np.random.seed(42)`` implementation produced. ``None``
        gives a different shuffle on every call.

    Returns
    -------
    tuple
        ``(train, test)``: two ``.iloc`` selections of ``data`` that together
        contain every row exactly once.

    Raises
    ------
    ValueError
        If ``ratio`` is not within ``[0, 1]`` (this also rejects NaN).
    """
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio!r}")

    # Use a private generator instead of np.random.seed(): seeding the global
    # NumPy RNG here would silently reset randomness for every other cell.
    # RandomState(42) yields the same permutation as the old global seeding.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(len(data))

    test_set_size = int(len(data) * ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]


In [10]:
# Hold out 20% of the rows as the test set; the remainder is used for training.
train, test = data_split(df, ratio=0.2)

In [11]:
# Inspect the training split; the row labels are the shuffled original indices.
train

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
495,98.319281,1,21,1,-1,1
1313,99.224491,0,62,1,-1,1
2150,100.657704,1,41,0,1,1
1159,101.816005,1,61,1,1,1
289,99.187054,1,46,0,1,0
...,...,...,...,...,...,...
1638,101.020426,1,67,1,-1,0
1095,99.317221,1,5,1,0,1
1130,101.563128,0,38,1,0,1
1294,101.118131,0,5,1,-1,0


In [12]:
# Inspect the held-out test split.
test

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
2195,100.643341,0,63,0,0,0
410,100.821463,0,36,0,-1,1
1389,99.534547,1,71,0,1,1
888,101.671703,0,48,1,1,0
1771,99.937813,0,16,1,1,0
...,...,...,...,...,...,...
1309,99.799039,0,66,1,1,0
862,101.678877,0,30,0,-1,0
1814,98.821144,0,5,1,0,1
51,100.345534,0,58,1,-1,1


In [13]:
# Model inputs, in the column order the classifier is trained on. Defined once
# so the train and test matrices can never end up with different columns.
FEATURE_COLUMNS = ['fever', 'bodyPain', 'age', 'runnyNose', 'diffBreath']

X_train = train[FEATURE_COLUMNS].to_numpy()
X_test = test[FEATURE_COLUMNS].to_numpy()

In [14]:
# Selecting with a single label returns a Series, so to_numpy() is already a
# 1-D array of length len(train) / len(test). This avoids hard-coding the row
# counts (2014 and 503), which would raise as soon as the dataset size or the
# split ratio changed.
Y_train = train['infectionProb'].to_numpy()
Y_test = test['infectionProb'].to_numpy()

In [15]:
# Peek at the training labels to confirm they form a flat array.
Y_train

array([1, 1, 1, ..., 1, 0, 1])

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
# Fit a logistic-regression classifier with default hyperparameters.
# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression().fit(X_train, Y_train)
# Echo the fitted estimator so its configuration is shown as the cell output.
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Using the model for prediction

In [18]:
# Class probabilities for one sample. Values are given in the training column
# order: fever, bodyPain, age, runnyNose, diffBreath. The result has one row
# per sample and one column per class, ordered as in clf.classes_.
clf.predict_proba([[100,1,22,1,1]])

array([[0.4828493, 0.5171507]])

In [19]:
# One sample in the training column order: fever, bodyPain, age, runnyNose, diffBreath.
input_features = [100, 1, 22, 1, 1]

# predict_proba expects a 2-D batch, so wrap the single sample in a list.
class_probabilities = clf.predict_proba([input_features])

# Row 0 is our only sample; column 1 is the second entry of clf.classes_.
infectionProbability = class_probabilities[0, 1]
infectionProbability

0.5171506950282435