In [1]:
import pandas as pd

# Reading Data (for analyzing selected data)

In [2]:
# Load the dataset from data.csv (path is relative to the current working directory).
df = pd.read_csv('data.csv')

In [3]:
df.head()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
0,98.269067,1,21,1,0,1
1,98.198868,1,26,1,0,0
2,98.456659,1,53,0,1,0
3,98.961167,1,64,1,0,0
4,99.214807,1,54,0,1,0


In [4]:
df.tail()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
2994,100.947953,0,71,1,0,1
2995,99.761955,1,32,1,1,0
2996,101.91664,1,78,1,-1,0
2997,98.453023,0,76,0,-1,0
2998,100.643353,0,53,0,-1,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2999 entries, 0 to 2998
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   fever          2999 non-null   float64
 1   bodyPain       2999 non-null   int64  
 2   age            2999 non-null   int64  
 3   runnyNose      2999 non-null   int64  
 4   diffBreath     2999 non-null   int64  
 5   infectionProb  2999 non-null   int64  
dtypes: float64(1), int64(5)
memory usage: 140.7 KB


In [6]:
df['diffBreath'].value_counts()  # Count how often each distinct value occurs in this single column

 1    1008
-1    1008
 0     983
Name: diffBreath, dtype: int64

In [7]:
df.describe()  # Summary statistics (count, mean, std, min, quartiles, max) for every numeric column of the frame

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
count,2999.0,2999.0,2999.0,2999.0,2999.0,2999.0
mean,100.040089,0.515505,49.650884,0.486829,0.0,0.48883
std,1.161372,0.499843,28.486444,0.49991,0.820029,0.499959
min,98.000163,0.0,1.0,0.0,-1.0,0.0
25%,99.022158,0.0,26.0,0.0,-1.0,0.0
50%,100.074408,1.0,49.0,0.0,0.0,0.0
75%,101.054208,1.0,74.0,1.0,1.0,1.0
max,101.996177,1.0,100.0,1.0,1.0,1.0


# Train/Test Splitting (first step in machine learning)

In [8]:
import numpy as np

In [9]:
def data_split(data, ratio, seed=42):
    """Shuffle the rows of ``data`` and split them into a train and a test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split; rows are selected positionally with ``.iloc``.
    ratio : float
        Fraction of rows (between 0 and 1 inclusive) placed in the test set.
        The test set holds ``int(len(data) * ratio)`` rows.
    seed : int, default 42
        Seed used for the shuffle so that the split is reproducible.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` in that order.

    Raises
    ------
    ValueError
        If ``ratio`` is outside ``[0, 1]``.

    Notes
    -----
    This reseeds NumPy's *global* random generator, so any later use of
    ``np.random`` in the same session continues from this seed.
    """
    # A negative ratio would turn the slices below into "all but the last k"
    # and silently hand most rows to the test set, so reject it up front.
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio!r}")

    np.random.seed(seed)
    shuffled = np.random.permutation(len(data))
    test_set_size = int(len(data) * ratio)
    # The first chunk of the shuffled positions is held out; the rest is for training.
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [10]:
np.random.permutation(10)  # Sanity check: returns the integers 0-9 in random order (not seeded at this point, so the order varies per run)

array([3, 0, 7, 4, 6, 9, 8, 5, 2, 1])

In [11]:
# Hold out 20% of the rows for testing; the remaining rows form the training set.
train, test = data_split(df, 0.2)

In [12]:
train # This is our training data

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
1103,100.534337,0,4,0,-1,1
1569,98.543427,0,60,1,1,1
2229,101.665517,1,70,0,-1,0
2296,101.898668,0,61,0,0,0
1800,100.038146,0,13,0,1,1
...,...,...,...,...,...,...
1638,100.277199,1,93,1,-1,0
1095,99.038524,0,60,1,-1,0
1130,98.285530,0,11,0,-1,1
1294,101.260288,1,72,0,0,0


In [13]:
test # This is our testing data

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
1376,101.661306,0,54,0,-1,0
932,98.206556,1,18,1,1,1
144,99.587414,1,68,0,0,1
1752,101.936462,1,96,0,1,0
51,101.947269,0,9,1,0,1
...,...,...,...,...,...,...
842,98.526451,0,57,1,-1,1
637,99.084975,1,43,1,1,1
695,99.026527,0,51,0,1,0
226,101.902516,0,37,0,-1,0


In [14]:
# Keep only the five feature columns; the 'infectionProb' label column is left out.
X_train = train[['fever', 'bodyPain', 'age', 'runnyNose', 'diffBreath' ]]

In [15]:
X_train

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath
1103,100.534337,0,4,0,-1
1569,98.543427,0,60,1,1
2229,101.665517,1,70,0,-1
2296,101.898668,0,61,0,0
1800,100.038146,0,13,0,1
...,...,...,...,...,...
1638,100.277199,1,93,1,-1
1095,99.038524,0,60,1,-1
1130,98.285530,0,11,0,-1
1294,101.260288,1,72,0,0


In [16]:
X_train.to_numpy()

array([[100.53433705,   0.        ,   4.        ,   0.        ,
         -1.        ],
       [ 98.54342745,   0.        ,  60.        ,   1.        ,
          1.        ],
       [101.66551711,   1.        ,  70.        ,   0.        ,
         -1.        ],
       ...,
       [ 98.28553025,   0.        ,  11.        ,   0.        ,
         -1.        ],
       [101.26028818,   1.        ,  72.        ,   0.        ,
          0.        ],
       [ 99.75399043,   0.        ,  10.        ,   0.        ,
         -1.        ]])

In [17]:
# Feature columns, in the order the classifier is trained on.
feature_cols = ['fever', 'bodyPain', 'age', 'runnyNose', 'diffBreath']

# Training features as a 2-D NumPy array (rows x features).
X_train = train[feature_cols].to_numpy()
# Held-out features must come from `test`; building them from `train`
# would make any later evaluation run on data the model has already seen.
X_test = test[feature_cols].to_numpy()

In [18]:
# Training labels as a 1-D array. Selecting the column as a Series already
# yields shape (n_rows,), so no hard-coded row count is needed for a reshape.
Y_train = train['infectionProb'].to_numpy()

In [19]:
# Held-out labels as a 1-D array. They must be read from `test`, not `train`,
# and the length follows the test set instead of a hard-coded 2400.
Y_test = test['infectionProb'].to_numpy()

# Training the machine learning model

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
# Fit a logistic-regression classifier, with scikit-learn's default settings,
# on the training features and labels.
clf = LogisticRegression()
clf.fit(X_train, Y_train)

LogisticRegression()

In [22]:
# One hand-written sample, in the same column order used for training:
# fever, bodyPain, age, runnyNose, diffBreath.
inputFeatures = [100,1,22,0,1]
# predict_proba returns one row per sample with one probability per class;
# [0][1] picks this sample's probability for the second class (label 1, as the labels are 0/1).
infProb = clf.predict_proba([inputFeatures])[0][1]

In [23]:
infProb  # Shows the predicted probability of infection for the given input

0.5212729310158397