In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# Read merged_data.csv from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='merged_data.csv')

In [3]:
# Preview the first five rows to confirm the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,Vehicle ID,Latitude,Longitude,Direction,Wrong side
0,veh1,1261.905756,4150.712113,196.200933,False
1,veh2,1253.942641,4123.330153,196.246772,False
2,veh3,1237.985441,4067.379361,196.794343,False
3,veh4,1229.605418,4039.955519,196.980608,False
4,veh5,1213.439609,3984.397477,196.223493,False


In [4]:
# Preview the last five rows of the frame as well.
df.tail(n=5)


Unnamed: 0,Vehicle ID,Latitude,Longitude,Direction,Wrong side
392,veh393,873.251779,2642.674699,201.492786,False
393,veh394,82.243732,1109.256211,10.219933,False
394,veh395,30.589476,290.827534,347.77676,False
395,veh396,679.624029,2247.815734,28.83143,False
396,veh397,862.252103,2615.170999,202.224217,False


In [5]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(397, 5)

In [6]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Vehicle ID  397 non-null    object 
 1   Latitude    397 non-null    float64
 2   Longitude   397 non-null    float64
 3   Direction   397 non-null    float64
 4   Wrong side  397 non-null    bool   
dtypes: bool(1), float64(3), object(1)
memory usage: 12.9+ KB


In [7]:
# Number of missing values in each column.
df.isna().sum()

Vehicle ID    0
Latitude      0
Longitude     0
Direction     0
Wrong side    0
dtype: int64

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Latitude,Longitude,Direction
count,397.0,397.0,397.0
mean,499.469113,1843.166786,190.612192
std,481.310873,1510.625636,108.113831
min,0.348928,4.926435,1.359437
25%,44.896508,401.566452,168.962624
50%,299.444696,1775.580531,195.168816
75%,977.937414,2976.789657,211.812019
max,1469.66523,6193.322289,359.23186


In [9]:
# Class balance of the target: how many rows are flagged True vs. False.
df.loc[:, 'Wrong side'].value_counts()

False    210
True     187
Name: Wrong side, dtype: int64

Separating the features and the target

In [31]:
# Feature matrix: the three numeric columns. 'Vehicle ID' is left out; its
# values (veh1, veh2, ...) look like per-row identifiers rather than predictors.
X = df[['Latitude', 'Longitude', 'Direction']]

# Target vector. It is bound to the uppercase name `Y` because the later cells
# (print(Y), train_test_split(X, Y, ...)) refer to it by that name; with only a
# lowercase `y` defined they raise NameError on a fresh kernel.
Y = df['Wrong side']
y = Y  # lowercase alias kept so any cell still using `y` keeps working
print(X)


        Latitude    Longitude   Direction
0    1261.905756  4150.712113  196.200933
1    1253.942641  4123.330153  196.246772
2    1237.985441  4067.379361  196.794343
3    1229.605418  4039.955519  196.980608
4    1213.439609  3984.397477  196.223493
..           ...          ...         ...
392   873.251779  2642.674699  201.492786
393    82.243732  1109.256211   10.219933
394    30.589476   290.827534  347.776760
395   679.624029  2247.815734   28.831430
396   862.252103  2615.170999  202.224217

[397 rows x 3 columns]


In [32]:
# Show the target Series `Y` (printed as plain text, truncated by pandas).
print(Y)

0      False
1      False
2      False
3      False
4      False
       ...  
392    False
393    False
394    False
395    False
396    False
Name: Wrong side, Length: 397, dtype: bool


Splitting the data into training and test sets

In [37]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
# Report the shapes of the full, training and test feature matrices.
print(*(frame.shape for frame in (X, X_train, X_test)))

(397, 3) (317, 3) (80, 3)


In [34]:
# Print the first five training rows to check which columns the model will see.
print(X_train.head(n=5))

        Latitude    Longitude   Direction    0    1    2    3    4    5    6  \
262  1357.734256  5915.463164  347.423417  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
113   421.489905  1896.452803  222.485882  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
176    14.576950   388.534266  348.919858  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
63     31.649539   831.961172   10.766563  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
245  1297.308018  4308.173461   16.235073  0.0  0.0  0.0  0.0  0.0  0.0  0.0   

     ...  387  388  389  390  391  392  393  394  395  396  
262  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
113  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
176  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
63   ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
245  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 400 columns]


Train the logistic regression model

In [17]:
# Logistic regression classifier created with scikit-learn's default settings.
model = LogisticRegression()

In [27]:
# Fit the classifier on the training features and their labels.
model.fit(X=X_train, y=Y_train)



LogisticRegression()

In [26]:
# Predict on the rows the model was fitted on to measure training accuracy.
X_train_prediction = model.predict(X_train)
# accuracy_score is documented as (y_true, y_pred): ground truth first,
# predictions second. Accuracy is the same either way, but keeping the
# documented order avoids wrong results if the metric is ever swapped for an
# asymmetric one such as precision or recall.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy on Training Data: ', training_data_accuracy)

Accuracy on Training Data:  0.889589905362776




In [25]:
# Accuracy on the held-out test rows.
X_test_prediction = model.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth first,
# predictions second. Accuracy is the same either way, but keeping the
# documented order avoids wrong results if the metric is ever swapped for an
# asymmetric one such as precision or recall.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)



In [22]:
# Report the accuracy stored in test_data_accuracy.
print('Accuracy on Test Data: ', test_data_accuracy)

Accuracy on Test Data:  0.5625


Build a predictive system

In [24]:
# One observation to classify, in training-column order:
# (Latitude, Longitude, Direction).
input_data = (62.44836286, 207.2821802, 167.7957415)

# Convert the tuple to a NumPy array and reshape it to a single row,
# because predict() expects a 2-D input of shape (n_samples, n_features).
input_data_as_numpy_aray = np.asarray(input_data)

# Wrap the row in a DataFrame carrying the training column names so that
# scikit-learn's feature-name check sees the same names it saw during fit.
# NOTE: the model must have been fitted on exactly these three columns. If
# predict() raises a feature-count ValueError, the kernel holds a model fitted
# on different data -- re-run the split and fit cells in order.
input_data_reshaped = pd.DataFrame(
    input_data_as_numpy_aray.reshape(1, -1),
    columns=['Latitude', 'Longitude', 'Direction'],
)

prediction = model.predict(input_data_reshaped)

# The target 'Wrong side' is boolean, so the predicted label is used directly
# as the condition. The previous comparison against the string 'Absence' could
# never be true, and the printed messages described heart disease, not vehicles.
if prediction[0]:
    print('The vehicle is on the wrong side.')
else:
    print('The vehicle is not on the wrong side.')

ValueError: X has 3 features, but LogisticRegression is expecting 400 features as input.

evaluate performance