In [121]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn import linear_model
import seaborn as sns
import csv
import pprint
import dateutil
from encodings.aliases import aliases

In [122]:
def getEncodingFormatOfCSV(f):
    """Return the name of a codec with which pandas can parse the CSV at ``f``.

    Candidates are tried in a fixed order: a few common encodings first
    (UTF-8, Windows-1252, Latin-1), then every remaining codec listed in
    ``encodings.aliases`` in alphabetical order. The fixed order makes the
    result repeatable across runs, which iterating a bare ``set`` is not.

    Parameters
    ----------
    f : str or path-like
        Location of the CSV file. It is read again from the start for every
        candidate encoding.

    Returns
    -------
    str or None
        The first codec name for which ``pd.read_csv`` succeeds, or ``None``
        when no candidate works (for example an empty or malformed file).

    Raises
    ------
    OSError
        If the file cannot be opened (missing file, permissions, ...). Only
        decoding, codec-lookup and parsing failures are treated as "try the
        next encoding".
    """
    preferred = ("utf_8", "cp1252", "latin_1")
    remaining = sorted(set(aliases.values()).difference(preferred))
    for encoding in (*preferred, *remaining):
        try:
            pd.read_csv(f, encoding=encoding)
        except (LookupError, ValueError):
            # LookupError: codec missing on this platform or not a text encoding.
            # ValueError: covers UnicodeError (bytes do not decode) as well as
            # pandas' ParserError / EmptyDataError (text is not a parsable CSV).
            continue
        return encoding
    return None
# Absolute path to the sample ECG recording on the author's machine.
f_csv = r'D:\local\Data Science\Hop Health\Sample File\ECG_Normal_PVC_Noise_without time.csv'
# Probe for a codec that pandas accepts, then load the file with it.
df = pd.read_csv(
    f_csv,
    encoding=getEncodingFormatOfCSV(f_csv),
)
# Earlier alternative, left disabled: fixed 'latin1' encoding with a 'Title' index.
#df=pd.read_csv(address,index_col='Title',parse_dates=True,encoding='latin1')

In [123]:
# Part 1: Data exploration and pre-processing
# 1) Show the dataset loaded in the previous cell (print() gives the plain-text repr).
print(df)

       Elapsed_Time  ECG_Person1  ECG_Person2  ECG_Person3  ECG_Person4  \
0             0.000       -0.115        0.020        0.020       -0.090   
1             0.002       -0.115        0.015       -0.040       -0.015   
2             0.004       -0.120        0.005       -0.060        0.030   
3             0.006       -0.120        0.000       -0.070        0.060   
4             0.008       -0.120       -0.010       -0.055        0.065   
...             ...          ...          ...          ...          ...   
10795         7.190       -0.130       -1.010        1.625       -0.170   
10796         7.192       -0.105       -1.020        1.580       -0.140   
10797         7.194       -0.125       -1.040        1.555       -0.105   
10798         7.196       -0.165       -1.050        1.525       -0.080   
10799         7.198       -0.175       -1.050        1.535       -0.080   

       ECG_Person5 ECG_Type  
0            0.000   Normal  
1           -0.015   Normal  
2        

In [124]:
# 2) List the column names of the loaded frame.
df.columns

Index(['Elapsed_Time', 'ECG_Person1', 'ECG_Person2', 'ECG_Person3',
       'ECG_Person4', 'ECG_Person5', 'ECG_Type'],
      dtype='object')

In [125]:
# 3) Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Elapsed_Time,ECG_Person1,ECG_Person2,ECG_Person3,ECG_Person4,ECG_Person5
count,10800.0,10800.0,10800.0,10800.0,10800.0,10800.0
mean,3.599,-0.028279,0.026592,-0.039257,0.009605,0.009067
std,2.078557,0.29986,0.296507,0.375888,0.112565,0.121349
min,0.0,-1.43,-1.05,-1.305,-0.465,-0.81
25%,1.7995,-0.1,-0.045,-0.101,-0.045,-0.037
50%,3.599,-0.015,0.01,-0.027,-0.006,0.007
75%,5.3985,0.065,0.105,0.04,0.03,0.053
max,7.198,1.965,0.845,1.84,0.688,0.754


In [126]:
# 4) Count missing values per column: isnull() flags them, sum() totals each column.
df.isnull().sum()

Elapsed_Time    0
ECG_Person1     0
ECG_Person2     0
ECG_Person3     0
ECG_Person4     0
ECG_Person5     0
ECG_Type        0
dtype: int64

In [130]:
# One-hot encode the categorical ECG_Type label into one indicator column per class.
# NOTE(review): the result is only printed here; the cells visible below build the
# models from df directly, not from one_hot_encoded_data.
one_hot_encoded_data = pd.get_dummies(df, columns = ['ECG_Type'])
print(one_hot_encoded_data)

       Elapsed_Time  ECG_Person1  ECG_Person2  ECG_Person3  ECG_Person4  \
0             0.000       -0.115        0.020        0.020       -0.090   
1             0.002       -0.115        0.015       -0.040       -0.015   
2             0.004       -0.120        0.005       -0.060        0.030   
3             0.006       -0.120        0.000       -0.070        0.060   
4             0.008       -0.120       -0.010       -0.055        0.065   
...             ...          ...          ...          ...          ...   
10795         7.190       -0.130       -1.010        1.625       -0.170   
10796         7.192       -0.105       -1.020        1.580       -0.140   
10797         7.194       -0.125       -1.040        1.555       -0.105   
10798         7.196       -0.165       -1.050        1.525       -0.080   
10799         7.198       -0.175       -1.050        1.535       -0.080   

       ECG_Person5  ECG_Type_Noise  ECG_Type_Normal  ECG_Type_PVC  
0            0.000             

In [129]:
# Preview the first five rows of the original (un-encoded) frame.
# NOTE(review): this cell has execution count 129 but appears after the cell
# numbered 130, so the two were executed in the opposite order to how they appear.
df.head()

Unnamed: 0,Elapsed_Time,ECG_Person1,ECG_Person2,ECG_Person3,ECG_Person4,ECG_Person5,ECG_Type
0,0.0,-0.115,0.02,0.02,-0.09,0.0,Normal
1,0.002,-0.115,0.015,-0.04,-0.015,-0.015,Normal
2,0.004,-0.12,0.005,-0.06,0.03,-0.03,Normal
3,0.006,-0.12,0.0,-0.07,0.06,-0.04,Normal
4,0.008,-0.12,-0.01,-0.055,0.065,-0.055,Normal


In [None]:
#Part-2: Working with Model


In [135]:
#1) Separate feature data from target data
# X keeps every column except the label, so it includes Elapsed_Time.
# NOTE(review): the previews show 'Normal' at the start of the recording and
# 'Noise' at the end, so Elapsed_Time may act as a proxy for the label --
# confirm whether it should really be a feature.
X = df.drop('ECG_Type',axis=1)
# y is the class-label column.
y=df['ECG_Type']
# Display the label series.
y

0        Normal
1        Normal
2        Normal
3        Normal
4        Normal
          ...  
10795     Noise
10796     Noise
10797     Noise
10798     Noise
10799     Noise
Name: ECG_Type, Length: 10800, dtype: object

In [136]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [137]:
from sklearn.linear_model import LogisticRegression

In [138]:
#2) Create a Logistic regression model between Features and target data
# The estimator is built with its default hyperparameters and fitted on the training split only.
model_log_reg = LogisticRegression()
model_log_reg.fit(X_train,y_train)

In [139]:
#3a) Display the training score (mean accuracy on the data the model was fitted on)
model_log_reg.score(X_train,y_train)

0.5476851851851852

In [140]:
#3b) Display the test score (mean accuracy on the held-out split)
model_log_reg.score(X_test,y_test)

0.549074074074074

In [141]:
# Predicted class labels for the held-out features, from the logistic-regression model.
y_pred = model_log_reg.predict(X_test)

In [142]:
from sklearn.metrics import confusion_matrix

In [143]:
#4) Display the Confusion Matrix
# Rows are true labels and columns are predicted labels; scikit-learn orders the
# classes by sorted label value when no explicit `labels` argument is given.
confusion_matrix(y_test, y_pred)

array([[421, 154, 157],
       [112, 418, 179],
       [ 82, 290, 347]], dtype=int64)

In [147]:
from sklearn.ensemble import RandomForestClassifier

In [148]:
# Random forest with default hyperparameters; random_state=0 fixes its internal randomness.
rfc = RandomForestClassifier(random_state=0)
# Fit on the same training split used for the logistic-regression model.
rfc.fit(X_train, y_train)

In [149]:
# Predict with the random forest. NOTE: this rebinds y_pred, replacing the
# logistic-regression predictions assigned in an earlier cell.
y_pred = rfc.predict(X_test)

In [150]:
# Inspect the random-forest predictions.
y_pred

array(['Normal', 'Noise', 'Normal', ..., 'Noise', 'PVC', 'Noise'],
      dtype=object)

In [151]:
from sklearn.metrics import accuracy_score

In [152]:
# Fraction of held-out rows whose random-forest prediction matches the true label.
# NOTE(review): the recorded 0.998 is far above the logistic-regression test score
# of 0.549 shown earlier; worth checking for feature leakage before trusting it.
accuracy_score(y_test,y_pred)

0.9981481481481481