### Import Necessary Libraries

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

%matplotlib inline

### Compile and Read Data

In [6]:
haberman_set = pd.read_csv('haberman.csv', names=['Age', 'Years', 'Number of Positive Axillary Nodes', 'Survival Status'])
haberman_set.head()

Unnamed: 0,Age,Years,Number of Positive Axillary Nodes,Survival Status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


### Data Cleaning / Preprocessing

In [10]:
#check for null values
haberman_set.isnull().values.any()

False

### Modeling and Training (Logistic Regression)

In [12]:
#defining independent and dependent variables
x = haberman_set.drop(['Survival Status'],axis=1).values
y = haberman_set['Survival Status'].values

In [13]:
x.reshape(-1,1)
y.reshape(-1,1)

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [2],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [2],
       [2],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [2],
       [2],
       [1],
       [1],
       [1],
       [1],
       [2],
       [2],
       [2],
    

In [61]:
#split data into training and testing sets
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.9,random_state=0)

In [62]:
logreg = LogisticRegression()

In [63]:
logreg.fit(x_train,y_train)

In [64]:
logreg.predict(x_test)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [65]:
#check accuracy of model [0 - 1] (higher score => better model). also, larger test sample size => higher accuracy???
logreg.score(x_test,y_test)

0.717391304347826

In [66]:
#predict probability of output being 1 or 2
logreg.predict_proba(x_test)

array([[0.86339913, 0.13660087],
       [0.93363536, 0.06636464],
       [0.90552799, 0.09447201],
       [0.91667359, 0.08332641],
       [0.92224647, 0.07775353],
       [0.95614621, 0.04385379],
       [0.89054153, 0.10945847],
       [0.90132595, 0.09867405],
       [0.84152326, 0.15847674],
       [0.8932943 , 0.1067057 ],
       [0.8373111 , 0.1626889 ],
       [0.94939504, 0.05060496],
       [0.91004967, 0.08995033],
       [0.90960843, 0.09039157],
       [0.8787382 , 0.1212618 ],
       [0.91657083, 0.08342917],
       [0.929638  , 0.070362  ],
       [0.95803327, 0.04196673],
       [0.91133201, 0.08866799],
       [0.96859361, 0.03140639],
       [0.96945078, 0.03054922],
       [0.93708751, 0.06291249],
       [0.91657083, 0.08342917],
       [0.95465067, 0.04534933],
       [0.9114409 , 0.0885591 ],
       [0.92128213, 0.07871787],
       [0.93810568, 0.06189432],
       [0.94556941, 0.05443059],
       [0.96142394, 0.03857606],
       [0.94256971, 0.05743029],
       [0.