## Initialization

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
model_accuracies = {'KNN':1, 'LogReg':1, 'DT':1, 'RF':1, 'NB':1, 'LinearSVC':1, 'KernelSVC':1}

## Importing the Data

In [6]:
df = pd.read_csv('transfusion.data')
df.shape

(748, 5)

In [7]:
df.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


## Create X and Y

In [8]:
Y = df.iloc[:, 4].values
Y.shape

(748,)

In [9]:
Y

array([1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [10]:
X = df.iloc[:, 0:4].values
X.shape

(748, 4)

In [11]:
X

array([[    2,    50, 12500,    98],
       [    0,    13,  3250,    28],
       [    1,    16,  4000,    35],
       ..., 
       [   23,     3,   750,    62],
       [   39,     1,   250,    39],
       [   72,     1,   250,    72]], dtype=int64)

## Preprocess the Data

In [12]:
sc_X = StandardScaler()

In [13]:
X = sc_X.fit_transform(X)



In [14]:
X

array([[-0.92789873,  7.62334626,  7.62334626,  2.61563344],
       [-1.17511806,  1.28273826,  1.28273826, -0.2578809 ],
       [-1.0515084 ,  1.79684161,  1.79684161,  0.02947053],
       ..., 
       [ 1.66790417, -0.43093957, -0.43093957,  1.13782607],
       [ 3.64565877, -0.77367514, -0.77367514,  0.19367135],
       [ 7.72477762, -0.77367514, -0.77367514,  1.54832812]])

## Create Train and Test data

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 4)

In [16]:
X_train.shape

(598, 4)

In [17]:
X_test.shape

(150, 4)

In [18]:
Y_train.shape

(598,)

In [19]:
Y_test.shape

(150,)

In [20]:
pd.DataFrame(pd.DataFrame(Y_train)[0].value_counts())

Unnamed: 0,0
0,457
1,141


In [21]:
pd.DataFrame(pd.DataFrame(Y_test)[0].value_counts())

Unnamed: 0,0
0,113
1,37


## Decision Tree Classifier

In [22]:
clf_dt = DecisionTreeClassifier(criterion = 'entropy')

In [23]:
clf_dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [24]:
Y_pred_dt = clf_dt.predict(X_test)

In [25]:
cm_dt = confusion_matrix(Y_pred_dt, Y_test)
cm_dt

array([[92, 25],
       [21, 12]], dtype=int64)

## Random Forest Classifier

In [26]:
clf_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')

In [27]:
clf_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
Y_pred_rf = clf_rf.predict(X_test)

In [29]:
cm_rf = confusion_matrix(Y_pred_rf, Y_test)
cm_rf

array([[99, 25],
       [14, 12]], dtype=int64)

## Naive Bayes Classifier

In [30]:
clf_nb = GaussianNB()

In [31]:
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None)

In [32]:
Y_pred_nb = clf_nb.predict(X_test)

In [33]:
cm_nb = confusion_matrix(Y_pred_nb, Y_test)
cm_nb

array([[110,  32],
       [  3,   5]], dtype=int64)

## KNN Classifier

In [34]:
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [35]:
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [36]:
Y_pred_knn = clf_knn.predict(X_test)

In [37]:
cm_knn = confusion_matrix(Y_pred_knn, Y_test)
cm_knn

array([[104,  26],
       [  9,  11]], dtype=int64)

## Logistic Regression

In [38]:
clf_lr = LogisticRegression()

In [39]:
clf_lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [40]:
Y_pred_lr = clf_lr.predict(X_test)

In [41]:
cm_lr = confusion_matrix(Y_pred_lr, Y_test)
cm_lr

array([[112,  35],
       [  1,   2]], dtype=int64)

## SVC Linear

In [42]:
clf_lsvc = SVC(kernel = "linear")

In [43]:
clf_lsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [44]:
Y_pred_lsvc = clf_lsvc.predict(X_test)

In [45]:
cm_lsvc = confusion_matrix(Y_pred_lsvc, Y_test)
cm_lsvc

array([[113,  37],
       [  0,   0]], dtype=int64)

## SVC Kernel

In [46]:
clf_ksvc = SVC(kernel = "rbf")

In [47]:
clf_ksvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [48]:
Y_pred_ksvc = clf_ksvc.predict(X_test)

In [49]:
cm_ksvc = confusion_matrix(Y_pred_ksvc, Y_test)
cm_ksvc

array([[113,  34],
       [  0,   3]], dtype=int64)

## Check the Accuracy of Various Classifiers

In [50]:
model_accuracies['DT'] = accuracy_score(Y_pred_dt, Y_test)
model_accuracies['KNN'] = accuracy_score(Y_pred_knn, Y_test)
model_accuracies['KernelSVC'] = accuracy_score(Y_pred_ksvc, Y_test)
model_accuracies['LinearSVC'] = accuracy_score(Y_pred_lsvc, Y_test)
model_accuracies['LogReg'] = accuracy_score(Y_pred_lr, Y_test)
model_accuracies['NB'] = accuracy_score(Y_pred_nb, Y_test)
model_accuracies['RF'] = accuracy_score(Y_pred_rf, Y_test)
model_accuracies

{'DT': 0.69333333333333336,
 'KNN': 0.76666666666666672,
 'KernelSVC': 0.77333333333333332,
 'LinearSVC': 0.7533333333333333,
 'LogReg': 0.76000000000000001,
 'NB': 0.76666666666666672,
 'RF': 0.73999999999999999}