## Final Project

### Inspecting the transfusion.data file

In [1]:
import pandas as pd

### Loading the blood donations data

In [2]:
# Read transfusion.data into a DataFrame using read_csv's default settings
transfusion = pd.read_csv('transfusion.data')

In [3]:
# Preview the first five rows to see the column names and raw values
transfusion.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


### Inspecting the transfusion dataframe

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
transfusion.describe()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
count,748.0,748.0,748.0,748.0,748.0
mean,9.506684,5.514706,1378.676471,34.282086,0.237968
std,8.095396,5.839307,1459.826781,24.376714,0.426124
min,0.0,1.0,250.0,2.0,0.0
25%,2.75,2.0,500.0,16.0,0.0
50%,7.0,4.0,1000.0,28.0,0.0
75%,14.0,7.0,1750.0,50.0,0.0
max,74.0,50.0,12500.0,98.0,1.0


In [5]:
# Column dtypes, non-null counts and memory footprint of the frame
transfusion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


### Creating target column

In [6]:
# Give the label column a short name. Reassigning the result (instead of
# mutating with inplace=True) keeps the step explicit and chainable.
transfusion = transfusion.rename(
    columns={'whether he/she donated blood in March 2007': 'target'}
)

In [7]:
# Confirm the last column now appears under the name 'target'
transfusion.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),target
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


### Checking target incidence

In [8]:
# Share of each class in the label column (fractions rather than raw counts)
transfusion['target'].value_counts(normalize=True)

0    0.762032
1    0.237968
Name: target, dtype: float64

### Splitting transfusion into train and test datasets

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    transfusion.drop(columns='target'),
    transfusion['target'],
    test_size=0.25,
    random_state=42,
    stratify=transfusion['target'],
)

In [10]:
# Peek at the training features; the 'target' column was dropped before the split
X_train.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
334,16,2,500,16
99,5,7,1750,26
116,2,7,1750,46
661,16,2,500,16
154,2,1,250,2


### Selecting the model using TPOT

In [11]:
pip install tpot

Note: you may need to restart the kernel to use updated packages.


You are using pip version 19.0.3, however version 20.2.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [12]:
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score



In [13]:
# Configure the TPOT search: 5 generations of 20 candidate pipelines each,
# ranked by ROC AUC, drawing from the 'TPOT light' configuration.
tpot = TPOTClassifier(
    generations=5,
    population_size=20,
    verbosity=2,
    scoring='roc_auc',
    random_state=42,
    disable_update_check=True,
    config_dict='TPOT light',
)

In [14]:
# Run the pipeline search on the training split only; the test split stays unseen
tpot.fit(X_train, y_train)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=120.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.7422459184429089
Generation 2 - Current best internal CV score: 0.7422459184429089
Generation 3 - Current best internal CV score: 0.7422459184429089
Generation 4 - Current best internal CV score: 0.7422459184429089
Generation 5 - Current best internal CV score: 0.7423330644124079
Best pipeline: LogisticRegression(input_matrix, C=0.1, dual=False, penalty=l2)


TPOTClassifier(config_dict='TPOT light', disable_update_check=True,
               generations=5,
               log_file=<ipykernel.iostream.OutStream object at 0x05523B30>,
               population_size=20, random_state=42, scoring='roc_auc',
               verbosity=2)

In [15]:
# Evaluate the fitted TPOT pipeline on the held-out split, scoring the
# predicted probability of the positive class (column 1)
tpot_auc_score = roc_auc_score(
    y_test,
    tpot.predict_proba(X_test)[:, 1],
)
print(f'\nAUC score: {tpot_auc_score:.4f}')


AUC score: 0.7853


In [16]:
# List every stage of the winning pipeline, numbered from 1.
# Each entry in .steps is a (name, estimator) pair; only the estimator is shown.
print('\nBest pipeline steps:', end='\n')
for position, step in enumerate(tpot.fitted_pipeline_.steps, start=1):
    print(f'{position}.{step[1]}')


Best pipeline steps:
1.LogisticRegression(C=0.1, random_state=42)


### Checking the Variance

In [20]:
# Per-feature variance of the training split, to compare the features' scales
X_train.var()

Recency (months)         6.692902e+01
Frequency (times)        3.382982e+01
Monetary (c.c. blood)    2.114364e+06
Time (months)            6.111466e+02
dtype: float64

### Log Normalization

In [21]:
import numpy as np

In [22]:
# Work on copies so the transformation applied afterwards leaves the
# original X_train / X_test frames untouched
X_train_norm = X_train.copy()
X_test_norm = X_test.copy()

In [23]:
# Column to log-transform: it has by far the largest variance of the features
norm_col = 'Monetary (c.c. blood)'

In [25]:
# Replace the raw monetary column with its natural log in both splits.
# The membership guard makes the cell safe to re-run: once the raw column has
# been dropped, a second execution is a no-op instead of raising KeyError.
for df_ in [X_train_norm, X_test_norm]:
    if norm_col in df_.columns:
        # add log normalized column
        df_['monetary_log'] = np.log(df_[norm_col])

        # drop the original column
        df_.drop(columns=norm_col, inplace=True)

In [26]:
# Re-check the variances after the log transform, rounded to 3 decimals
X_train_norm.var().round(3)

Recency (months)      66.929
Frequency (times)     33.830
Time (months)        611.147
monetary_log           0.837
dtype: float64

### Training the logistic regression model

In [29]:
from sklearn.linear_model import LogisticRegression

In [30]:
# Logistic regression with the liblinear solver and a fixed seed
clf = LogisticRegression(
    solver='liblinear',
    random_state=42,
)

In [31]:
# Fit the logistic regression on the log-normalized training features
clf.fit(X_train_norm, y_train)

LogisticRegression(random_state=42, solver='liblinear')

In [35]:
# Test-set ROC AUC for the logistic regression, scored on the predicted
# probability of the positive class (column 1)
clf_score = roc_auc_score(
    y_test,
    clf.predict_proba(X_test_norm)[:, 1],
)
print('AUC_score:', clf_score)

AUC_score: 0.7890972663699937


In [36]:
#compare models
from operator import itemgetter

# Rank the two models by their test AUC, highest first
sorted(
    [('tpot', tpot_auc_score), ('clf', clf_score)],
    key=itemgetter(1),
    reverse=True,
)

[('clf', 0.7890972663699937), ('tpot', 0.7852828989192625)]