In [115]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.model_selection import  train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import os

In [94]:
# Read the blood-transfusion donor records from disk and preview the first rows.
# NOTE(review): this is an absolute path -- adjust it when running elsewhere.
DATA_PATH = '/content/transfusion.data'
transfusion = pd.read_csv(DATA_PATH)
transfusion.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


In [95]:
# Preview the last five rows (five is the default for tail()).
transfusion.tail()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
743,23,2,500,38,0
744,21,2,500,52,0
745,23,3,750,62,0
746,39,1,250,39,0
747,72,1,250,72,0


In [96]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
transfusion.describe()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
count,748.0,748.0,748.0,748.0,748.0
mean,9.506684,5.514706,1378.676471,34.282086,0.237968
std,8.095396,5.839307,1459.826781,24.376714,0.426124
min,0.0,1.0,250.0,2.0,0.0
25%,2.75,2.0,500.0,16.0,0.0
50%,7.0,4.0,1000.0,28.0,0.0
75%,14.0,7.0,1750.0,50.0,0.0
max,74.0,50.0,12500.0,98.0,1.0


In [97]:
# (rows, columns) of the loaded frame.
transfusion.shape

(748, 5)

In [98]:
# Per-column dtype and non-null count, plus memory usage.
transfusion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


In [99]:
# Column labels as loaded, before the target column is renamed.
transfusion.columns

Index(['Recency (months)', 'Frequency (times)', 'Monetary (c.c. blood)',
       'Time (months)', 'whether he/she donated blood in March 2007'],
      dtype='object')

In [100]:
# Shorten the verbose label column name to 'target'; all other columns keep their names.
TARGET_RENAME = {'whether he/she donated blood in March 2007': 'target'}
transfusion = transfusion.rename(columns=TARGET_RENAME)
transfusion.head(10)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),target
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0
5,4,4,1000,4,0
6,2,7,1750,14,1
7,1,12,3000,35,0
8,2,9,2250,22,1
9,5,46,11500,98,1


In [101]:
 transfusion.target.value_counts(normalize=True).round(3)

0    0.762
1    0.238
Name: target, dtype: float64

In [102]:
# Hold out 25% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in the train and test parts.
features = transfusion.drop(columns='target')
labels = transfusion.target
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.25, random_state=42, stratify=labels
)

X_train.head(2)

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months)
334,16,2,500,16
99,5,7,1750,26


Selecting a model using TPOT


In [103]:
from tpot import TPOTClassifier

# Pipeline search settings: 5 generations of 20 candidates each, scored by
# ROC AUC, using the built-in 'TPOT light' configuration and a fixed seed.
search_settings = {
    'generations': 5,
    'population_size': 20,
    'scoring': 'roc_auc',
    'config_dict': 'TPOT light',
    'random_state': 42,
    'verbosity': 2,
    'disable_update_check': True,
}
tpot = TPOTClassifier(**search_settings)

tpot.fit(X_train, y_train)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=120.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.7422459184429089

Generation 2 - Current best internal CV score: 0.7422459184429089

Generation 3 - Current best internal CV score: 0.7422459184429089

Generation 4 - Current best internal CV score: 0.7422459184429089

Generation 5 - Current best internal CV score: 0.7456308339276876

Best pipeline: MultinomialNB(Normalizer(input_matrix, norm=l2), alpha=0.001, fit_prior=True)


TPOTClassifier(config_dict='TPOT light', crossover_rate=0.1, cv=5,
               disable_update_check=True, early_stop=None, generations=5,
               log_file=None, max_eval_time_mins=5, max_time_mins=None,
               memory=None, mutation_rate=0.9, n_jobs=1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=20,
               random_state=42, scoring='roc_auc', subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=False)

In [104]:
# Evaluate the TPOT pipeline on the held-out rows: ROC AUC of the
# predicted probability for class 1 (second column of predict_proba).
tpot_test_proba = tpot.predict_proba(X_test)[:, 1]
tpot_auc_score = roc_auc_score(y_test, tpot_test_proba)
print(f'\nAUC score: {tpot_auc_score:.4f}')


AUC score: 0.7637


In [105]:
# List the steps of the winning pipeline, numbered from 1.
print('\nBest pipeline steps:')
for position, step in enumerate(tpot.fitted_pipeline_.steps, start=1):
    # Each step is a (name, estimator) pair; only the estimator repr is shown.
    _, estimator = step
    print(f'{position}. {estimator}')


Best pipeline steps:
1. Normalizer(copy=True, norm='l2')
2. MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)


In [106]:
# Per-feature variance of the training set, shown to 3 decimal places.
X_train.var().round(decimals=3)

Recency (months)              66.929
Frequency (times)             33.830
Monetary (c.c. blood)    2114363.700
Time (months)                611.147
dtype: float64

In [107]:
# Column whose variance is far larger than that of the other features.
col_to_normalize = 'Monetary (c.c. blood)'


def _with_log_column(frame, column):
    """Return a copy of `frame` where `column` is replaced by its natural log.

    The log values are appended as 'monetary_log' and the original column
    is dropped; the input frame itself is not modified.
    """
    return (
        frame
        .assign(monetary_log=np.log(frame[column]))
        .drop(columns=column)
    )


# Apply the same transformation to the train and test features.
X_train_normed = _with_log_column(X_train, col_to_normalize)
X_test_normed = _with_log_column(X_test, col_to_normalize)

# Variance of the transformed training features.
X_train_normed.var().round(3)

Recency (months)      66.929
Frequency (times)     33.830
Time (months)        611.147
monetary_log           0.837
dtype: float64

Logistic regression on the log-normalized features:

In [108]:
# Logistic regression with the liblinear solver and a fixed seed;
# every other hyperparameter is left at its default.
logreg = LogisticRegression(random_state=42, solver='liblinear')

In [109]:
# Fit on the log-transformed training features.
logreg.fit(X=X_train_normed, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [110]:
# ROC AUC of the logistic regression on the held-out, log-transformed features.
logreg_test_proba = logreg.predict_proba(X_test_normed)[:, 1]
logreg_auc_score = roc_auc_score(y_test, logreg_test_proba)
print(logreg_auc_score)

0.7890178003814368


In [111]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 700-tree random forest on the log-transformed training features.
# random_state=42 matches the seed used by the other stochastic steps in this
# notebook, so the forest and its scores are the same on every run.
rfc = RandomForestClassifier(n_estimators=700, random_state=42).fit(X_train_normed, y_train)

# Predict on X_test_normed, not X_test: the forest was fitted on the frame
# where 'Monetary (c.c. blood)' was replaced by 'monetary_log', so the raw
# X_test has a different set and order of columns than the model was trained on.
predictionforest = rfc.predict(X_test_normed)

In [112]:
# Accuracy of the forest on the data it was trained on (not a held-out estimate).
rfc.score(X=X_train_normed, y=y_train)

0.9411764705882353

In [113]:
# ROC AUC of the random forest on the held-out, log-transformed features.
rfc_test_proba = rfc.predict_proba(X_test_normed)[:, 1]
rfc_score = roc_auc_score(y_test, rfc_test_proba)
print(rfc_score)

0.7166242848061029


In [114]:
from operator import itemgetter

# Rank the three models by held-out ROC AUC, best first.
model_scores = [
    ('tpot', tpot_auc_score),
    ('logreg', logreg_auc_score),
    ('rfc', rfc_score),
]
sorted(model_scores, key=itemgetter(1), reverse=True)

[('logreg', 0.7890178003814368),
 ('tpot', 0.7637476160203432),
 ('rfc', 0.7166242848061029)]