In [1]:
!pip install pandas scikit-learn tpot numpy


Collecting tpot
  Downloading TPOT-0.12.2-py3-none-any.whl.metadata (2.0 kB)
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.3-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting xgboost>=1.1.0 (from tpot)
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading TPOT-0.12.2-py3-none-any.whl (87 kB)
Downloading deap-1.4.3-cp312-cp312-win_amd64.whl (109 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
    --------------------------------------- 2.6/150.0 MB 13.7 MB/s eta 0:00:11
   - -------------------------------------

In [9]:
# Task 2: Load the dataset
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Location of the raw data file. The default is the path this notebook was originally
# run against; set the TRANSFUSION_DATA environment variable to read the file from
# somewhere else without editing the notebook. An unset or empty variable falls back
# to the default.
DATA_PATH = Path(
    os.environ.get("TRANSFUSION_DATA")
    or "C:/Users/andre/Downloads/Data Analyst/Give Life_ Predict Blood Donations/datasets/transfusion.data"
)

# Load the data
transfusion = pd.read_csv(DATA_PATH)

# Task 3: Inspect the DataFrame's structure
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() added a stray "None" line after the report.
transfusion.info()

# Task 4: Rename column
# Give the label column a short name. The frame is rebound to the renamed result
# instead of being mutated in place.
transfusion = transfusion.rename(
    columns={"whether he/she donated blood in March 2007": "target"}
)
print(transfusion.head(2))

# Task 5: Print target incidence
# Relative frequency of each target value, rounded to three decimals.
print(transfusion['target'].value_counts(normalize=True).round(3))

# Task 6: Split the DataFrame
from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns.
y = transfusion['target']
X = transfusion.drop(columns='target')

# Hold out 25% of the rows for testing. The split is stratified on the label and
# uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

print(X_train.head(2))

# Task 7: Use TPOT to find the best pipeline
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score

# Search configuration: score candidates by ROC AUC, restrict the search to the
# 'TPOT light' configuration, and cap the run at five minutes.
tpot = TPOTClassifier(
    config_dict='TPOT light',
    max_time_mins=5,
    scoring='roc_auc',
    verbosity=2,
    random_state=42,
)
tpot.fit(X_train, y_train)

# Predict and score
# Column 1 of predict_proba is used as the score for the held-out rows.
tpot_preds = tpot.predict_proba(X_test)[:, 1]
tpot_auc_score = roc_auc_score(y_true=y_test, y_score=tpot_preds)
print("TPOT AUC Score:", round(tpot_auc_score, 4))

# View pipeline steps
# Each entry of fitted_pipeline_.steps is printed together with its position.
position = 0
for pipeline_step in tpot.fitted_pipeline_.steps:
    print(f"Step {position}: {pipeline_step}")
    position += 1

# Task 8: Check variance
print("Feature variance before normalization:\n", X_train.var().round(3))

# Task 9: Correct for high variance
# Identify column with highest variance. It is chosen from the training split only,
# and that same column is then transformed in both splits.
col_to_normalize = X_train.var().idxmax()


def _log_normalize(frame, column):
    """Return a copy of ``frame`` with ``column`` replaced by ``<column>_log``.

    The new column holds ``np.log1p`` of the original values and is appended as the
    last column; the original column is then dropped. ``frame`` itself is not modified.
    """
    result = frame.copy()
    # Vectorised ufunc call on the whole column instead of a per-element lambda.
    result[column + '_log'] = np.log1p(result[column])
    return result.drop(columns=column)


# Log normalize and replace
X_train_normed = _log_normalize(X_train, col_to_normalize)
X_test_normed = _log_normalize(X_test, col_to_normalize)

print("Feature variance after normalization:\n", X_train_normed.var().round(3))

# Task 10: Train logistic regression
from sklearn.linear_model import LogisticRegression

# Fit on the log-normalized training features, allowing up to 1000 solver iterations.
logreg = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_normed, y_train)

# Score the held-out split with column 1 of the predicted probabilities.
logreg_preds = logreg.predict_proba(X_test_normed)[:, 1]
logreg_auc_score = roc_auc_score(y_true=y_test, y_score=logreg_preds)
print("Logistic Regression AUC Score:", round(logreg_auc_score, 4))

# Task 11: Sort models by AUC
from operator import itemgetter

# (name, AUC) pairs for the two models scored above.
models = [('TPOT', tpot_auc_score), ('Logistic Regression', logreg_auc_score)]

# Highest AUC first. A copy is sorted so `models` keeps its original order.
models_sorted = list(models)
models_sorted.sort(key=itemgetter(1), reverse=True)

print("Models sorted by AUC:")
for name, auc in models_sorted:
    print(f"{name}: {round(auc, 4)}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB
None
   Recency (months)  Frequency (times)  Monetary (c.c. blood)  Time (months)  \
0                 2                 50                  12500             98   
1                 0                 13                   3250             28   

   target  
0       1  
1       1  
target
0    0.762
1    0.238
Name: proportion, dtype: float64
     Recency (mo

Version 0.12.2 of tpot is outdated. Version 1.0.0 was released Wednesday February 26, 2025.


Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.7473626652012425

Generation 2 - Current best internal CV score: 0.7473626652012425

Generation 3 - Current best internal CV score: 0.7473626652012425

Generation 4 - Current best internal CV score: 0.7473626652012425

Generation 5 - Current best internal CV score: 0.7473626652012425

Generation 6 - Current best internal CV score: 0.7474575280320834

Generation 7 - Current best internal CV score: 0.750339035236436

Generation 8 - Current best internal CV score: 0.750339035236436

Generation 9 - Current best internal CV score: 0.750339035236436

Generation 10 - Current best internal CV score: 0.7528133415958314

Generation 11 - Current best internal CV score: 0.7543738624449979

Generation 12 - Current best internal CV score: 0.7543738624449979

Generation 13 - Current best internal CV score: 0.7543738624449979

Generation 14 - Current best internal CV score: 0.7543738624449979

Generation 15 - Current best internal CV score: 0.754373862