## 💊 Prescription Drug Type Prediction

Given *data about subjects' performances on a memory test*, let's try to predict which **prescription drug** a given subject ingested.

We will use a variety of classification models to make our predictions.

Data source: https://www.kaggle.com/datasets/steveahn/memory-test-on-drugged-islanders-data

### Importing Libraries

In [22]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
warnings.simplefilter(action='ignore')

In [6]:
data = pd.read_csv('Islander_data.csv')
data

Unnamed: 0,first_name,last_name,age,Happy_Sad_group,Dosage,Drug,Mem_Score_Before,Mem_Score_After,Diff
0,Bastian,Carrasco,25,H,1,A,63.5,61.2,-2.3
1,Evan,Carrasco,52,S,1,A,41.6,40.7,-0.9
2,Florencia,Carrasco,29,H,1,A,59.7,55.1,-4.6
3,Holly,Carrasco,50,S,1,A,51.7,51.2,-0.5
4,Justin,Carrasco,52,H,1,A,47.0,47.1,0.1
...,...,...,...,...,...,...,...,...,...
193,Jacob,Novak,52,H,3,T,71.3,74.3,3.0
194,Teo,Steiner,41,S,3,T,72.5,70.4,-2.1
195,Alexander,Takahashi,54,S,3,T,30.8,33.1,2.3
196,Alexandere,Takahashi,40,H,3,T,53.6,53.8,0.2


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   first_name        198 non-null    object 
 1   last_name         198 non-null    object 
 2   age               198 non-null    int64  
 3   Happy_Sad_group   198 non-null    object 
 4   Dosage            198 non-null    int64  
 5   Drug              198 non-null    object 
 6   Mem_Score_Before  198 non-null    float64
 7   Mem_Score_After   198 non-null    float64
 8   Diff              198 non-null    float64
dtypes: float64(3), int64(2), object(4)
memory usage: 14.0+ KB


### Preprocessing

In [8]:
df = data.copy()

In [9]:
{column: len(df[column].unique()) for column in df.columns}

{'first_name': 139,
 'last_name': 18,
 'age': 45,
 'Happy_Sad_group': 2,
 'Dosage': 3,
 'Drug': 3,
 'Mem_Score_Before': 162,
 'Mem_Score_After': 151,
 'Diff': 142}

In [10]:
def onehot_encode(df, column):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=column)
    if len(df[column].unique()) == 2:
        dummies = dummies.drop(dummies.columns[0], axis=1)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df

In [11]:
# One-hot encode categorical features
for column in ['first_name', 'last_name', 'Happy_Sad_group']:
    df = onehot_encode(df, column=column)

In [12]:
df

Unnamed: 0,age,Dosage,Drug,Mem_Score_Before,Mem_Score_After,Diff,first_name_Aaron,first_name_Adam,first_name_Ai,first_name_Akane,...,last_name_Lopez,last_name_McCarthy,last_name_Morin,last_name_Novak,last_name_Price,last_name_Rodriguez,last_name_Steiner,last_name_Summers,last_name_Takahashi,Happy_Sad_group_S
0,25,1,A,63.5,61.2,-2.3,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,52,1,A,41.6,40.7,-0.9,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,29,1,A,59.7,55.1,-4.6,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,50,1,A,51.7,51.2,-0.5,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,52,1,A,47.0,47.1,0.1,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,52,3,T,71.3,74.3,3.0,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
194,41,3,T,72.5,70.4,-2.1,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
195,54,3,T,30.8,33.1,2.3,False,False,False,False,...,False,False,False,False,False,False,False,False,True,True
196,40,3,T,53.6,53.8,0.2,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [13]:
# Split df into X and y
y = df['Drug'].copy()
X = df.drop('Drug', axis=1)

In [14]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=21)

In [15]:
X_train.shape, X_test.shape

((138, 163), (60, 163))

In [16]:
# Scale X
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

In [17]:
X_train

Unnamed: 0,age,Dosage,Mem_Score_Before,Mem_Score_After,Diff,first_name_Aaron,first_name_Adam,first_name_Ai,first_name_Akane,first_name_Akira,...,last_name_Lopez,last_name_McCarthy,last_name_Morin,last_name_Novak,last_name_Price,last_name_Rodriguez,last_name_Steiner,last_name_Summers,last_name_Takahashi,Happy_Sad_group_S
83,-0.998665,-1.261040,1.050460,1.628206,1.198270,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.32249,-0.23116,0.0,-0.121268,-0.085436,-0.121268,3.785939,-0.172774,-0.322490,1.044466
44,-0.998665,-0.035522,0.210036,0.233740,0.086473,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.32249,-0.23116,0.0,-0.121268,-0.085436,-0.121268,-0.264135,-0.172774,3.100868,-0.957427
115,0.807909,1.189995,0.331930,-0.366270,-1.090179,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.32249,-0.23116,0.0,-0.121268,-0.085436,-0.121268,-0.264135,-0.172774,-0.322490,-0.957427
182,2.012292,1.189995,0.614209,0.233740,-0.497220,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.32249,-0.23116,0.0,-0.121268,-0.085436,-0.121268,-0.264135,-0.172774,-0.322490,1.044466
157,3.732839,-0.035522,-0.861343,-0.899611,-0.256331,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.32249,-0.23116,0.0,-0.121268,-0.085436,-0.121268,-0.264135,-0.172774,-0.322490,-0.957427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,-1.084693,1.189995,2.051269,1.622650,-0.256331,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.32249,-0.23116,0.0,-0.121268,-0.085436,-0.121268,-0.264135,-0.172774,-0.322490,1.044466
112,-0.224419,1.189995,-0.874174,-1.249617,-0.821494,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.32249,-0.23116,0.0,-0.121268,-0.085436,-0.121268,-0.264135,-0.172774,-0.322490,-0.957427
48,-0.396474,1.189995,0.473069,2.961560,4.255712,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.32249,-0.23116,0.0,-0.121268,-0.085436,-0.121268,-0.264135,-0.172774,-0.322490,-0.957427
4,1.065991,-1.261040,-0.681711,-0.755165,-0.274861,-0.085436,-0.085436,0.0,-0.121268,-0.085436,...,-0.32249,-0.23116,0.0,-0.121268,-0.085436,-0.121268,-0.264135,-0.172774,-0.322490,-0.957427


In [18]:
y_train.value_counts()  # pretty even class distribution

Drug
S    47
A    46
T    45
Name: count, dtype: int64

In [26]:
y_train = y_train.apply(lambda x: 0 if x == 'S' else (1 if x == 'A' else 2)) 
y_test = y_test.apply(lambda x: 0 if x == 'S' else (1 if x == 'A' else 2))

### Training

In [27]:
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier(),
    "                               XGBoost": XGBClassifier(eval_metric='mlogloss'),
    "                              LightGBM": LGBMClassifier(),
    "                              CatBoost": CatBoostClassifier(verbose=0)
}

In [28]:
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                   Logistic Regression trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                        Neural Network trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 174
[LightGBM] [Info] Number of data points in the train set: 138, number of used features: 8
[LightGBM] [Info] Start training from score -1.077106
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.120591
                              LightGBM trained.
         

### Results

In [29]:
for name, model in models.items():
    print(name + ": {:.2f}%".format(model.score(X_test, y_test)*100))

                   Logistic Regression: 43.33%
                   K-Nearest Neighbors: 31.67%
                         Decision Tree: 41.67%
Support Vector Machine (Linear Kernel): 33.33%
   Support Vector Machine (RBF Kernel): 38.33%
                        Neural Network: 35.00%
                         Random Forest: 35.00%
                     Gradient Boosting: 38.33%
                               XGBoost: 40.00%
                              LightGBM: 36.67%
                              CatBoost: 41.67%
