## Importing Libraries

In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings(action = 'ignore')

## Importing Dataset

In [4]:
data = pd.read_csv('D:/New Projects/Prescription Drug Type Prediction/Islander_data.csv')

## Exploratory Data Analysis

In [5]:
data.head()

Unnamed: 0,first_name,last_name,age,Happy_Sad_group,Dosage,Drug,Mem_Score_Before,Mem_Score_After,Diff
0,Bastian,Carrasco,25,H,1,A,63.5,61.2,-2.3
1,Evan,Carrasco,52,S,1,A,41.6,40.7,-0.9
2,Florencia,Carrasco,29,H,1,A,59.7,55.1,-4.6
3,Holly,Carrasco,50,S,1,A,51.7,51.2,-0.5
4,Justin,Carrasco,52,H,1,A,47.0,47.1,0.1


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   first_name        198 non-null    object 
 1   last_name         198 non-null    object 
 2   age               198 non-null    int64  
 3   Happy_Sad_group   198 non-null    object 
 4   Dosage            198 non-null    int64  
 5   Drug              198 non-null    object 
 6   Mem_Score_Before  198 non-null    float64
 7   Mem_Score_After   198 non-null    float64
 8   Diff              198 non-null    float64
dtypes: float64(3), int64(2), object(4)
memory usage: 14.0+ KB


### Creating Functions for Pre-processing

In [7]:
{column : len(data[column].unique()) for column in data}

{'first_name': 139,
 'last_name': 18,
 'age': 45,
 'Happy_Sad_group': 2,
 'Dosage': 3,
 'Drug': 3,
 'Mem_Score_Before': 162,
 'Mem_Score_After': 151,
 'Diff': 142}

- Binary encode the 'Happy_Sad_group' column because it is an categorical variable
- 'Drug' column is the target variable and also an categorical variable, so we are not going to change it
- Also using OneHotEncoding on the 'first_name' & 'last_name' column in this prediction model because
  the dataset only consists of 198 records,<br> 
  so we cannot get rid of any columns;<br>
  Those columns may or may not contribute to the prediction model but due to lack of records but
  we are going to continue with those records due to lack of data

In [8]:
# this function creates binary values for categorical variables
def onehot_encode(df, column):
    df = df.copy()

    dummies = pd.get_dummies(df[column], prefix = column)

    if len(df[column].unique()) == 2:
        dummies = dummies.drop(dummies.columns[0], axis = 1)

    df = pd.concat([df, dummies], axis = 1)
    df = df.drop(column, axis = 1)

    return df

In [9]:
# this function initiates the above function, splits data into 'X' & 'y', train-test-split and scaling X
def preprocess(df):
    df = df.copy()

    # OneHotEncoding categorical features
    for column in ['first_name', 'last_name', 'Happy_Sad_group']:
        df = onehot_encode(df, column = column)

    # Spliting df into 'X' & 'y'
    X = df.drop('Drug', axis = 1)
    y = df['Drug']

    # Train, test & split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)                # transform returns a Series, so we are converting into a DataFrame
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [10]:
X_train, X_test, y_train, y_test = preprocess(data)

In [11]:
X_train

Unnamed: 0,age,Dosage,Mem_Score_Before,Mem_Score_After,Diff,first_name_Aaron,first_name_Adam,first_name_Ai,first_name_Akane,first_name_Akira,...,last_name_Lopez,last_name_McCarthy,last_name_Morin,last_name_Novak,last_name_Price,last_name_Rodriguez,last_name_Steiner,last_name_Summers,last_name_Takahashi,Happy_Sad_group_S
124,-0.302247,1.206716,0.249183,-0.151850,-0.594735,0.0,-0.121268,0.0,-0.121268,0.0,...,2.761340,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,1.0
97,0.909251,0.025675,1.221038,1.038471,-0.092208,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,1.0
42,1.428464,0.025675,0.438505,-0.684661,-1.707473,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,3.785939,-0.336011,1.0
17,-1.167603,-1.155366,0.413262,0.698379,0.518003,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,11.704700,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,-1.0
5,-0.215712,-1.155366,0.564720,-0.117841,-0.989577,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,-0.215712,-1.155366,-0.665875,-0.752679,-0.244761,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,1.0
137,1.947677,-1.155366,2.432701,2.370498,0.293661,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,-1.0
72,1.082322,-1.155366,0.552099,0.069210,-0.675498,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,1.0
140,-0.475319,-1.155366,-0.533349,-1.115444,-1.007525,0.0,-0.121268,0.0,-0.121268,0.0,...,-0.362143,-0.23116,-0.085436,-0.085436,0.0,-0.085436,-0.264135,-0.264135,-0.336011,1.0


## Training models

In [12]:
# Creating a dictionary of the models that we prefer and performing it on the train data
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier(),
    "                               XGBoost": XGBClassifier(eval_metric='mlogloss'),
    "                              LightGBM": LGBMClassifier(),
    "                              CatBoost": CatBoostClassifier(verbose=0)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained")

                   Logistic Regression trained
                   K-Nearest Neighbors trained
                         Decision Tree trained
Support Vector Machine (Linear Kernel) trained
   Support Vector Machine (RBF Kernel) trained
                        Neural Network trained
                         Random Forest trained
                     Gradient Boosting trained
                               XGBoost trained
                              LightGBM trained
                              CatBoost trained


## Results with accuracy

In [13]:
# After training the model, printing the corresponding model's accuracy
for name, model in models.items():
    acc = print(name + ": {:.2f}%".format(model.score(X_test, y_test) * 100))

                   Logistic Regression: 41.67%
                   K-Nearest Neighbors: 36.67%
                         Decision Tree: 51.67%
Support Vector Machine (Linear Kernel): 40.00%
   Support Vector Machine (RBF Kernel): 43.33%
                        Neural Network: 41.67%
                         Random Forest: 46.67%
                     Gradient Boosting: 50.00%
                               XGBoost: 38.33%
                              LightGBM: 38.33%
                              CatBoost: 48.33%
