# Drug Classification Analysis

This analysis will help predict the best drug to use for a particular patient, based on medical data from 200 patients, by classifying the patient to the drug profile.

## Import Libraries and Data

In [None]:
!pip install jupyter-dash

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jupyter-dash
  Downloading jupyter_dash-0.4.2-py3-none-any.whl (23 kB)
Collecting ansi2html
  Downloading ansi2html-1.8.0-py3-none-any.whl (16 kB)
Collecting retrying
  Downloading retrying-1.3.3.tar.gz (10 kB)
Collecting dash
  Downloading dash-2.6.1-py3-none-any.whl (9.9 MB)
[K     |████████████████████████████████| 9.9 MB 42.6 MB/s 
Collecting dash-core-components==2.0.0
  Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Collecting dash-table==5.0.0
  Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Collecting dash-html-components==2.0.0
  Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Collecting flask-compress
  Downloading Flask_Compress-1.12-py3-none-any.whl (7.9 kB)
Collecting brotli
  Downloading Brotli-1.0.9-cp37-cp37m-manylinux1_x86_64.whl (357 kB)
[K     |████████████████████████████████| 357 kB 73.3 MB/s 
Building wheels for

In [None]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import plotly.express as px

# Ensure visualizations can be viewed by all
colorblind_seq = ['#E66100', '#5D3A9B']

# Load the dataset
explore_df = pd.read_csv('drug200.csv')

## Exploratory Data Analysis

In [None]:
explore_df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [None]:
explore_df.describe()

Unnamed: 0,Age,Na_to_K
count,200.0,200.0
mean,44.315,16.084485
std,16.544315,7.223956
min,15.0,6.269
25%,31.0,10.4455
50%,45.0,13.9365
75%,58.0,19.38
max,74.0,38.247


In [None]:
explore_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


### Univariate Analysis

In [None]:
# Imports for dashboards
import dash
from jupyter_dash import JupyterDash
from dash import html, dcc
from dash.dependencies import Input, Output
import plotly.graph_objects as go

In [None]:
# Create the application for Univariate Analysis
app = JupyterDash(__name__)

app.layout = html.Div(id = 'parent', children = [
        
        # Define dropdown
        dcc.Dropdown( id = 'dropdown',
        options = [{'label':c, 'value':c } for c in explore_df.columns],
            value = 'Na_to_K'),
            dcc.Graph(id = 'bar_plot')],
        style = {'width': '20%'})
    
@app.callback(Output(component_id='bar_plot', component_property='figure'),
              [Input(component_id='dropdown', component_property='value')])

# Update the graph on callback
def graph_update(dropdown_value):
    fig = px.histogram(x = explore_df['{}'.format(dropdown_value)],
                           title='Univariate Analysis',
                           labels={'x':dropdown_value},
                           nbins=30,
                           color_discrete_sequence=colorblind_seq,
                           width=700,
                           height=500,)
    # Return the new graph
    return fig  

# Run app and display result inline in the notebook
app.run_server(mode='inline')

<IPython.core.display.Javascript object>

**Inference:** Na_to_k has a positive skew. 

In [None]:
# Skewness
print("Skewness of Na_to_K: %f" % explore_df['Na_to_K'].skew())

Skewness of Na_to_K: 1.039341


**Inference**: Drug Y is widely used compared to other drugs.

### Bivariate Analysis

In [None]:
# Create the application for graph of Drug, Age, Na_to_K
app = JupyterDash(__name__)

app.layout = html.Div(id = 'parent', children = [
        
        # Define dropdown
        dcc.Dropdown( id = 'dropdown',
        options = [{'label':'Na_to_K', 'value':'Na_to_K'},
                   {'label':'Age', 'value':'Age'}],
            value = 'Na_to_K'),
            dcc.Graph(id = 'bar_plot')],
        style = {'width': '20%'})
    
@app.callback(Output(component_id ='bar_plot', component_property='figure'),
              [Input(component_id ='dropdown', component_property='value')])

# Update the graph on callback
def graph_update(dropdown_value):
    fig = px.scatter(x = explore_df['Drug'],
                       y = explore_df['{}'.format(dropdown_value)],
                           title='Bivariate Analysis',
                           labels={'x':'Drug','y':'{}'.format(dropdown_value)},
                           color_discrete_sequence=colorblind_seq,
                           width=700,
                           height=500,)
    # Return the new graph
    return fig  

# Run app and display result inline in the notebook
app.run_server(mode='inline')

<IPython.core.display.Javascript object>

### Multivariate Analysis

In [None]:
# Create the application
app = JupyterDash(__name__)

app.layout = html.Div(id = 'parent', children = [
        
        # Define dropdown
        dcc.Dropdown( id = 'dropdown',
        options = [{'label':'Sex', 'value':'Sex'},
                   {'label':'BP', 'value':'BP'},
                   {'label':'Cholesterol', 'value':'Cholesterol'}],
            value = 'Sex'),
            dcc.Graph(id = 'bar_plot')],
        style = {'width': '20%'})
    
@app.callback(Output(component_id ='bar_plot', component_property = 'figure'),
              [Input(component_id ='dropdown', component_property = 'value')])

# Update the graph on callback
def graph_update(dropdown_value):
    fig = px.histogram(x = explore_df['Drug'],
                           title = 'Multivariate Analysis',
                           barmode = 'group',
                           color = explore_df['{}'.format(dropdown_value)],
                           labels = {'x':'Drug'},
                           nbins = 30,
                           color_discrete_sequence=colorblind_seq,
                           width = 700,
                           height = 500,)
    # Return the new graph
    return fig  

# Run app and display result inline in the notebook
app.run_server(mode='inline')

<IPython.core.display.Javascript object>

### Data Wrangling

In [None]:
# Make age into 7 groups
bin_age = [0, 19, 29, 39, 49, 59, 69, 80]
category_age = ['<20s', '20s', '30s', '40s', '50s', '60s', '>60s']
explore_df['Age_binned'] = pd.cut(explore_df['Age'], bins=bin_age, labels=category_age)

In [None]:
# Create new column that combines both BP and Cholesterol
explore_df['BP_Chol'] = explore_df['BP'].str.cat(explore_df['Cholesterol'],sep=" ")

In [None]:
# Make Na_to_K into 5 groups
bin_NatoK = [0, 9, 19, 29, 50]
category_NatoK = ['<10', '10-20', '20-30', '>30']
explore_df['Na_to_K_binned'] = pd.cut(explore_df['Na_to_K'], bins=bin_NatoK, labels=category_NatoK)

In [None]:
# Split data into features and target
y_df = explore_df['Drug']
X_df = explore_df.drop(['Drug'],axis=1)

## Split, Encode, Train, and Test the Data

In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=22)

In [None]:
# Encode the data
X_train, X_test = [pd.get_dummies(df) for df in [X_train, X_test]]
y_train, y_test = [pd.get_dummies(df) for df in [y_train, y_test]]

In [None]:
# Feed pipeline into GridSearchCV
pipeline = Pipeline([('scaler' ,StandardScaler()),
                     ('rfc',RandomForestClassifier())])

param_grid = {
    'rfc__max_depth': [4, 5, 10],
    'rfc__max_features': [2, 3],
    'rfc__min_samples_leaf': [3, 4, 5],
    'rfc__n_estimators': [100, 200, 300]}

# Initialize
grid_pipeline = GridSearchCV(pipeline, param_grid, n_jobs=-1, verbose=1, cv=5, scoring='f1')
# Fit
grid_pipeline.fit(X_train,y_train)
grid_pipeline.best_params_

Fitting 5 folds for each of 54 candidates, totalling 270 fits



One or more of the test scores are non-finite: [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]



{'rfc__max_depth': 4,
 'rfc__max_features': 2,
 'rfc__min_samples_leaf': 3,
 'rfc__n_estimators': 100}

In [None]:
# Train the data on the Random Forest Classifier and test the accuracy
rfc = RandomForestClassifier(max_depth=4, max_features=2, min_samples_leaf=3, n_estimators=100)
rfc.fit(X_train,y_train)

# Making predictions
y_train_pred = rfc.predict(X_train)
y_test_pred = rfc.predict(X_test)

# Get Scores
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Results for Random Forest Classifier')
print('The training accuracy is',round((train_accuracy*100),2),'%')
print('The test accuracy is', round((test_accuracy*100),2),'%')


Results for Random Forest Classifier
The training accuracy is 61.25 %
The test accuracy is 57.5 %


In [None]:
''' Classification report will return 0.0 (warning) when 
    there is not a f1 score to calculate for a label '''
import warnings
warnings.filterwarnings('ignore')
print(classification_report(y_test, y_test_pred, labels=np.unique(y_test_pred)))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97        20
           1       0.00      0.00      0.00         7

   micro avg       1.00      0.70      0.83        27
   macro avg       0.50      0.47      0.49        27
weighted avg       0.74      0.70      0.72        27
 samples avg       0.47      0.47      0.47        27



In [None]:
# Feed pipeline into GridSearchCV
pipeline = Pipeline([('scaler', StandardScaler()),
                     ('dtc', DecisionTreeClassifier())])

param_grid = {
    'dtc__min_samples_leaf':[5,10,15],
    'dtc__criterion':['gini', 'entropy'],
    'dtc__max_depth':[2,4,6,8,10,12]}
# Initialize
grid_pipeline = GridSearchCV(pipeline, param_grid, n_jobs=-1, verbose=1, cv=5, scoring='f1')
# Fit
grid_pipeline.fit(X_train,y_train)
grid_pipeline.best_params_

Fitting 5 folds for each of 36 candidates, totalling 180 fits


{'dtc__criterion': 'gini', 'dtc__max_depth': 2, 'dtc__min_samples_leaf': 5}

In [None]:
# Train the data on the Decision Tree Classifier and test the accuracy
dtc = DecisionTreeClassifier(criterion='gini', max_depth=2, min_samples_leaf=5)
dtc.fit(X_train,y_train)

# Making predictions
y_train_pred = dtc.predict(X_train)
y_test_pred = dtc.predict(X_test)

# Get Scores
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Results for Decision Tree Classifier')
print('The training accuracy is',round((train_accuracy*100),2),'%')
print('The test accuracy is', round((test_accuracy*100),2),'%')

Results for Decision Tree Classifier
The training accuracy is 83.12 %
The test accuracy is 85.0 %


In [None]:
print(classification_report(y_test, y_test_pred, labels=np.unique(y_test_pred)))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98        20
           1       0.64      1.00      0.78         7

   micro avg       0.84      1.00      0.92        27
   macro avg       0.79      1.00      0.88        27
weighted avg       0.87      1.00      0.92        27
 samples avg       0.68      0.68      0.68        27

