# In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.stats import shapiro
import matplotlib.pyplot as plt 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV 
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# Load the drug classification dataset
df = pd.read_csv('/kaggle/input/drug-classification/drug200.csv')
df.head()

# Integer codes for the categorical levels observed in each column
# (BP: HIGH / LOW / NORMAL, Cholesterol: HIGH / NORMAL)
bp_codes = {'HIGH': 1, 'LOW': -1, 'NORMAL': 0}
cholesterol_codes = {'HIGH': 1, 'NORMAL': -1}

# Inspect the unique values of BP and Cholesterol before encoding them
df['BP'].unique()
df['Cholesterol'].unique()

# Encode BP and Cholesterol; any value missing from a mapping becomes NaN
df['BP'] = df['BP'].map(bp_codes)
df.head()
df['Cholesterol'] = df['Cholesterol'].map(cholesterol_codes)
df.head()
# Statistical check to decide whether to standardize or normalize the Na_to_K column
# Test 1: Shapiro-Wilk test for normality (missing values are dropped first)
data = df['Na_to_K'].dropna()
stat, p = shapiro(data)
print('Statistics=%.3f, p=%.3f' % (stat, p))
# Recorded result: stat = 0.902 and p = 0.000, suggesting the data is not normally distributed

# Optional histogram to inspect the shape of the distribution visually
#plt.figure(figsize=(10, 6))
#plt.hist(data, bins=15, color='blue', alpha=0.7)
#plt.title('Histogram of Na_to_K Name')
#plt.xlabel('Value')
#plt.ylabel('Frequency')
#plt.grid(axis='y', alpha=0.75)
#plt.show()

# The data was judged to be skewed (recorded as "left skewed" -- NOTE(review): confirm
# the direction against the histogram), so min-max normalization is applied to Na_to_K,
# rescaling it to the [0, 1] range in a new column.
na_to_k = df['Na_to_K']
min_value, max_value = na_to_k.min(), na_to_k.max()
value_range = max_value - min_value
df['Normalized_Na_to_K'] = (na_to_k - min_value) / value_range

# Encode the Drug (target) and Sex columns as integers
drug_codes = {'DrugY': 0, 'drugC': 1, 'drugX': 2, 'drugA': 3, 'drugB': 4}
sex_codes = {'F': 0, 'M': 1}

# Inspect the raw labels, then replace them with their integer codes;
# any label missing from a mapping becomes NaN
df['Drug'].unique()
df['Drug'] = df['Drug'].map(drug_codes)
df.head()

df['Sex'].unique()
df['Sex'] = df['Sex'].map(sex_codes)

# Using multi-class Logistic Regression to make predictions
# Feature matrix: the encoded categorical columns plus the normalized Na_to_K ratio
X = df[['Age', 'Sex', 'BP', 'Cholesterol', 'Normalized_Na_to_K']]
# Target vector: select the column as a 1-D Series (single brackets). A one-column
# DataFrame would reach the estimator as a column vector and trigger scikit-learn's
# DataConversionWarning when fitting.
y = df['Drug']
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Hyperparameter search space for tuning the logistic regression model.
# It is written as a list of grids (one per solver) so that every generated
# combination is valid: 'lbfgs' is only paired with the 'l2' penalty, while
# 'saga' is paired with both 'l1' and 'l2'. A single crossed grid would also
# generate 'lbfgs' + 'l1', which the solver rejects.
param_grid = [
    {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs'],
        'max_iter': [100, 200, 300],
        'penalty': ['l2'],
        'multi_class': ['ovr', 'multinomial'],
    },
    {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['saga'],
        'max_iter': [100, 200, 300],
        'penalty': ['l1', 'l2'],
        'multi_class': ['ovr', 'multinomial'],
    },
]
# Initialising the model with fixed hyperparameters (the values recorded as the
# best result of the grid search kept in the comments further down).
# NOTE(review): recent scikit-learn releases deprecate `multi_class`; confirm
# against the installed version before upgrading.
model = LogisticRegression(
    C=100,
    max_iter=100,
    multi_class='ovr',
    penalty='l2',
    solver='lbfgs',
)
# Fit on the training split and predict the held-out test split
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Best Results : {y_pred}")
# Keep the score in its own variable: assigning it to `accuracy_score` would
# replace the imported function with a float and break any re-run of this code.
# Arguments follow the documented (y_true, y_pred) order.
accuracy = accuracy_score(y_test, y_pred)
print(f"ACCURACY SCORE : {accuracy}")
# Accuracy Score = 97.5%
# Performing the Grid Search 
#grid_search = GridSearchCV(model , param_grid , cv = 5 , scoring= 'accuracy', error_score=np.nan)
# Fitting data into grid-search
#grid_search.fit(X_train , y_train)
#print("Best hyperparameters:", grid_search.best_params_) #Best hyperparameters: {'C': 100, 'max_iter': 100, 'multi_class': 'ovr', 'penalty': 'l2', 'solver': 'lbfgs'}
# Making Prediction and checking accuracy score 
#best_model = grid_search.best_estimator_ 
#y_pred = best_model.predict(X_test)
#print(f"Best Results : {y_pred} ")
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session