# Importing Libraries

In [85]:
from pylab import *
import pandas as pd
import seaborn as sns

# Loading the Dataset

In [86]:
dataset = pd.read_csv('drug200.csv')

# Understanding the Dataset

In [87]:
dataset.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [88]:
dataset.shape

(200, 6)

In [89]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [90]:
dataset.drop_duplicates(inplace=True)

In [91]:
 dataset.shape                               #no duplicates existed

(200, 6)

In [92]:
dataset.head(18)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
5,22,F,NORMAL,HIGH,8.607,drugX
6,49,F,NORMAL,HIGH,16.275,DrugY
7,41,M,LOW,HIGH,11.037,drugC
8,60,M,NORMAL,HIGH,15.171,DrugY
9,43,M,LOW,NORMAL,19.368,DrugY


# Data pre-processing

## Converting categorical data into numerical data using dummy method

### Sex

In [93]:
dummies = pd.get_dummies(dataset['Sex'])
print(dummies)

     F  M
0    1  0
1    0  1
2    0  1
3    1  0
4    1  0
..  .. ..
195  1  0
196  0  1
197  0  1
198  0  1
199  1  0

[200 rows x 2 columns]


In [94]:
df = dataset.join(dummies)

In [95]:
df.rename(columns={"F":"Gender = Female?"},inplace=True)

In [96]:
df.drop(['M'],axis="columns",inplace=True)

In [97]:
df.drop(['Sex'],axis="columns",inplace=True)

In [98]:
df.head()

Unnamed: 0,Age,BP,Cholesterol,Na_to_K,Drug,Gender = Female?
0,23,HIGH,HIGH,25.355,DrugY,1
1,47,LOW,HIGH,13.093,drugC,0
2,47,LOW,HIGH,10.114,drugC,0
3,28,NORMAL,HIGH,7.798,drugX,1
4,61,LOW,HIGH,18.043,DrugY,1


## Encoding using LabelEncoder()

### BP

In [99]:
from sklearn.preprocessing import LabelEncoder

In [100]:
l = LabelEncoder()

In [101]:
bp = l.fit_transform(df['BP'])
dummies = pd.DataFrame(bp)
df2 = df.join(dummies)

In [102]:
df2.head()

Unnamed: 0,Age,BP,Cholesterol,Na_to_K,Drug,Gender = Female?,0
0,23,HIGH,HIGH,25.355,DrugY,1,0
1,47,LOW,HIGH,13.093,drugC,0,1
2,47,LOW,HIGH,10.114,drugC,0,1
3,28,NORMAL,HIGH,7.798,drugX,1,2
4,61,LOW,HIGH,18.043,DrugY,1,1


In [103]:
df2.rename(columns={0:'bp'},inplace=True)
df2.head()

Unnamed: 0,Age,BP,Cholesterol,Na_to_K,Drug,Gender = Female?,bp
0,23,HIGH,HIGH,25.355,DrugY,1,0
1,47,LOW,HIGH,13.093,drugC,0,1
2,47,LOW,HIGH,10.114,drugC,0,1
3,28,NORMAL,HIGH,7.798,drugX,1,2
4,61,LOW,HIGH,18.043,DrugY,1,1


In [104]:
df2.drop(['BP'],axis="columns",inplace=True)

In [105]:
df2.head()

Unnamed: 0,Age,Cholesterol,Na_to_K,Drug,Gender = Female?,bp
0,23,HIGH,25.355,DrugY,1,0
1,47,HIGH,13.093,drugC,0,1
2,47,HIGH,10.114,drugC,0,1
3,28,HIGH,7.798,drugX,1,2
4,61,HIGH,18.043,DrugY,1,1


### Cholestrol

In [106]:
chol = l.fit_transform(df2['Cholesterol'])

In [107]:
dummies = pd.DataFrame(chol)
df3 = df2.join(dummies)

In [109]:
df3.head()

Unnamed: 0,Age,Cholesterol,Na_to_K,Drug,Gender = Female?,bp,0
0,23,HIGH,25.355,DrugY,1,0,0
1,47,HIGH,13.093,drugC,0,1,0
2,47,HIGH,10.114,drugC,0,1,0
3,28,HIGH,7.798,drugX,1,2,0
4,61,HIGH,18.043,DrugY,1,1,0


In [110]:
df3.rename(columns={0:'cholestrol'},inplace=True)
df3.drop(['Cholesterol'],axis="columns",inplace=True)

In [111]:
df3.head()

Unnamed: 0,Age,Na_to_K,Drug,Gender = Female?,bp,cholestrol
0,23,25.355,DrugY,1,0,0
1,47,13.093,drugC,0,1,0
2,47,10.114,drugC,0,1,0
3,28,7.798,drugX,1,2,0
4,61,18.043,DrugY,1,1,0


### Drug

In [112]:
drug = l.fit_transform(df3['Drug'])

In [113]:
dummies = pd.DataFrame(drug)
df4 = df3.join(dummies)

In [114]:
df4.head()

Unnamed: 0,Age,Na_to_K,Drug,Gender = Female?,bp,cholestrol,0
0,23,25.355,DrugY,1,0,0,0
1,47,13.093,drugC,0,1,0,3
2,47,10.114,drugC,0,1,0,3
3,28,7.798,drugX,1,2,0,4
4,61,18.043,DrugY,1,1,0,0


In [116]:
df4.rename(columns={0:'drug'},inplace=True)
df4.head()

Unnamed: 0,Age,Na_to_K,Drug,Gender = Female?,bp,cholestrol,drug
0,23,25.355,DrugY,1,0,0,0
1,47,13.093,drugC,0,1,0,3
2,47,10.114,drugC,0,1,0,3
3,28,7.798,drugX,1,2,0,4
4,61,18.043,DrugY,1,1,0,0


In [117]:
df4.drop(['Drug'],axis="columns",inplace=True)
df4.head()

Unnamed: 0,Age,Na_to_K,Gender = Female?,bp,cholestrol,drug
0,23,25.355,1,0,0,0
1,47,13.093,0,1,0,3
2,47,10.114,0,1,0,3
3,28,7.798,1,2,0,4
4,61,18.043,1,1,0,0


# Splitting Dataset

In [123]:
from sklearn.model_selection import train_test_split

In [127]:
X_train,X_test,Y_train,Y_test = train_test_split(df4.drop(['drug'],axis="columns"),df4['drug'],test_size=0.2,random_state=101)

# Model Development

In [128]:
from sklearn.linear_model import LogisticRegression

In [131]:
l = LogisticRegression()
l.fit(X_train,Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [132]:
pred = l.predict(X_test)

# Finding the accuracy of the model 

In [133]:
from sklearn.metrics import accuracy_score
score = accuracy_score(Y_test,pred)

In [134]:
print(score*100)

72.5
