In [1]:
pip install pyspark




# About this file
## The data set contains various information that effect the predictions like Age, Sex, BP, Cholesterol levels, Na to Potassium Ratio and finally the drug type.

### Import necessary libraries

In [2]:
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

print([(x.__name__,x.__version__) for x in [np, pd, pyspark]])

spark = pyspark.sql.SparkSession.builder.appName('Drug_calssification').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
sc.setLogLevel("INFO")

[('numpy', '1.21.6'), ('pandas', '1.3.5'), ('pyspark', '3.2.1')]




In [3]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer,
                               OneHotEncoder, StringIndexer)

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

### Read the dataset

In [4]:
df = spark.read.csv('drug200.csv',header=True,inferSchema=True)
print(df.count())
df.show()

200
+---+---+------+-----------+-------+-----+
|Age|Sex|    BP|Cholesterol|Na_to_K| Drug|
+---+---+------+-----------+-------+-----+
| 23|  F|  HIGH|       HIGH| 25.355|DrugY|
| 47|  M|   LOW|       HIGH| 13.093|drugC|
| 47|  M|   LOW|       HIGH| 10.114|drugC|
| 28|  F|NORMAL|       HIGH|  7.798|drugX|
| 61|  F|   LOW|       HIGH| 18.043|DrugY|
| 22|  F|NORMAL|       HIGH|  8.607|drugX|
| 49|  F|NORMAL|       HIGH| 16.275|DrugY|
| 41|  M|   LOW|       HIGH| 11.037|drugC|
| 60|  M|NORMAL|       HIGH| 15.171|DrugY|
| 43|  M|   LOW|     NORMAL| 19.368|DrugY|
| 47|  F|   LOW|       HIGH| 11.767|drugC|
| 34|  F|  HIGH|     NORMAL| 19.199|DrugY|
| 43|  M|   LOW|       HIGH| 15.376|DrugY|
| 74|  F|   LOW|       HIGH| 20.942|DrugY|
| 50|  F|NORMAL|       HIGH| 12.703|drugX|
| 16|  F|  HIGH|     NORMAL| 15.516|DrugY|
| 69|  M|   LOW|     NORMAL| 11.455|drugX|
| 43|  M|  HIGH|       HIGH| 13.972|drugA|
| 23|  M|   LOW|       HIGH|  7.298|drugC|
| 32|  F|  HIGH|     NORMAL| 25.974|DrugY|
+---+--

In [5]:
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Sex: string (nullable = true)
 |-- BP: string (nullable = true)
 |-- Cholesterol: string (nullable = true)
 |-- Na_to_K: double (nullable = true)
 |-- Drug: string (nullable = true)



In [6]:
print(df.columns)

['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug']


In [7]:
my_cols = [ 'Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug']
df = df.select(my_cols)

In [8]:
from pyspark.sql.functions import isnan, when, count, col
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+---+---+---+-----------+-------+----+
|Age|Sex| BP|Cholesterol|Na_to_K|Drug|
+---+---+---+-----------+-------+----+
|  0|  0|  0|          0|      0|   0|
+---+---+---+-----------+-------+----+



### Label Encoding and one-hot encoding

In [9]:
Sex_indexer = StringIndexer(inputCol='Sex', outputCol='Sex_index')

Sex_encoder = OneHotEncoder(inputCol='Sex_index', outputCol='Sex_vec')

In [10]:
BP_indexer = StringIndexer(inputCol='BP', outputCol='BP_index')

BP_encoder = OneHotEncoder(inputCol='BP_index', outputCol='BP_vec')

In [11]:
Cholesterol_indexer = StringIndexer(inputCol='Cholesterol', outputCol='Cholesterol_index')

Cholesterol_encoder = OneHotEncoder(inputCol='Cholesterol_index', outputCol='Cholesterol_vec')

In [12]:
Drug_indexer = StringIndexer(inputCol='Drug', outputCol='Drug_index')

In [13]:
df.columns

['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug']

In [14]:
assembler = VectorAssembler(inputCols=['Age', 'Sex_vec', 'BP_vec', 'Cholesterol_vec', 'Na_to_K'],
                           outputCol='features')


# Logistic Regression Model

In [15]:
lr = LogisticRegression(featuresCol='features',labelCol='Drug_index')

In [16]:
pipeline = Pipeline(stages=[Sex_indexer, BP_indexer,Cholesterol_indexer,
                           Sex_encoder, BP_encoder, Cholesterol_encoder, Drug_indexer,
                           assembler, lr])

In [17]:
train, test = df.randomSplit([0.7, 0.3])

In [18]:
lr_model = pipeline.fit(train)

In [19]:
results = lr_model.transform(test)
results.show(5)

+---+---+------+-----------+-------+-----+---------+--------+-----------------+---------+-------------+---------------+----------+--------------------+--------------------+--------------------+----------+
|Age|Sex|    BP|Cholesterol|Na_to_K| Drug|Sex_index|BP_index|Cholesterol_index|  Sex_vec|       BP_vec|Cholesterol_vec|Drug_index|            features|       rawPrediction|         probability|prediction|
+---+---+------+-----------+-------+-----+---------+--------+-----------------+---------+-------------+---------------+----------+--------------------+--------------------+--------------------+----------+
| 16|  F|  HIGH|     NORMAL| 15.516|DrugY|      1.0|     0.0|              1.0|(1,[],[])|(2,[0],[1.0])|      (1,[],[])|       0.0|[16.0,0.0,1.0,0.0...|[91.4228114151397...|[7.42493825560768...|       2.0|
| 18|  F|  HIGH|       HIGH| 37.188|DrugY|      1.0|     0.0|              0.0|(1,[],[])|(2,[0],[1.0])|  (1,[0],[1.0])|       0.0|[18.0,0.0,1.0,0.0...|[738.634530989924...|[1.0,0.0

In [20]:
my_eval = MulticlassClassificationEvaluator(predictionCol='prediction',
                                       labelCol='Drug_index')

In [21]:
results.select('Drug_index','prediction').show()

+----------+----------+
|Drug_index|prediction|
+----------+----------+
|       0.0|       2.0|
|       0.0|       0.0|
|       2.0|       2.0|
|       1.0|       1.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       1.0|       1.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       4.0|       4.0|
|       1.0|       1.0|
|       0.0|       0.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       2.0|       2.0|
|       0.0|       0.0|
|       4.0|       4.0|
|       1.0|       1.0|
|       1.0|       1.0|
+----------+----------+
only showing top 20 rows



In [22]:
auc = my_eval.evaluate(results)
auc

0.9670833333333334

# Random Forest Classifier 

In [23]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'Drug_index')

In [24]:
pipeline2 = Pipeline(stages=[Sex_indexer, BP_indexer,Cholesterol_indexer,
                           Sex_encoder, BP_encoder, Cholesterol_encoder, Drug_indexer,
                           assembler, rf])

In [25]:
train, test = df.randomSplit([0.7, 0.3])

In [26]:
rf_model = pipeline2.fit(train)

In [27]:
results2 = rf_model.transform(test)
results2.show(5)

+---+---+------+-----------+-------+-----+---------+--------+-----------------+-------------+-------------+---------------+----------+--------------------+--------------------+--------------------+----------+
|Age|Sex|    BP|Cholesterol|Na_to_K| Drug|Sex_index|BP_index|Cholesterol_index|      Sex_vec|       BP_vec|Cholesterol_vec|Drug_index|            features|       rawPrediction|         probability|prediction|
+---+---+------+-----------+-------+-----+---------+--------+-----------------+-------------+-------------+---------------+----------+--------------------+--------------------+--------------------+----------+
| 15|  M|  HIGH|     NORMAL| 17.206|DrugY|      0.0|     0.0|              0.0|(1,[0],[1.0])|(2,[0],[1.0])|  (1,[0],[1.0])|       0.0|[15.0,1.0,1.0,0.0...|[20.0,0.0,0.0,0.0...|[1.0,0.0,0.0,0.0,...|       0.0|
| 17|  M|NORMAL|     NORMAL| 10.832|drugX|      0.0|     2.0|              0.0|(1,[0],[1.0])|    (2,[],[])|  (1,[0],[1.0])|       1.0|[17.0,1.0,0.0,0.0...|[0.0,17.8

In [28]:
my_eval2 = MulticlassClassificationEvaluator(predictionCol='prediction',
                                       labelCol='Drug_index')

In [29]:
results2.select('Drug_index','prediction').show()

+----------+----------+
|Drug_index|prediction|
+----------+----------+
|       0.0|       0.0|
|       1.0|       1.0|
|       0.0|       0.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       2.0|       2.0|
|       4.0|       4.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       1.0|       1.0|
|       2.0|       2.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       1.0|       1.0|
|       1.0|       1.0|
|       0.0|       0.0|
|       0.0|       0.0|
|       1.0|       1.0|
|       0.0|       0.0|
+----------+----------+
only showing top 20 rows



In [30]:
auc2 = my_eval2.evaluate(results2)
auc2

1.0