In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col
import numpy as np
import os

In [2]:
os.environ["SPARK_HOME"] = "C:/spark-2.4.4-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = "C:/winutils"

In [3]:
# Creating spark session
spark = SparkSession.builder.appName("ICP7").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [4]:
# Loading the dataset
ICP7 = spark.read.format("csv").option("header", True).option("inferSchema", True).option("delimiter", ",").load("D:/Datasets/Clustering/dataset_diabetes/diabetic_data.csv")

In [5]:
ICP7.printSchema()

root
 |-- encounter_id: integer (nullable = true)
 |-- patient_nbr: integer (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- admission_type_id: integer (nullable = true)
 |-- discharge_disposition_id: integer (nullable = true)
 |-- admission_source_id: integer (nullable = true)
 |-- time_in_hospital: integer (nullable = true)
 |-- payer_code: string (nullable = true)
 |-- medical_specialty: string (nullable = true)
 |-- num_lab_procedures: integer (nullable = true)
 |-- num_procedures: integer (nullable = true)
 |-- num_medications: integer (nullable = true)
 |-- number_outpatient: integer (nullable = true)
 |-- number_emergency: integer (nullable = true)
 |-- number_inpatient: integer (nullable = true)
 |-- diag_1: string (nullable = true)
 |-- diag_2: string (nullable = true)
 |-- diag_3: string (nullable = true)
 |-- number_diagnoses: integer (nullable = true)
 |-

In [6]:
numeric_features = [t[0] for t in ICP7.dtypes if t[1] == 'int']
ICP7.select(numeric_features).describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
encounter_id,101766,1.652016456229782E8,1.0264029598345728E8,12522,443867222
patient_nbr,101766,5.4330400694947235E7,3.869635934653422E7,135,189502619
admission_type_id,101766,2.024006053102215,1.445402829756122,1,8
discharge_disposition_id,101766,3.7156417664052825,5.280165509299273,1,28
admission_source_id,101766,5.754436648782501,4.0640808342839,1,25
time_in_hospital,101766,4.395986871843248,2.985107767471266,1,14
num_lab_procedures,101766,43.09564098028811,19.67436224914214,1,132
num_procedures,101766,1.339730361810428,1.7058069791211594,0,6
num_medications,101766,16.021844230882614,8.127566209167286,1,81


In [7]:
ICP7 = ICP7.select("encounter_id", "patient_nbr", "admission_type_id", "discharge_disposition_id", "admission_source_id", "time_in_hospital", "num_lab_procedures", "num_procedures", "num_medications", "number_outpatient", "number_emergency", "number_inpatient", "number_diagnoses")

In [8]:
# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=ICP7.columns, outputCol="features")
ICP7 = assembler.transform(ICP7)

In [10]:
# Trains a k-means model.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(ICP7)

In [11]:
# Make predictions
predictions = model.transform(ICP7)

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[1.06791897e+08 4.38505064e+07 2.15585583e+00 4.03063473e+00
 5.89684519e+00 4.50131934e+00 4.29558239e+01 1.34394120e+00
 1.56808653e+01 2.88398527e-01 1.51709340e-01 6.09302056e-01
 7.12043669e+00]
[2.88059477e+08 7.63735890e+07 1.74667602e+00 3.05309222e+00
 5.45489754e+00 4.17443279e+00 4.33897292e+01 1.33087338e+00
 1.67390522e+01 5.39643816e-01 2.94858502e-01 6.90808734e-01
 8.05818492e+00]
