<html>
<font color = green size = 6>
<b>
KMeans Clustering for Data Exploration
</b>
</font>
</html>

In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pyspark.sql.types as typ
# SQLContext is a legacy entry point (deprecated since Spark 3.0); SparkSession
# is the unified entry point and exposes the same `read` API used below.
from pyspark.sql import SparkSession

sc = SparkContext.getOrCreate()
# builder.getOrCreate() reuses the already-running SparkContext.
spark = SparkSession.builder.getOrCreate()

In [2]:
# Load the raw query log: the first row holds the column names and Spark
# infers each column's type from the data.
refined_df = spark.read.csv(
    "newlogs.csv",
    header=True,
    inferSchema=True,
)

In [3]:
# Print the column names and inferred types of the loaded log.
refined_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- QueryTime: string (nullable = true)
 |-- QuerySeq: integer (nullable = true)
 |-- AvatarID: integer (nullable = true)
 |-- Guild: string (nullable = true)
 |-- Level: integer (nullable = true)
 |-- Race: string (nullable = true)
 |-- Class: string (nullable = true)
 |-- Zone: string (nullable = true)



In [4]:
# Keep only the columns that are carried through clustering and into the export.
cluster_df = refined_df.select(
    "Guild", "Level", "Race", "Class", "AvatarID", "Zone"
)

<html>
<font color = Red size = 6>
<b> Creating Transformers</b>
</font>
</html>

<html>
<font color = Purple size = 4>
<b> Encoding</b>
</font>
</html>

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, VectorIndexer

<html>
<font color = blue>
<b> 
To encode the categorical Guild, Race and Class columns, we will use OneHotEncoder.
However, OneHotEncoder cannot accept StringType columns; it only works on
numeric category indices, so each column is first converted to a numeric index with StringIndexer.
</b>
</font>
</html>

In [6]:
categoricalColumns = ["Guild", "Race", "Class"]
stages = []
for column in categoricalColumns:
    index_col = column + "Index"
    # StringIndexer maps each distinct string to a numeric category index.
    stages.append(StringIndexer(inputCol=column, outputCol=index_col))
    # OneHotEncoder turns that index into a binary sparse vector.
    stages.append(OneHotEncoder(inputCol=index_col, outputCol=column + "classVec"))

<html>
<font color = Purple size = 4>
<b> Transform all features into a vector using VectorAssembler</b>
</font>
</html>

In [7]:
# Names of the one-hot vector columns, one per categorical column.
# Built as a list rather than a one-shot `map` iterator so that cells which
# consume it can be re-run without silently getting an empty sequence.
assemblerinputsmap = [c + "classVec" for c in categoricalColumns]

<html>
<font color = Purple size = 4>
<b> In Python 3, map returns an iterator rather than a list, so it cannot be concatenated with a list directly; it is converted to a list first</b>
</font>
</html>

In [8]:
from itertools import chain
# `chain` over a single iterable is a no-op, so materialise the column names
# directly instead of appending them one at a time.
classvecinputs = list(assemblerinputsmap)

In [9]:
# Assembler inputs: the one-hot vector columns followed by the raw numeric column(s).
numericCols = ["Level"]
assemblerinputs = [*classvecinputs, *numericCols]

In [10]:
# Concatenate every input column into a single "features" vector and register
# the assembler as the next pipeline stage.
# NOTE(review): Level enters this vector unscaled next to 0/1 indicator values;
# consider whether it should be standardised before clustering.
assembler = VectorAssembler(
    inputCols=assemblerinputs,
    outputCol="features",
)
stages.append(assembler)

<html>
<font color = Red size = 6>
<b> KMeans clustering</b>
</font>
</html>


In [11]:
import pyspark.ml.clustering as clus
# Cluster the assembled feature vectors into six groups. An explicit seed fixes
# the centroid initialisation so the cluster labels are repeatable for the
# same input data instead of changing from one session to the next.
kmeans = clus.KMeans(k=6, featuresCol='features', seed=42)
stages += [kmeans]

In [12]:
# Chain everything collected in `stages` (indexers, encoders, the vector
# assembler and the KMeans estimator) into a single Pipeline.
pipeline = Pipeline(stages=stages)

In [13]:
# Fit every stage of the pipeline on the selected columns.
pipelineModel = pipeline.fit(cluster_df)

In [14]:
# Apply the fitted pipeline to add the index, vector, features and prediction
# columns. The raw input columns are re-selected first so that re-running this
# cell does not fail on output columns that already exist from a previous run.
pipeline_input_cols = ["Guild", "Level", "Race", "Class", "AvatarID", "Zone"]
cluster_df = pipelineModel.transform(cluster_df.select(*pipeline_input_cols))

In [15]:
# Print the schema after the pipeline has been applied.
cluster_df.printSchema()

root
 |-- Guild: string (nullable = true)
 |-- Level: integer (nullable = true)
 |-- Race: string (nullable = true)
 |-- Class: string (nullable = true)
 |-- AvatarID: integer (nullable = true)
 |-- Zone: string (nullable = true)
 |-- GuildIndex: double (nullable = true)
 |-- GuildclassVec: vector (nullable = true)
 |-- RaceIndex: double (nullable = true)
 |-- RaceclassVec: vector (nullable = true)
 |-- ClassIndex: double (nullable = true)
 |-- ClassclassVec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- prediction: integer (nullable = true)



In [16]:
# Keep the original attributes plus the assigned cluster label; the
# intermediate index and vector columns are left out of the export.
final_cluster_df = cluster_df.select(
    "AvatarID", "Guild", "Level", "Race", "Class", "Zone", "prediction"
)

<html>
<font color = blue>
<b> 
Store the clustered data as Spark partitioned CSV files
</b>
</font>
</html>

# Write the result as a single CSV part file under the SixClusters.csv
# directory. mode='overwrite' lets the notebook be re-run end to end; the
# default save mode raises an error when the path already exists.
# No header row is written, so column names must be restored after reading
# the files back.
final_cluster_df.repartition(1).write.csv('SixClusters.csv', mode='overwrite')

In [3]:
# Read the exported clusters back. No header option is given, so Spark
# assigns default column names (_c0, _c1, ...).
cluster = spark.read.csv("SixClusters.csv", inferSchema=True)

In [4]:
# Print the inferred schema of the re-loaded data.
cluster.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: integer (nullable = true)



In [5]:
# Replace the default positional column names with meaningful ones.
column_names = {
    "_c0": "AvatarID",
    "_c1": "Guild",
    "_c2": "Level",
    "_c3": "Race",
    "_c4": "Class",
    "_c5": "Zone",
    "_c6": "Prediction",
}
for default_name, real_name in column_names.items():
    cluster = cluster.withColumnRenamed(default_name, real_name)

In [6]:
# Preview the first five rows with their cluster label.
cluster.show(5)

+--------+-----+-----+----+-------+-------+----------+
|AvatarID|Guild|Level|Race|  Class|   Zone|Prediction|
+--------+-----+-----+----+-------+-------+----------+
|       0| null|    5| Orc|Warrior|Durotar|         3|
|       1| null|    9| Orc| Shaman|Durotar|         3|
|       2| null|   13| Orc| Shaman|Durotar|         3|
|       3|    0|   14| Orc|Warrior|Durotar|         3|
|       4| null|   14| Orc| Shaman|Durotar|         3|
+--------+-----+-----+----+-------+-------+----------+
only showing top 5 rows



<html>
<font color = blue>
<b> 
Now we have the data grouped into 6 clusters. Further detailed exploratory analysis of these clusters is performed in the Tableau file: ClusterAnalysis.twbx
</b>
</font>
</html>