**Clustering Techniques in Data Mining - Question 1**


In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so the dataset under "My Drive" can be read.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=2411f0482e1d3db835a51dc44169e2523eb93e2da7e0101d88a718b258a2162a
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [None]:
# Install seaborn; it is imported below as `sns` and used for the cluster scatter plot.
!pip install seaborn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType
import seaborn as sns
from pyspark.sql.functions import split
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler
import pandas as pd
import numpy as np

In [None]:
# Create (or reuse) the SparkSession for this notebook, running with a local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("KMeans Clustering")
    .getOrCreate()
)

In [None]:
# Read the book-description CSV from the mounted Drive.
# Reader settings: latin1 encoding, multi-line records, a double quote used both as
# the quote character and as the escape character, and a header row.
csv_reader = (
    spark.read.format("csv")
    .option("encoding", 'latin1')
    .option("multiline", "true")
    .option("quote", "\"")
    .option("escape", "\"")
    .options(header="true")
)
data = csv_reader.load('/content/drive/My Drive/datamining/description.csv')
data.show(truncate=False)

+-------+----------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Keep only the description column, then take its first 135 rows as the working sample.
desc = data.select("description")
document = desc.limit(135)
document.show()

+--------------------+
|         description|
+--------------------+
|A tender, moving ...|
|Born into a Jewis...|
|Imbued on every p...|
|A celebrated writ...|
|A bank of clouds ...|
|Maybe it was a gr...|
|The true story of...|
|In April 1992 a y...|
|Sent by their mot...|
|This book chronic...|
|Perhaps if Joe an...|
|Wise, funny, and ...|
|Through a life of...|
|At one time Corri...|
|Delve into the ma...|
|Augustine's Confe...|
|David Sedaris' mo...|
|John and Jenny we...|
|The enthralling, ...|
|Psychiatrist Vikt...|
+--------------------+
only showing top 20 rows



In [None]:
# List the rows of the 135-row sample whose description is missing.
missing_descriptions = document.where(document["description"].isNull())
missing_descriptions.show()

+-----------+
|description|
+-----------+
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
|       null|
+-----------+



In [None]:
# Drop the rows with a missing description from the 135-row sample.
# The result is bound to `doc2`, the name the following cells read, and `document`
# is left as the untouched sample. (Dropping from `data` instead would filter the
# full multi-column dataset rather than the sample, and would leave `doc2` undefined.)
doc2 = document.na.drop()

In [None]:
# Confirm that the cleaned frame no longer contains rows with a missing description.
remaining_nulls = doc2.where(doc2["description"].isNull())
remaining_nulls.show()

+-----------+
|description|
+-----------+
+-----------+



In [None]:
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF, Tokenizer, RegexTokenizer


In [None]:
# Split every description into word tokens, breaking on non-word characters (\W).
regexTokenizer = RegexTokenizer(
    inputCol="description",
    outputCol="words",
    pattern="\\W",
)
wordsData = regexTokenizer.transform(doc2)
wordsData.show()

+--------------------+--------------------+
|         description|               words|
+--------------------+--------------------+
|A tender, moving ...|[a, tender, movin...|
|Born into a Jewis...|[born, into, a, j...|
|Imbued on every p...|[imbued, on, ever...|
|A celebrated writ...|[a, celebrated, w...|
|A bank of clouds ...|[a, bank, of, clo...|
|Maybe it was a gr...|[maybe, it, was, ...|
|The true story of...|[the, true, story...|
|In April 1992 a y...|[in, april, 1992,...|
|Sent by their mot...|[sent, by, their,...|
|This book chronic...|[this, book, chro...|
|Perhaps if Joe an...|[perhaps, if, joe...|
|Wise, funny, and ...|[wise, funny, and...|
|Through a life of...|[through, a, life...|
|At one time Corri...|[at, one, time, c...|
|Delve into the ma...|[delve, into, the...|
|Augustine's Confe...|[augustine, s, co...|
|David Sedaris' mo...|[david, sedaris, ...|
|John and Jenny we...|[john, and, jenny...|
|The enthralling, ...|[the, enthralling...|
|Psychiatrist Vikt...|[psychiatr

In [None]:
# Filter stop words out of the token lists; the cleaned tokens go in "filtered".
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)
filteredData = remover.transform(wordsData)
filteredData.show()

+--------------------+--------------------+--------------------+
|         description|               words|            filtered|
+--------------------+--------------------+--------------------+
|A tender, moving ...|[a, tender, movin...|[tender, moving, ...|
|Born into a Jewis...|[born, into, a, j...|[born, jewish, gh...|
|Imbued on every p...|[imbued, on, ever...|[imbued, every, p...|
|A celebrated writ...|[a, celebrated, w...|[celebrated, writ...|
|A bank of clouds ...|[a, bank, of, clo...|[bank, clouds, as...|
|Maybe it was a gr...|[maybe, it, was, ...|[maybe, grandpare...|
|The true story of...|[the, true, story...|[true, story, out...|
|In April 1992 a y...|[in, april, 1992,...|[april, 1992, you...|
|Sent by their mot...|[sent, by, their,...|[sent, mother, li...|
|This book chronic...|[this, book, chro...|[book, chronicles...|
|Perhaps if Joe an...|[perhaps, if, joe...|[perhaps, joe, vi...|
|Wise, funny, and ...|[wise, funny, and...|[wise, funny, hea...|
|Through a life of...|[th

In [None]:
# TF-IDF, step 1: term frequencies.
# Fit a CountVectorizer on the filtered tokens to learn the vocabulary, then turn
# each document into a vector of raw term counts ("rawFeatures").
cv = CountVectorizer(
    inputCol="filtered",
    outputCol="rawFeatures",
)
cvModel = cv.fit(filteredData)
featurizedData = cvModel.transform(filteredData)
featurizedData.show()

+--------------------+--------------------+--------------------+--------------------+
|         description|               words|            filtered|         rawFeatures|
+--------------------+--------------------+--------------------+--------------------+
|A tender, moving ...|[a, tender, movin...|[tender, moving, ...|(4152,[1,2,4,6,7,...|
|Born into a Jewis...|[born, into, a, j...|[born, jewish, gh...|(4152,[2,5,8,20,3...|
|Imbued on every p...|[imbued, on, ever...|[imbued, every, p...|(4152,[0,2,4,11,1...|
|A celebrated writ...|[a, celebrated, w...|[celebrated, writ...|(4152,[1,2,5,6,7,...|
|A bank of clouds ...|[a, bank, of, clo...|[bank, clouds, as...|(4152,[7,28,43,52...|
|Maybe it was a gr...|[maybe, it, was, ...|[maybe, grandpare...|(4152,[1,2,11,13,...|
|The true story of...|[the, true, story...|[true, story, out...|(4152,[4,8,15,25,...|
|In April 1992 a y...|[in, april, 1992,...|[april, 1992, you...|(4152,[1,3,4,8,10...|
|Sent by their mot...|[sent, by, their,...|[sent, moth

In [None]:
# TF-IDF, step 2: fit IDF on the raw term counts and write the re-weighted vectors
# to the "features" column.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|         description|               words|            filtered|         rawFeatures|            features|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|A tender, moving ...|[a, tender, movin...|[tender, moving, ...|(4152,[1,2,4,6,7,...|(4152,[1,2,4,6,7,...|
|Born into a Jewis...|[born, into, a, j...|[born, jewish, gh...|(4152,[2,5,8,20,3...|(4152,[2,5,8,20,3...|
|Imbued on every p...|[imbued, on, ever...|[imbued, every, p...|(4152,[0,2,4,11,1...|(4152,[0,2,4,11,1...|
|A celebrated writ...|[a, celebrated, w...|[celebrated, writ...|(4152,[1,2,5,6,7,...|(4152,[1,2,5,6,7,...|
|A bank of clouds ...|[a, bank, of, clo...|[bank, clouds, as...|(4152,[7,28,43,52...|(4152,[7,28,43,52...|
|Maybe it was a gr...|[maybe, it, was, ...|[maybe, grandpare...|(4152,[1,2,11,13,...|(4152,[1,2,11,13,...|
|The true story of...|[the, true, sto

In [None]:
# Fit k-means with 10 clusters and a fixed seed on the TF-IDF data.
# No featuresCol is passed, so the estimator's default input column is used.
kmeans = KMeans().setK(10).setSeed(1)
model = kmeans.fit(rescaledData)

In [None]:
# Run the fitted model over the training documents and keep only the cluster id column.
clustered = model.transform(rescaledData)
prediction = clustered.select("prediction")
prediction.show()

+----------+
|prediction|
+----------+
|         3|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         4|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         2|
|         1|
|         1|
+----------+
only showing top 20 rows



In [None]:
# Bring the cluster assignments to the driver as a plain Python list.
# A Spark DataFrame cannot be iterated directly, so the rows are fetched with
# collect() first; each Row exposes its cluster id as `.prediction`.
labels = [row.prediction for row in prediction.collect()]

In [None]:
# Stack the fitted cluster centres into a single NumPy array (one row per cluster).
centers = np.array(model.clusterCenters())

In [None]:
# Find the terms in the first 3 clusters.
# For each of the first three centroids, print the vocabulary terms with the largest
# centroid weights. Only the top N_TOP_TERMS are shown: ranking and printing the
# whole vocabulary emits thousands of terms per cluster and buries the leading ones.
N_TOP_TERMS = 20

terms = cvModel.vocabulary
# argsort is ascending, so each row is reversed to order term indices by descending weight.
order_centroids = centers.argsort()[:, ::-1]
for i in range(3):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :N_TOP_TERMS]:
        print(' %s,' % terms[ind], end='')
    print("\n")

Cluster 0:
 crest, scattered, pleasures, ahead, forging, terrors, lose, maddened, cheryl, wake, strengthened, hike, impulsive, strayed, driven, thousand, healed, decision, miles, powerfully, destroyed, suspense, pacific, woman, trail, warmth, oregon, mojave, odds, blind, washington, sparkling, lost, training, thought, california, captures, wild, desert, ultimately, marriage, style, humor, alone, nothing, state, soon, death, told, journey, later, twenty, everything, experience, young, made, four, mother, two, years, family, life, one, concise, patterns, mechanisms, adult, message, prayer, entwined, devout, refused, community, elizabeth, experienced, siegel, unparalleled, code, canonical, windows, stress, intervene, level, bigger, guard, decide, blends, nazis, fuel, brilliance, salvation, magical, fend, choose, mary, anda, groundbreaking, tears, fell, involved, devices, cash, fat, beauty, everyone, epilogue, fifteen, calls, daughter, colorado, reveal, instructions, possible, doctor, deca

In [None]:
# Find the cluster of the 136th book.
# The book is picked by position: the first 136 descriptions are fetched and the last
# one is used. A set difference (DataFrame.subtract) is not suitable here because it
# de-duplicates rows and gives no ordering guarantee, so taking limit(1) of it does
# not necessarily return the 136th row.
# NOTE(review): this assumes limit() yields rows in file order, the same assumption
# the 135-row sample (`desc.limit(135)`) already relies on.
BOOK_POSITION = 136

leading_rows = desc.limit(BOOK_POSITION).collect()
if len(leading_rows) < BOOK_POSITION:
    raise ValueError("The dataset has fewer than %d rows" % BOOK_POSITION)
if leading_rows[-1]["description"] is None:
    raise ValueError("Book %d has no description to cluster" % BOOK_POSITION)
book = spark.createDataFrame(leading_rows[-1:], schema=desc.schema)

# Push the book through the same fitted stages that produced the training features.
book_words = regexTokenizer.transform(book)
book_filtered = remover.transform(book_words)
book_featurized = cvModel.transform(book_filtered)
book_rescaled = idfModel.transform(book_featurized)

# A dedicated name keeps the corpus-wide `prediction` frame from being overwritten.
book_prediction = model.transform(book_rescaled)
predictionCol = book_prediction.select("prediction")
print("The cluster of the 136th book is " + str(predictionCol.collect()[0][0]))

The cluster of the 136th book is 1


In [None]:
# Plot the documents of the first 5 clusters using seaborn.
# Each point is one document: x is the TF-IDF weight and y the raw count of the
# vocabulary term at index TERM_INDEX, coloured by the document's cluster id.
# Spark DataFrames cannot be sliced like NumPy arrays, so the needed columns are
# collected to the driver and the vector components are extracted in Python.
# The cluster ids are recomputed here so the cell does not depend on variables
# that other cells may have reassigned.
# NOTE(review): both axes describe the same term, so the points are expected to fall
# on a line; consider plotting two different feature dimensions instead -- confirm intent.
N_CLUSTERS_PLOTTED = 5
TERM_INDEX = 1

colors = ['red', 'blue', 'purple', 'green', 'yellow']
# A dict palette ties each colour to a cluster id and remains valid even when some
# of the first five clusters have no documents.
palette = dict(zip(range(N_CLUSTERS_PLOTTED), colors))

plot_rows = (
    model.transform(rescaledData)
    .select("features", "rawFeatures", "prediction")
    .filter("prediction < %d" % N_CLUSTERS_PLOTTED)
    .collect()
)
tfidf_weights = [float(row["features"][TERM_INDEX]) for row in plot_rows]
raw_counts = [float(row["rawFeatures"][TERM_INDEX]) for row in plot_rows]
cluster_ids = [row["prediction"] for row in plot_rows]

plotted_term = cvModel.vocabulary[TERM_INDEX]
fig, ax = plt.subplots()
sns.scatterplot(
    x=tfidf_weights,
    y=raw_counts,
    hue=cluster_ids,
    palette=palette,
    alpha=0.5,
    s=7,
    ax=ax,
)
ax.set(
    title="Documents in the first %d clusters" % N_CLUSTERS_PLOTTED,
    xlabel="TF-IDF weight of term '%s'" % plotted_term,
    ylabel="Raw count of term '%s'" % plotted_term,
)
plt.show()