In [7]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('basics').getOrCreate()

In [8]:
df = spark.read.csv('data/transformed.csv',inferSchema=True)

In [9]:
# Optionally, rename columns for better clarity
df = df.withColumnRenamed("_c0", "budget") \
       .withColumnRenamed("_c1", "popularity") \
       .withColumnRenamed("_c2", "revenue") \
       .withColumnRenamed("_c3", "runtime") \
       .withColumnRenamed("_c4", "vote_count") \
       .withColumnRenamed("_c5", "genre") \
       .withColumnRenamed("_c6", "release_date") \
       .withColumnRenamed("_c7", "production_country")\
       .withColumnRenamed("_c8", "popularity_rank")\
       .withColumnRenamed("_c9", "risk")
# Let's get an idea of what the data looks like. 
df.printSchema()
df.show()

root
 |-- budget: integer (nullable = true)
 |-- popularity: double (nullable = true)
 |-- revenue: integer (nullable = true)
 |-- runtime: integer (nullable = true)
 |-- vote_count: integer (nullable = true)
 |-- genre: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- production_country: string (nullable = true)
 |-- popularity_rank: string (nullable = true)
 |-- risk: string (nullable = true)

+---------+----------+----------+-------+----------+---------------+------------+------------------+---------------+----+
|   budget|popularity|   revenue|runtime|vote_count|          genre|release_date|production_country|popularity_rank|risk|
+---------+----------+----------+-------+----------+---------------+------------+------------------+---------------+----+
|300000000|139.082615| 961000000|    169|      4500|         Action|  2007-05-19|      united_state|           high| low|
|245000000|107.376788| 880674609|    148|      4466|         Action|  2015-10-26|       

In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Convert categorical columns into numerical representations
indexer_genre = StringIndexer(inputCol="genre", outputCol="genre_index")
indexer_country = StringIndexer(inputCol="production_country", outputCol="country_index")
indexer_risk = StringIndexer(inputCol="risk", outputCol="risk_index")
indexer_rank = StringIndexer(inputCol="popularity_rank", outputCol="rank_index")

# Apply StringIndexer transformations
df = indexer_genre.fit(df).transform(df)
df = indexer_country.fit(df).transform(df)
df = indexer_risk.fit(df).transform(df)
df = indexer_rank.fit(df).transform(df)

In [15]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import ClusteringEvaluator
import pyspark.sql.functions as F

# Initialize Spark session
spark = SparkSession.builder \
    .appName("KMeans Clustering") \
    .getOrCreate()

# Load data
data = df

# Preprocess data and create feature vector
feature_cols = ['genre_index', 'country_index', 'risk_index', 'rank_index']
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data)

# Train K-means model
kmeans = KMeans(k=3, seed=1)  # Specify the number of clusters (k)
model = kmeans.fit(data)

# Make predictions
predictions = model.transform(data)

# Evaluate model
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhouette}")



# Compare clusters by aggregating features
cluster_comparison = predictions.groupby("prediction").agg(
    F.mean("genre_index").alias("mean_genre_index"),
    F.mean("country_index").alias("mean_country_index"),
    F.mean("risk_index").alias("mean_risk_index"),
    F.mean("rank_index").alias("mean_rank_index")
)

# Show cluster comparison
cluster_comparison.show()

# Calculate cluster sizes
cluster_sizes = predictions.groupBy("prediction").count()

# Show cluster sizes
cluster_sizes.show()



# Save or export results
# predictions.select("genre_index", "country_index", "risk_index", "rank_index", "prediction") \
#     .write.csv("data/kmeans_predictions.csv", header=True)

# Stop Spark session
spark.stop()


Silhouette with squared euclidean distance = 0.7217369713952484
+----------+------------------+-------------------+-------------------+--------------------+
|prediction|  mean_genre_index| mean_country_index|    mean_risk_index|     mean_rank_index|
+----------+------------------+-------------------+-------------------+--------------------+
|         1|0.3193717277486911|0.48616305160807777| 0.2610321615557218|0.005983545250560957|
|         2|6.9033613445378155| 0.4789915966386555|0.24789915966386555|0.037815126050420166|
|         0|3.1424100156494523| 0.5226917057902973|  0.215962441314554| 0.00782472613458529|
+----------+------------------+-------------------+-------------------+--------------------+

+----------+-----+
|prediction|count|
+----------+-----+
|         1| 1337|
|         2|  238|
|         0|  639|
+----------+-----+



In [13]:
result = spark.read.csv('data/kmeans_predictions.csv',inferSchema=True)

In [14]:
result.show()

+-----------+-------------+----------+----------+----------+
|        _c0|          _c1|       _c2|       _c3|       _c4|
+-----------+-------------+----------+----------+----------+
|genre_index|country_index|risk_index|rank_index|prediction|
|        1.0|          0.0|       0.0|       0.0|         1|
|        1.0|          1.0|       0.0|       0.0|         1|
|        1.0|          0.0|       0.0|       0.0|         1|
|        1.0|          0.0|       0.0|       0.0|         1|
|        6.0|          0.0|       0.0|       0.0|         2|
|        1.0|          0.0|       0.0|       0.0|         1|
|        3.0|          1.0|       0.0|       0.0|         0|
|        1.0|          0.0|       0.0|       0.0|         1|
|        1.0|          0.0|       0.0|       0.0|         1|
|        3.0|          1.0|       0.0|       0.0|         0|
|        3.0|          1.0|       0.0|       0.0|         0|
|        1.0|          0.0|       1.0|       0.0|         1|
|        1.0|          1