In [15]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('basics').getOrCreate()

In [16]:
df = spark.read.csv('data/transformed.csv',inferSchema=True)

In [17]:
# Optionally, rename columns for better clarity
df = df.withColumnRenamed("_c0", "budget") \
       .withColumnRenamed("_c1", "popularity") \
       .withColumnRenamed("_c2", "revenue") \
       .withColumnRenamed("_c3", "runtime") \
       .withColumnRenamed("_c4", "vote_count") \
       .withColumnRenamed("_c5", "genre") \
       .withColumnRenamed("_c6", "release_date") \
       .withColumnRenamed("_c7", "production_country")\
       .withColumnRenamed("_c8", "popularity_rank")\
       .withColumnRenamed("_c9", "risk")
# Let's get an idea of what the data looks like. 
df.printSchema()
df.show()

root
 |-- budget: integer (nullable = true)
 |-- popularity: double (nullable = true)
 |-- revenue: integer (nullable = true)
 |-- runtime: integer (nullable = true)
 |-- vote_count: integer (nullable = true)
 |-- genre: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- production_country: string (nullable = true)
 |-- popularity_rank: string (nullable = true)
 |-- risk: string (nullable = true)

+---------+----------+----------+-------+----------+---------------+------------+------------------+---------------+----+
|   budget|popularity|   revenue|runtime|vote_count|          genre|release_date|production_country|popularity_rank|risk|
+---------+----------+----------+-------+----------+---------------+------------+------------------+---------------+----+
|300000000|139.082615| 961000000|    169|      4500|         Action|  2007-05-19|      united_state|           high| low|
|245000000|107.376788| 880674609|    148|      4466|         Action|  2015-10-26|       

In [18]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Convert categorical columns into numerical representations
indexer_genre = StringIndexer(inputCol="genre", outputCol="genre_index")
indexer_country = StringIndexer(inputCol="production_country", outputCol="country_index")
indexer_risk = StringIndexer(inputCol="risk", outputCol="risk_index")
indexer_rank = StringIndexer(inputCol="popularity_rank", outputCol="rank_index")

# Apply StringIndexer transformations
df = indexer_genre.fit(df).transform(df)
df = indexer_country.fit(df).transform(df)
df = indexer_risk.fit(df).transform(df)
df = indexer_rank.fit(df).transform(df)

In [19]:
from pyspark.sql.functions import col, when, lit

# Get unique genre numbers
unique_genre_numbers = df.select("genre_index").distinct().count()

# Iterate over each unique genre number and create a new column with 1 for true and 0 for false
for i in range(1, unique_genre_numbers + 1):
    genre_col = f"genre_{i}"
    df = df.withColumn(genre_col, when(col("genre_index") == i, lit(1)).otherwise(lit(0)))

# Show the DataFrame with encoded genres
df.show()

+---------+----------+----------+-------+----------+---------------+------------+------------------+---------------+----+-----------+-------------+----------+----------+-------+-------+-------+-------+-------+-------+-------+-------+-------+--------+--------+
|   budget|popularity|   revenue|runtime|vote_count|          genre|release_date|production_country|popularity_rank|risk|genre_index|country_index|risk_index|rank_index|genre_1|genre_2|genre_3|genre_4|genre_5|genre_6|genre_7|genre_8|genre_9|genre_10|genre_11|
+---------+----------+----------+-------+----------+---------------+------------+------------------+---------------+----+-----------+-------------+----------+----------+-------+-------+-------+-------+-------+-------+-------+-------+-------+--------+--------+
|300000000|139.082615| 961000000|    169|      4500|         Action|  2007-05-19|      united_state|           high| low|        1.0|          0.0|       0.0|       0.0|      1|      0|      0|      0|      0|      0|   

In [20]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Initialize Spark Session
spark = SparkSession.builder.appName("LogisticRegressionExample").getOrCreate()

# List of feature column names
all_feature_columns = ['genre_1', 'genre_2', 'genre_3', 'genre_4', 'genre_5','genre_6', 'genre_7', 'genre_8', 'genre_9', 'genre_10', 'genre_11']

# Assemble features into a single vector
assembler = VectorAssembler(inputCols=all_feature_columns, outputCol="features")
data = assembler.transform(df)

# Split the data into training and test sets
train_data, test_data = data.randomSplit([0.7, 0.3], seed=1234)

# Create and train the Logistic Regression model
lr = LogisticRegression(featuresCol="features", labelCol="rank_index", maxIter=10)
model = lr.fit(train_data)

# Print the coefficients and intercept of the model
print("Coefficients:")
coefficients = model.coefficients
intercept = model.intercept
for i, feature in enumerate(all_feature_columns):
    print(f"{feature}: {coefficients[i]}")

print(f"Intercept: {intercept}")

# Make predictions on the test data
predictions = model.transform(test_data)

# Evaluate the model
evaluator = BinaryClassificationEvaluator(labelCol="rank_index", rawPredictionCol="rawPrediction")
accuracy = evaluator.evaluate(predictions)
print(f"Test Area Under ROC = {accuracy}")

# Stop the Spark session
spark.stop()


Coefficients:
genre_1: 1.4027493451462023
genre_2: -1.2187117230023452
genre_3: -1.2090317385766216
genre_4: -1.1824881514556438
genre_5: 3.15134377543869
genre_6: 2.9154125934420714
genre_7: 1.8627111562701237
genre_8: -1.171063351130465
genre_9: 7.230150469947778
genre_10: 5.506446974005068
genre_11: 0.0
Intercept: -6.124623535270408
Test Area Under ROC = 0.7334905660377359
