In [12]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import os
import findspark
# Prefer the SPARK_HOME environment variable so the notebook is not tied to one
# machine's directory layout; fall back to the original install path when it is unset.
findspark.init(os.environ.get('SPARK_HOME') or '/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
# pyspark is imported only after findspark.init() has been called.
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('basics').getOrCreate()

In [13]:
# Load the transformed CSV with schema inference turned on. No header option is
# set, so the columns arrive with positional names (_c0, _c1, ...).
csv_path = 'data/transformed.csv'
df = spark.read.option('inferSchema', True).csv(csv_path)

In [14]:
# Give the positional CSV columns (_c0 ... _c9) descriptive names.
# The list order is the column order in the file.
column_names = [
    "budget",
    "popularity",
    "revenue",
    "runtime",
    "vote_count",
    "genre",
    "release_date",
    "production_country",
    "popularity_rank",
    "risk",
]
for position, column_name in enumerate(column_names):
    df = df.withColumnRenamed(f"_c{position}", column_name)

# Inspect the inferred schema and a sample of rows.
df.printSchema()
df.show()

root
 |-- budget: integer (nullable = true)
 |-- popularity: double (nullable = true)
 |-- revenue: integer (nullable = true)
 |-- runtime: integer (nullable = true)
 |-- vote_count: integer (nullable = true)
 |-- genre: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- production_country: string (nullable = true)
 |-- popularity_rank: string (nullable = true)
 |-- risk: string (nullable = true)

+---------+----------+----------+-------+----------+---------------+------------+------------------+---------------+----+
|   budget|popularity|   revenue|runtime|vote_count|          genre|release_date|production_country|popularity_rank|risk|
+---------+----------+----------+-------+----------+---------------+------------+------------------+---------------+----+
|300000000|139.082615| 961000000|    169|      4500|         Action|  2007-05-19|      united_state|           high| low|
|245000000|107.376788| 880674609|    148|      4466|         Action|  2015-10-26|       

In [15]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Convert the categorical "risk" column into a numeric "risk_index" column.
indexer_risk = StringIndexer(inputCol="risk", outputCol="risk_index")

# Drop any "risk_index" left over from a previous run of this cell first:
# StringIndexer refuses to write to an output column that already exists, so
# without this the cell could only be executed once per kernel.
# DataFrame.drop() is a no-op when the column is absent.
df_without_index = df.drop("risk_index")
df = indexer_risk.fit(df_without_index).transform(df_without_index)

In [5]:
# Preview the frame, now including risk_index (explicit form of the show() defaults).
df.show(n=20, truncate=True)

+---------+----------+----------+-------+----------+---------------+------------+------------------+---------------+----+----------+
|   budget|popularity|   revenue|runtime|vote_count|          genre|release_date|production_country|popularity_rank|risk|risk_index|
+---------+----------+----------+-------+----------+---------------+------------+------------------+---------------+----+----------+
|300000000|139.082615| 961000000|    169|      4500|         Action|  2007-05-19|      united_state|           high| low|       0.0|
|245000000|107.376788| 880674609|    148|      4466|         Action|  2015-10-26|            others|           high| low|       0.0|
|250000000| 112.31295|1084939099|    165|      9106|         Action|  2012-07-16|      united_state|           high| low|       0.0|
|260000000| 43.926995| 284139100|    132|      2124|         Action|  2012-03-07|      united_state|           high| low|       0.0|
|260000000| 48.681969| 591794936|    100|      3330|      Animation| 

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
# Import the Spark aggregates under aliases: a bare `min`/`max` import would
# shadow Python's built-in min()/max() for every later cell in the notebook.
from pyspark.sql.functions import col, udf, min as spark_min, max as spark_max
from pyspark.sql.types import DoubleType

# Initialize Spark Session (getOrCreate hands back the active session if one exists)
spark = SparkSession.builder.appName("LinearRegressionExample").getOrCreate()


# Calculate min and max values for revenue and budget in a single aggregation
# job instead of scanning the data once per column.
revenue_min, revenue_max, budget_min, budget_max = df.select(
    spark_min("revenue"),
    spark_max("revenue"),
    spark_min("budget"),
    spark_max("budget"),
).first()

# Plain-Python Min-Max scaling helper (wrapped into Spark UDFs further down)
def min_max_scaling(value, min_value, max_value):
    """Rescale ``value`` linearly so that ``min_value`` maps to 0 and ``max_value`` to 1.

    Args:
        value: The number to rescale, or None.
        min_value: Lower bound of the original range, or None.
        max_value: Upper bound of the original range, or None.

    Returns:
        ``(value - min_value) / (max_value - min_value)`` as a float.
        None when any argument is None, so a null cell stays null instead of
        raising inside a UDF. 0.0 when the range has zero width, because the
        ratio is undefined when every value is identical.
    """
    if value is None or min_value is None or max_value is None:
        return None
    span = max_value - min_value
    if span == 0:
        # Constant column: avoid ZeroDivisionError.
        return 0.0
    return (value - min_value) / span

# Plain-Python inverse of Min-Max scaling (wrapped into a Spark UDF further down)
def inverse_min_max_scaling(scaled_value, min_value, max_value):
    """Map a Min-Max scaled value back onto the original ``[min_value, max_value]`` range.

    Args:
        scaled_value: A value on the scaled axis (0 maps to ``min_value``,
            1 maps to ``max_value``), or None.
        min_value: Lower bound of the original range, or None.
        max_value: Upper bound of the original range, or None.

    Returns:
        ``scaled_value * (max_value - min_value) + min_value``, or None when
        any argument is None, so null inputs stay null instead of raising.
    """
    if scaled_value is None or min_value is None or max_value is None:
        return None
    return (scaled_value * (max_value - min_value)) + min_value

# One scaling UDF per column: each closes over that column's own min/max.
# `min_max_scaling_udf` keeps its original meaning (revenue range). Previously
# it was also applied to budget, which scaled budget with the revenue range and
# made it inconsistent with the budget-based inverse UDF below.
min_max_scaling_udf = udf(lambda x: min_max_scaling(x, revenue_min, revenue_max), DoubleType())
budget_min_max_scaling_udf = udf(lambda x: min_max_scaling(x, budget_min, budget_max), DoubleType())
# Maps a value on the scaled-budget axis back to the original budget units.
inverse_min_max_scaling_udf = udf(lambda x: inverse_min_max_scaling(x, budget_min, budget_max), DoubleType())

# Apply Min-Max scaling to revenue and budget columns, each with its own range
df = df.withColumn("revenue_scaled", min_max_scaling_udf(col("revenue")))
df = df.withColumn("budget_scaled", budget_min_max_scaling_udf(col("budget")))

# List of feature column names
all_feature_columns = ['risk_index', 'revenue_scaled']

# Assemble features into a single vector
assembler = VectorAssembler(inputCols=all_feature_columns, outputCol="features")
data = assembler.transform(df)

# Split the data into training and test sets (fixed seed for reproducibility)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1234)

# Create and train the Linear Regression model
lr = LinearRegression(featuresCol="features", labelCol="budget_scaled")
model = lr.fit(train_data)

# Make predictions on the test data
predictions = model.transform(test_data)

# Calculate R-squared
evaluator_r2 = RegressionEvaluator(labelCol="budget_scaled", predictionCol="prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)

print(f"R-squared (R2) = {r2}")

# Print the coefficients and intercept of the model
print(f"Coefficients: {model.coefficients}")
print(f"Intercept: {model.intercept}")

# The session is intentionally left running: the cells below reuse `df`, and a
# DataFrame cannot be evaluated once the session that created it is stopped.


In [152]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Initialize Spark Session (getOrCreate hands back the active session if one exists)
spark = SparkSession.builder.appName("LinearRegressionExample").getOrCreate()

# List of feature column names
all_feature_columns = ['risk_index', 'revenue']

# Assemble features into a single vector
assembler = VectorAssembler(inputCols=all_feature_columns, outputCol="features")
data = assembler.transform(df)

# Split the data into training and test sets (fixed seed for reproducibility)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=1234)

# Create and train the Linear Regression model on the unscaled budget
lr = LinearRegression(featuresCol="features", labelCol="budget")
model = lr.fit(train_data)

# Make predictions on the test data
predictions = model.transform(test_data)

# Evaluate the model: RMSE is in budget units, R2 is unitless
evaluator_rmse = RegressionEvaluator(labelCol="budget", predictionCol="prediction", metricName="rmse")
rmse = evaluator_rmse.evaluate(predictions)

evaluator_r2 = RegressionEvaluator(labelCol="budget", predictionCol="prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE) = {rmse}")
print(f"R-squared (R2) = {r2}")

# Print the coefficients and intercept of the model
print(f"Coefficients: {model.coefficients}")
print(f"Intercept: {model.intercept}")

# Print the formula of the linear regression model.
# Coefficients are paired with feature names in assembler input order.
feature_coefficients = list(zip(all_feature_columns, model.coefficients))
formula = f"budget = {model.intercept}" + "".join(
    f" + ({coef} * {feature})" for feature, coef in feature_coefficients
)
print(f"Formula: {formula}")

# Show the predictions
predictions.select("features", "budget", "prediction").show()

# The session is intentionally left running: the next cell reuses `df`, and a
# DataFrame cannot be evaluated once the session that created it is stopped.


24/05/21 07:40:21 WARN Instrumentation: [dff30f9b] regParam is zero, which might cause numerical instability and overfitting.


Root Mean Squared Error (RMSE) = 31091547.939098485
R-squared (R2) = 0.6592831157869041
Coefficients: [9801590.630461901,0.1952494728034208]
Intercept: 18863729.323349126
Formula: budget = 18863729.323349126 + (9801590.630461901 * risk_index) + (0.1952494728034208 * revenue)
+-----------------+-------+--------------------+
|         features| budget|          prediction|
+-----------------+-------+--------------------+
|[0.0,1.0178331E7]| 100000| 2.085104308511784E7|
| [0.0,4.266441E7]| 100000| 2.719393288331812E7|
|[0.0,3.3456317E7]| 500000| 2.539605757954325E7|
|   [1.0,171760.0]| 500000| 2.869885600325974E7|
|   [1.0,171760.0]| 500000| 2.869885600325974E7|
|  [0.0,4242978.0]| 500000| 1.969216854096564E7|
|  [0.0,4242978.0]| 500000| 1.969216854096564E7|
|  [0.0,1185783.0]| 852510|1.9095252828958385E7|
|[0.0,1.0047674E7]|1000000|2.0825532374749765E7|
|   [1.0,444575.0]|1000000|2.8752122988182608E7|
|  [0.0,4235151.0]|1000000|1.9690640323342007E7|
|  [0.0,4105187.0]|1100000| 1.96652649

In [16]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Obtain the Spark session for this cell.
spark = SparkSession.builder.appName("LinearRegressionExample").getOrCreate()

# Predictors fed to the regression.
all_feature_columns = ['risk_index', 'revenue']

# Pack the predictor columns into one "features" vector column.
assembler = VectorAssembler().setInputCols(all_feature_columns).setOutputCol("features")
data = assembler.transform(df)

# 80/20 train/test split with a fixed seed.
split_weights = [0.8, 0.2]
train_data, test_data = data.randomSplit(split_weights, seed=1234)

# Fit a linear regression of budget on the assembled features.
lr = LinearRegression().setFeaturesCol("features").setLabelCol("budget")
model = lr.fit(train_data)

# Score the held-out rows.
predictions = model.transform(test_data)

# R-squared of the predictions on the test split.
evaluator_r2 = (
    RegressionEvaluator()
    .setLabelCol("budget")
    .setPredictionCol("prediction")
    .setMetricName("r2")
)
r2 = evaluator_r2.evaluate(predictions)

print(f"R-squared (R2) = {r2}")

# Shut the session down; anything executed after this needs a new session.
spark.stop()


24/05/21 07:57:53 WARN Instrumentation: [165f36be] regParam is zero, which might cause numerical instability and overfitting.


R-squared (R2) = 0.6592831157869041
