**Step 1: Install and Set Up PySpark in Colab**

In [1]:
!pip install pyspark




In [13]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator


**Step 2: Load & Preprocess the Dataset**

In [14]:
#Create a spark session
spark = SparkSession.builder.appName("GitHubFile").getOrCreate()

#Load the File from GitHub into PySpark
!wget -O transactions.csv "https://media.githubusercontent.com/media/baariu/recommendation-system/refs/heads/main/transactions.csv"
df = spark.read.csv("transactions.csv", header=True, inferSchema=True)
#Show first 5 rows
df.show(5)

--2025-02-23 14:11:28--  https://media.githubusercontent.com/media/baariu/recommendation-system/refs/heads/main/transactions.csv
Resolving media.githubusercontent.com (media.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to media.githubusercontent.com (media.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29285744 (28M) [text/plain]
Saving to: ‘transactions.csv’


2025-02-23 14:11:28 (170 MB/s) - ‘transactions.csv’ saved [29285744/29285744]

+-----+-------+-----------+--------------------+
| User|Product|Interaction|         ProductName|
+-----+-------+-----------+--------------------+
|15796|      7|          2|       Fixed Deposit|
|  861|      1|          3|Travel Rewards Cr...|
|76821|      4|          2|           Home Loan|
|54887|      1|          2|Travel Rewards Cr...|
| 6266|      3|          2| Student Credit Card|
+-----+-------+-----------+--------------------+
onl

In [15]:
#Check column data types
df.printSchema()

root
 |-- User: integer (nullable = true)
 |-- Product: integer (nullable = true)
 |-- Interaction: integer (nullable = true)
 |-- ProductName: string (nullable = true)



In [16]:
from pyspark.sql.functions import col

In [17]:
#Rename the columns because using ALS requires these 3 columns. Rating in this case means the frequency the user interacts with the item.
df = df.select(col("User").alias("userId"),
               col("Product").alias("itemId"),
               col("Interaction").alias("rating"))
df.show(5)


+------+------+------+
|userId|itemId|rating|
+------+------+------+
| 15796|     7|     2|
|   861|     1|     3|
| 76821|     4|     2|
| 54887|     1|     2|
|  6266|     3|     2|
+------+------+------+
only showing top 5 rows



**Step 3: Split the Dataset into Training & Test Sets**

In [18]:
# Split the dataset into training and testing
(train_data, test_data) = df.randomSplit([0.8, 0.2], seed=42)


**Step 4: Train the ALS Model**

In [19]:
# Train the ALS model
als = ALS(
    userCol="userId",
    itemCol="itemId",
    ratingCol="rating",
    maxIter=10,
    regParam=0.1,  # Regularization to prevent overfitting
    rank=10,  # Number of latent factors
    coldStartStrategy="drop"  # Remove users/items with no data
)

# Fit the model
model = als.fit(train_data)


**Step 5: Evaluate the Model Using RMSE**

In [20]:
# Evaluate the model
predictions = model.transform(test_data)

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")


Root Mean Squared Error (RMSE): 1.3091


**Step 6: Hyperparameter tuning**

In [22]:
#Tune the model using cross-validation and grid search.
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create ALS model
als = ALS(userCol="userId", itemCol="itemId", ratingCol="rating", coldStartStrategy="drop")

# Define hyperparameter grid
paramGrid = (ParamGridBuilder()
             .addGrid(als.rank, [5, 10, 15])  # Test different ranks
             .addGrid(als.maxIter, [5, 10, 15])  # Try different iteration counts
             .addGrid(als.regParam, [0.01, 0.1, 1])  # Try different regularization values
             .build())


In [23]:
# Define RMSE evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

# Use CrossValidator to find the best model
cv = CrossValidator(estimator=als,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3)  # 3-fold cross-validation

**Cross Validation**

In [24]:
# Train using cross-validation
cvModel = cv.fit(train_data)

# Get best model
best_model = cvModel.bestModel

# Extract best hyperparameters
best_rank = best_model.rank
best_regParam = best_model._java_obj.parent().getRegParam()
best_maxIter = best_model._java_obj.parent().getMaxIter()

print(f"Best Rank: {best_rank}")
print(f"Best Regularization Parameter: {best_regParam}")
print(f"Best Max Iterations: {best_maxIter}")


Best Rank: 10
Best Regularization Parameter: 0.1
Best Max Iterations: 5


**Optimized RMSE**

In [25]:
predictions = best_model.transform(test_data)
rmse = evaluator.evaluate(predictions)
print(f"Optimized RMSE: {rmse}")

Optimized RMSE: 1.3045742499642285


**Step 6: Generate Top-3 Recommendations for Each User**

In [27]:
# Generate recommendations
user_recommendations = best_model.recommendForAllUsers(3)
user_recommendations.show(5, truncate=False)


+------+-------------------------------------------------+
|userId|recommendations                                  |
+------+-------------------------------------------------+
|1     |[{7, 3.5469716}, {3, 2.9216666}, {2, 2.7297873}] |
|3     |[{7, 3.2847629}, {10, 3.1602492}, {2, 3.1119378}]|
|5     |[{5, 3.8519988}, {4, 3.6270857}, {2, 3.582632}]  |
|6     |[{6, 2.6512165}, {9, 2.5831306}, {2, 2.541433}]  |
|9     |[{7, 1.8299633}, {1, 1.8292536}, {9, 1.7289177}] |
+------+-------------------------------------------------+
only showing top 5 rows



**Step 7: Save Recommendations to a  dataframe**



In [31]:
#Convert PySpark DataFrame to Pandas
recommendations_pd = user_recommendations.toPandas()

#Save the recommendations directly as a CSV file
recommendations_pd.to_csv("user_recommendations.csv", index=False)

In [32]:
#Download the dataframe with colab’s built-in file downloader
from google.colab import files
files.download("user_recommendations.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>