In [None]:
#!/usr/bin/env python
# PySpark implementation of recommendation models for MovieLens 32M

import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType, IntegerType
import pickle
from scipy import sparse
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Initialize Spark session - adjust memory settings based on your system
def init_spark(app_name="MovieLens_Recommender", memory="5g"):
    """Create (or reuse) the SparkSession used by this pipeline.

    Args:
        app_name: Application name handed to the session builder.
        memory: Value applied to both ``spark.driver.memory`` and
            ``spark.executor.memory`` (for example ``"5g"``).

    Returns:
        The session returned by ``getOrCreate()``, with its log level set
        to ``ERROR``.
    """
    settings = {
        "spark.driver.memory": memory,
        "spark.executor.memory": memory,
        "spark.sql.session.timeZone": "UTC",
        "spark.sql.execution.arrow.pyspark.enabled": "true",
    }

    builder = SparkSession.builder.appName(app_name)
    for key, value in settings.items():
        builder = builder.config(key, value)
    session = builder.getOrCreate()

    # Only errors are logged, which keeps the console output readable.
    session.sparkContext.setLogLevel("ERROR")

    return session

# Load data using PySpark
def load_data(spark, data_dir="ml-32m", min_ratings=5):
    """Read the ratings and movies CSV files and drop sparsely rated ids.

    A user is kept when it has at least ``min_ratings`` rows in the raw
    ratings file; a movie is kept by the same rule. Both thresholds are
    evaluated on the unfiltered ratings, and the returned ratings are the
    rows whose user AND movie both pass.

    Args:
        spark: Session used to read the CSV files.
        data_dir: Directory containing ``ratings.csv`` and ``movies.csv``.
        min_ratings: Minimum rating count for a user or movie to be kept.

    Returns:
        ``(filtered_ratings, movies_df, n_users, n_movies)`` where the two
        counts are the number of user ids / movie ids passing the threshold.
    """
    print("Loading data with PySpark...")
    start_time = time.time()

    def read_csv(file_name):
        # Both files carry a header row; column types are inferred by Spark.
        return spark.read.csv(
            os.path.join(data_dir, file_name),
            header=True,
            inferSchema=True
        )

    def ids_with_enough_ratings(id_column):
        # Ids appearing in at least `min_ratings` rating rows.
        per_id = ratings_df.groupBy(id_column).count()
        return per_id.filter(col("count") >= min_ratings).select(id_column)

    ratings_df = read_csv('ratings.csv')
    movies_df = read_csv('movies.csv')

    active_users = ids_with_enough_ratings("userId")
    active_movies = ids_with_enough_ratings("movieId")

    # Inner joins keep only ratings whose user and movie both qualified.
    kept_for_users = ratings_df.join(active_users, "userId")
    filtered_ratings = kept_for_users.join(active_movies, "movieId")

    # Each count() below triggers a Spark job.
    n_users = active_users.count()
    n_movies = active_movies.count()
    n_ratings = filtered_ratings.count()

    print(f"Data loaded in {time.time() - start_time:.2f} seconds")
    print(f"Filtered dataset: {n_users:,} users, {n_movies:,} movies, {n_ratings:,} ratings")

    return filtered_ratings, movies_df, n_users, n_movies

# Convert movie genres to feature columns
def process_movie_features(spark, movies_df):
    """Add a ``year`` column and one indicator column per genre.

    Args:
        spark: Unused; kept so existing callers keep working.
        movies_df: DataFrame with string columns ``title`` and ``genres``,
            where ``genres`` is a ``|``-separated list of genre names.

    Returns:
        ``movies_df`` with these extra columns:

        * ``year`` - integer taken from a trailing ``(YYYY)`` in the title
          (trailing whitespace tolerated); null when the title has none.
        * ``genre_<name>`` - 1 when ``<name>`` is one of the row's genre
          tokens, 0 otherwise (null when ``genres`` is null). Columns are
          added in alphabetical order of genre name so the schema is the
          same on every run.
    """
    # Function-scope import: these helpers are not in the file-level imports.
    from pyspark.sql.functions import array_contains, regexp_extract, split, when

    # Match the final "(YYYY)" explicitly instead of slicing fixed character
    # positions, which mis-reads titles with trailing spaces or no year.
    year_text = regexp_extract(col("title"), r"\((\d{4})\)\s*$", 1)
    movies_df = movies_df.withColumn(
        "year",
        # regexp_extract yields "" on no match; only cast real matches.
        when(year_text != "", year_text.cast(IntegerType()))
    )

    # Only the distinct genre strings are needed to learn the vocabulary,
    # so avoid pulling one value per movie back to the driver.
    all_genres = set()
    for row in movies_df.select("genres").distinct().collect():
        genres = row["genres"]
        if genres:  # skip null / empty genre strings
            all_genres.update(genres.split('|'))

    # Placeholder value and empty tokens are not real genres.
    all_genres.discard('(no genres listed)')
    all_genres.discard('')

    # Exact token membership rather than substring matching on the raw text.
    genre_tokens = split(col("genres"), r"\|")

    # sorted(): set iteration order is not stable across interpreter runs.
    for genre in sorted(all_genres):
        movies_df = movies_df.withColumn(
            f"genre_{genre}",
            array_contains(genre_tokens, genre).cast(IntegerType())
        )

    return movies_df

# Split data into train, validation, and test sets
def train_val_test_split(ratings_df, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=42):
    """Randomly split ratings into training, validation and test sets.

    The split is done in two steps: first train vs. hold-out, then the
    hold-out is divided into validation and test. All three results are
    cached and counted, and the sizes are printed.

    Args:
        ratings_df: DataFrame to split (needs ``randomSplit``, ``cache``
            and ``count``).
        train_ratio: Fraction of rows for training.
        val_ratio: Fraction of rows for validation.
        test_ratio: Fraction of rows for testing.
        seed: Seed passed to both ``randomSplit`` calls.

    Returns:
        ``(train_data, val_data, test_data)``.

    Raises:
        AssertionError: If the three ratios do not sum to 1 (within 1e-6).
    """
    # Ensure ratios sum to 1 (left as an assert so the raised type is unchanged)
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6

    holdout_ratio = val_ratio + test_ratio
    train_data, temp_data = ratings_df.randomSplit([train_ratio, holdout_ratio], seed=seed)

    # Share of the hold-out that goes to validation. With a zero hold-out the
    # first split gave temp_data weight 0, so any valid weights do here;
    # 0.5 simply avoids dividing 0 by 0.
    val_share = val_ratio / holdout_ratio if holdout_ratio > 0 else 0.5
    val_data, test_data = temp_data.randomSplit([val_share, 1.0 - val_share], seed=seed)

    # Cache before counting so the counts below materialise the cached data.
    for part in (train_data, val_data, test_data):
        part.cache()

    train_count = train_data.count()
    val_count = val_data.count()
    test_count = test_data.count()
    total = train_count + val_count + test_count

    def percent(count):
        # An empty input would otherwise raise ZeroDivisionError here.
        return count / total * 100 if total else 0.0

    print(f"Training set: {train_count:,} ratings ({percent(train_count):.2f}%)")
    print(f"Validation set: {val_count:,} ratings ({percent(val_count):.2f}%)")
    print(f"Test set: {test_count:,} ratings ({percent(test_count):.2f}%)")

    return train_data, val_data, test_data

# Function to calculate and print evaluation metrics
def evaluate_model(predictions, truth_col="rating", pred_col="prediction"):
    """Compute, print and return RMSE and MAE for a predictions DataFrame.

    Args:
        predictions: DataFrame holding both the label and prediction columns.
        truth_col: Name of the column with the true rating.
        pred_col: Name of the column with the predicted rating.

    Returns:
        A dict ``{"RMSE": <float>, "MAE": <float>}``.
    """
    scores = {}
    # RMSE first, then MAE; each evaluate() call is a separate Spark job.
    for metric in ("rmse", "mae"):
        scorer = RegressionEvaluator(
            metricName=metric,
            labelCol=truth_col,
            predictionCol=pred_col
        )
        scores[metric.upper()] = scorer.evaluate(predictions)

    print(f"RMSE: {scores['RMSE']:.4f}")
    print(f"MAE: {scores['MAE']:.4f}")

    return scores

# Baseline model: Global mean and bias model
def train_baseline_models(train_data, val_data):
    """Fit and evaluate two baselines: global mean and user/item bias.

    * Global mean: every rating is predicted as the training-set average.
    * Bias model: ``global_mean + user_bias + item_bias`` where each bias is
      that user's (or movie's) mean training rating minus the global mean.
      Users or movies absent from the training data get a bias of 0.

    Both models are scored on ``val_data`` via ``evaluate_model``.

    Args:
        train_data: Ratings with ``userId``, ``movieId`` and ``rating``.
        val_data: Ratings with the same columns, used for evaluation.

    Returns:
        ``(global_mean_metrics, bias_model_metrics, global_mean,
        user_biases, item_biases)``. The metrics are the dicts returned by
        ``evaluate_model``; ``user_biases`` has columns ``userId``,
        ``user_mean``, ``user_bias`` and ``item_biases`` has ``movieId``,
        ``item_mean``, ``item_bias``.
    """
    # `lit` is not part of the file-level imports, so import it here; without
    # this the function raises NameError whenever the module is imported
    # instead of being run as a script.
    from pyspark.sql.functions import lit

    start_time = time.time()

    # Global mean model
    global_mean = train_data.select("rating").agg({"rating": "avg"}).collect()[0][0]
    print(f"Global mean rating: {global_mean:.4f}")

    # Constant prediction column for every validation row
    val_global_mean = val_data.withColumn("prediction", lit(global_mean))

    print("Global Mean Model Performance:")
    global_mean_metrics = evaluate_model(val_global_mean)

    # Per-user and per-movie mean ratings on the training data
    user_means = train_data.groupBy("userId").agg({"rating": "avg"}).withColumnRenamed("avg(rating)", "user_mean")
    item_means = train_data.groupBy("movieId").agg({"rating": "avg"}).withColumnRenamed("avg(rating)", "item_mean")

    # Biases are the differences from the global mean
    user_biases = user_means.withColumn("user_bias", col("user_mean") - global_mean)
    item_biases = item_means.withColumn("item_bias", col("item_mean") - global_mean)

    # Left joins keep every validation row, even for unseen users/movies
    val_with_user = val_data.join(user_biases, "userId", "left")
    val_with_user_item = val_with_user.join(item_biases, "movieId", "left")

    # Unseen users/movies have null biases after the join; treat them as 0
    val_with_user_item = val_with_user_item.na.fill({
        "user_bias": 0.0,
        "item_bias": 0.0
    })

    val_with_predictions = val_with_user_item.withColumn(
        "prediction",
        global_mean + col("user_bias") + col("item_bias")
    )

    print("\nBias Model Performance:")
    bias_model_metrics = evaluate_model(val_with_predictions)

    print(f"Baseline models trained and evaluated in {time.time() - start_time:.2f} seconds")

    return global_mean_metrics, bias_model_metrics, global_mean, user_biases, item_biases

# Train ALS model with PySpark
def train_als_model(train_data, val_data, rank=50, regParam=0.1, maxIter=10, seed=42):
    """Fit one ALS model and report its validation metrics.

    Args:
        train_data: Ratings with ``userId``, ``movieId`` and ``rating``.
        val_data: Ratings scored with the fitted model.
        rank: ALS ``rank`` parameter.
        regParam: ALS ``regParam`` parameter.
        maxIter: ALS ``maxIter`` parameter.
        seed: ALS ``seed`` parameter.

    Returns:
        ``(model, als_metrics)`` - the fitted model and the dict returned by
        ``evaluate_model`` for its validation predictions.
    """
    print(f"Training ALS model (rank={rank}, regParam={regParam}, maxIter={maxIter})...")
    began = time.time()

    als_settings = dict(
        rank=rank,
        maxIter=maxIter,
        regParam=regParam,
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        # Rows whose prediction would be NaN are dropped from the output.
        coldStartStrategy="drop",
        seed=seed,
        # Enforce non-negative factors
        nonnegative=True,
    )
    model = ALS(**als_settings).fit(train_data)

    val_predictions = model.transform(val_data)

    print("ALS Model Performance:")
    als_metrics = evaluate_model(val_predictions)

    # The reported time covers fitting plus the validation scoring above.
    print(f"ALS model trained in {time.time() - began:.2f} seconds")

    return model, als_metrics

# Function to tune ALS parameters
def tune_als_model(train_data, val_data, ranks=(10, 50, 100), regParams=(0.01, 0.1, 1.0), maxIters=(5, 10), seed=42):
    """Grid-search ALS hyper-parameters and keep the lowest-RMSE model.

    Every combination of ``ranks`` x ``regParams`` x ``maxIters`` is fitted
    on ``train_data`` and scored (RMSE and MAE) on ``val_data``.

    Args:
        train_data: Ratings with ``userId``, ``movieId`` and ``rating``.
        val_data: Ratings used to score each candidate.
        ranks: Iterable of ALS ``rank`` values to try.
        regParams: Iterable of ALS ``regParam`` values to try.
        maxIters: Iterable of ALS ``maxIter`` values to try.
        seed: ALS ``seed`` used for every candidate.

    Returns:
        ``(best_model, results_df)``. ``results_df`` is a pandas DataFrame
        with one row per combination (``rank``, ``regParam``, ``maxIter``,
        ``RMSE``, ``MAE``). ``best_model`` is ``None`` when the grid is empty
        or when no combination produced an RMSE below infinity (for example
        if every RMSE came back as NaN).
    """
    from itertools import product

    print("Tuning ALS model parameters...")
    start_time = time.time()

    # Same visiting order as three nested loops over the inputs.
    grid = list(product(ranks, regParams, maxIters))
    if not grid:
        print("No parameter combinations to try; returning no model.")
        return None, pd.DataFrame([])

    # Evaluators do not depend on the candidate, so build them once.
    evaluator = RegressionEvaluator(
        metricName="rmse",
        labelCol="rating",
        predictionCol="prediction"
    )
    evaluator_mae = RegressionEvaluator(
        metricName="mae",
        labelCol="rating",
        predictionCol="prediction"
    )

    best_rmse = float('inf')
    best_params = None
    best_model = None
    results = []

    for rank, regParam, maxIter in grid:
        print(f"Trying rank={rank}, regParam={regParam}, maxIter={maxIter}")

        als = ALS(
            rank=rank,
            maxIter=maxIter,
            regParam=regParam,
            userCol="userId",
            itemCol="movieId",
            ratingCol="rating",
            coldStartStrategy="drop",
            seed=seed,
            nonnegative=True
        )
        model = als.fit(train_data)
        predictions = model.transform(val_data)

        rmse = evaluator.evaluate(predictions)
        mae = evaluator_mae.evaluate(predictions)

        results.append({
            'rank': rank,
            'regParam': regParam,
            'maxIter': maxIter,
            'RMSE': rmse,
            'MAE': mae
        })

        print(f"RMSE: {rmse:.4f}, MAE: {mae:.4f}")

        # A NaN RMSE never compares as smaller, so it is never selected.
        if rmse < best_rmse:
            best_rmse = rmse
            best_params = (rank, regParam, maxIter)
            best_model = model

    results_df = pd.DataFrame(results)

    if best_params is None:
        # Keep the collected results instead of failing on the unpack below.
        print("\nNo parameter combination produced a usable RMSE; returning no model.")
        print(f"ALS parameter tuning completed in {time.time() - start_time:.2f} seconds")
        return None, results_df

    best_rank, best_regParam, best_maxIter = best_params
    print(f"\nBest parameters: rank={best_rank}, regParam={best_regParam}, maxIter={best_maxIter}")
    print(f"Best RMSE: {best_rmse:.4f}")

    print(f"ALS parameter tuning completed in {time.time() - start_time:.2f} seconds")

    return best_model, results_df

# Function for generating recommendations
def generate_recommendations(model, movies_df, user_id, spark, n=10):
    """Return the top-``n`` movie recommendations for one user.

    Args:
        model: Fitted model exposing ``recommendForUserSubset``.
        movies_df: DataFrame with ``movieId``, ``title`` and ``genres``.
        user_id: Id of the user to recommend for.
        spark: Session used to build the one-row user DataFrame.
        n: Number of recommendations requested from the model.

    Returns:
        A list of dicts with keys ``movieId``, ``title``, ``genres`` and
        ``predicted_rating``, in the order the model returned them.
        Recommended ids missing from ``movies_df`` are skipped. The list is
        empty when the model returns no row for the user.
    """
    users = spark.createDataFrame([(user_id,)], ["userId"])

    # Collect once: checking emptiness separately and then collecting would
    # run the recommendation job twice.
    rec_rows = model.recommendForUserSubset(users, n).collect()
    if not rec_rows:
        return []

    ranked = [(rec.movieId, rec.rating) for rec in rec_rows[0].recommendations]

    # Fetch title/genres only for the recommended ids.
    wanted_ids = [movie_id for movie_id, _ in ranked]
    movie_rows = movies_df.filter(col("movieId").isin(wanted_ids)).collect()
    movie_by_id = {movie.movieId: movie for movie in movie_rows}

    return [
        {
            "movieId": movie_id,
            "title": movie_by_id[movie_id].title,
            "genres": movie_by_id[movie_id].genres,
            "predicted_rating": rating
        }
        for movie_id, rating in ranked
        if movie_id in movie_by_id
    ]

# Function to save model and artifacts
def save_model_artifacts(model, output_dir="models"):
    """Persist ``model`` under ``<output_dir>/als_model``.

    Args:
        model: Object exposing ``write().overwrite().save(path)``.
        output_dir: Directory that will contain the saved model; created
            (including parents) when it does not exist yet.

    Returns:
        The path the model was saved to.
    """
    # exist_ok avoids the check-then-create race of testing for the
    # directory first and creating it afterwards.
    os.makedirs(output_dir, exist_ok=True)

    # Any model already stored at this path is overwritten.
    model_path = os.path.join(output_dir, "als_model")
    model.write().overwrite().save(model_path)

    print(f"Model saved to {model_path}")

    return model_path

# Main function to run the entire pipeline
def main():
    """Run the whole pipeline end to end.

    Steps, in order: start Spark, load and filter the data, derive movie
    features, split the ratings, evaluate the baselines, train and save an
    ALS model, score it on the test set, print sample recommendations, and
    write a metrics summary to ``model_metrics.csv``. The Spark session is
    stopped even when a step fails.
    """
    # Adjust memory based on your system
    spark = init_spark(memory="6g")

    try:
        # Data preparation
        ratings_df, movies_df, _n_users, _n_movies = load_data(spark, "ml-32m", min_ratings=5)
        movies_df = process_movie_features(spark, movies_df)
        train_data, val_data, test_data = train_val_test_split(ratings_df)

        # Baselines (only the two metric dicts are used below)
        baseline_outputs = train_baseline_models(train_data, val_data)
        global_mean_metrics, bias_model_metrics = baseline_outputs[0], baseline_outputs[1]

        # ALS: train, persist, then score on the held-out test set
        als_model, als_metrics = train_als_model(train_data, val_data, rank=50, regParam=0.1, maxIter=10)
        save_model_artifacts(als_model)

        test_predictions = als_model.transform(test_data)
        print("\nALS Model Performance on Test Set:")
        test_metrics = evaluate_model(test_predictions)

        # Example recommendations for one arbitrary user from the ratings
        first_user_row = ratings_df.select("userId").distinct().limit(1).collect()[0]
        sample_user_id = first_user_row[0]
        print(f"\nExample recommendations for user {sample_user_id}:")
        recs = generate_recommendations(als_model, movies_df, sample_user_id, spark, n=5)
        for position, rec in enumerate(recs, start=1):
            print(f"{position}. {rec['title']} - Predicted rating: {rec['predicted_rating']:.2f}")

        # One summary row per model
        all_metrics = dict([
            ("Global Mean", global_mean_metrics),
            ("Bias Model", bias_model_metrics),
            ("ALS", als_metrics),
            ("ALS (Test)", test_metrics),
        ])
        metrics_df = pd.DataFrame.from_dict(all_metrics, orient='index')
        print("\nModel Performance Summary:")
        print(metrics_df)

        metrics_df.to_csv("model_metrics.csv")

        print("\nRecommendation model pipeline completed successfully!")

    finally:
        # Always release the Spark session
        spark.stop()

# Script entry point: runs the pipeline only when executed directly.
if __name__ == "__main__":
    # Binds `lit` as a module-level name before main() runs; this import is
    # skipped when the file is imported as a module.
    from pyspark.sql.functions import lit
    main()

25/04/13 16:33:59 WARN Utils: Your hostname, lenovo-server resolves to a loopback address: 127.0.1.1; using 192.168.100.30 instead (on interface eno1)
25/04/13 16:33:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/13 16:33:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/13 16:34:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Loading data with PySpark...


                                                                                

Data loaded in 40.93 seconds
Filtered dataset: 200,948 users, 43,884 movies, 31,921,467 ratings


                                                                                

Training set: 22,343,094 ratings (69.99%)
Validation set: 4,787,886 ratings (15.00%)
Test set: 4,790,487 ratings (15.01%)
Global mean rating: 3.5421
Global Mean Model Performance:


                                                                                

RMSE: 1.0590
MAE: 0.8381

Bias Model Performance:


                                                                                

RMSE: 0.8788
MAE: 0.6680
Baseline models trained and evaluated in 17.70 seconds
Training ALS model (rank=50, regParam=0.1, maxIter=10)...


                                                                                

ALS Model Performance:


                                                                                

RMSE: 0.8075
MAE: 0.6276
ALS model trained in 184.41 seconds


                                                                                

Model saved to models/als_model

ALS Model Performance on Test Set:


                                                                                

RMSE: 0.8074
MAE: 0.6278


                                                                                


Example recommendations for user 148:


                                                                                

1. Once in a Summer (2006) - Predicted rating: 4.39
2. Great Passage, The (Fune wo amu) (2013) - Predicted rating: 4.27
3. Voice of Silence (2020) - Predicted rating: 4.04
4. Warkop DKI Reborn: Jangkrik Boss! (2016) - Predicted rating: 3.96
5. Dara O'Briain Crowd Tickler (2015) - Predicted rating: 3.88

Model Performance Summary:
                 RMSE       MAE
Global Mean  1.059020  0.838117
Bias Model   0.878832  0.668020
ALS          0.807510  0.627639
ALS (Test)   0.807448  0.627759

Recommendation model pipeline completed successfully!
