In [1]:
# PySpark Pearson correlation function for distributed data processing

# Configure access to the data in Google Cloud Storage (GCS)

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sqrt
from pyspark.sql.types import StructType, StructField, FloatType, IntegerType
from scipy.stats import pearsonr
from itertools import combinations

# 1. GCP Project ID (the ID, not the display Project Name) of the Dataproc setup
project_id = 'amishr96-cis415-2025springc'

# 2. Google Cloud Storage bucket that holds the lab data
bucket = 'amishr96_data_for_gcp_labs'

# 3. Location of the movie reviews CSV inside the bucket
path_to_data_files = "/data_for_assignment/"
movie_reviews_file_name = "movie_ratings (1).csv"
# Drop the leading "/" so the object path can be appended to the bucket URI.
relative_path_to_file = f"{path_to_data_files[1:]}{movie_reviews_file_name}"
full_file_path = f"gs://{bucket}/{relative_path_to_file}"

print(f"ProjectID (and not the Project Name) is: {project_id}")
print(f"Bucket name is: {bucket}")



ProjectID (and not the Project Name) is: amishr96-cis415-2025springc
Bucket name is: amishr96_data_for_gcp_labs


In [2]:
# READ DATA

# On GCP, Spark can read the CSV straight from Google Cloud Storage.
# NOTE(review): `spark` is not created in this notebook - presumably the
# session is injected by the Dataproc kernel; confirm before running elsewhere.
print("Reading the file using spark directly from GCS")
spark_df = spark.read.csv(
    full_file_path,
    header=True,       # first line of the file holds the column names
    inferSchema=True,  # let Spark infer column types instead of reading all strings
    sep=",",
)

# Summary statistics for every column
spark_df.describe().show()

Reading the file using spark directly from GCS


25/03/08 04:04:08 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 4:>                                                          (0 + 1) / 1]

+-------+---------+------------------+------------------+------------------+------------------+
|summary|     Name|         Inception|           Titanic|            Avatar|        The Matrix|
+-------+---------+------------------+------------------+------------------+------------------+
|  count|      100|               100|               100|               100|               100|
|   mean|     NULL| 3.113999999999999|2.9800000000000004|3.1839999999999997|2.9620000000000006|
| stddev|     NULL|1.1372499170091743|1.2192894105447922|1.1451893231507992|1.0880229498470788|
|    min|Alexander|               1.0|               1.0|               1.0|               1.0|
|    max|  William|               5.0|               5.0|               4.9|               4.9|
+-------+---------+------------------+------------------+------------------+------------------+



                                                                                

In [3]:
# Print the column names and the types Spark inferred while reading the CSV
spark_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Inception: double (nullable = true)
 |-- Titanic: double (nullable = true)
 |-- Avatar: double (nullable = true)
 |-- The Matrix: double (nullable = true)



In [5]:
# If the rating columns were read as strings, they must be converted to double

from pyspark.sql.functions import col

# Cast each movie-rating column to double, replacing the column in place
# (same four columns, same order as the original one-line-per-column version).
for movie_column in ("Inception", "Titanic", "Avatar", "The Matrix"):
    spark_df = spark_df.withColumn(movie_column, col(movie_column).cast("double"))

In [6]:
# Re-print the schema to confirm the rating columns are double after the casts
spark_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Inception: double (nullable = true)
 |-- Titanic: double (nullable = true)
 |-- Avatar: double (nullable = true)
 |-- The Matrix: double (nullable = true)



In [7]:
# Task 1: Your Code Starts Here to collect the ratings as a dictionary
# Hint: In Assignment 3, we have the code that collects the ratings as a dictionary. Can you find it and revise it for this task?
from pyspark.sql.functions import collect_list

# Movie columns copied into each user's ratings dictionary, in this order
rated_movies = ("Inception", "Titanic", "Avatar", "The Matrix")

# Build {user_name: {movie: rating}} on the driver.
# collectAsMap() keeps a single entry per key, so a repeated Name would keep
# only one of its rows.
ratings_dict = (
    spark_df.rdd
    .map(lambda row: (row["Name"], {movie: row[movie] for movie in rated_movies}))
    .collectAsMap()
)

sample_entries = list(ratings_dict.items())[:5]
print("Ratings Dictionary Sample:", sample_entries)



[Stage 5:>                                                          (0 + 1) / 1]

Ratings Dictionary Sample: [('Rachel', {'Inception': 4.5, 'Titanic': 2.3, 'Avatar': 3.6, 'The Matrix': 3.0}), ('Larry', {'Inception': 2.1, 'Titanic': 4.2, 'Avatar': 2.9, 'The Matrix': 3.3}), ('Edward', {'Inception': 4.7, 'Titanic': 1.1, 'Avatar': 4.8, 'The Matrix': 3.4}), ('Gary', {'Inception': 4.2, 'Titanic': 2.2, 'Avatar': 4.4, 'The Matrix': 3.8}), ('Debra', {'Inception': 1.8, 'Titanic': 4.2, 'Avatar': 3.9, 'The Matrix': 2.7})]


                                                                                

In [8]:
# Task 2: Your Code Starts Here to display the first five entries in the dictionary
print("First five entries in the dictionary:")
# Take the first five (user, ratings) entries in the dictionary's own order
first_five_entries = list(ratings_dict.items())[:5]
for entry in first_five_entries:
    user, ratings = entry
    print(f"User: {user}, Ratings: {ratings}")

First five entries in the dictionary:
User: Rachel, Ratings: {'Inception': 4.5, 'Titanic': 2.3, 'Avatar': 3.6, 'The Matrix': 3.0}
User: Larry, Ratings: {'Inception': 2.1, 'Titanic': 4.2, 'Avatar': 2.9, 'The Matrix': 3.3}
User: Edward, Ratings: {'Inception': 4.7, 'Titanic': 1.1, 'Avatar': 4.8, 'The Matrix': 3.4}
User: Gary, Ratings: {'Inception': 4.2, 'Titanic': 2.2, 'Avatar': 4.4, 'The Matrix': 3.8}
User: Debra, Ratings: {'Inception': 1.8, 'Titanic': 4.2, 'Avatar': 3.9, 'The Matrix': 2.7}


In [9]:
# Build every unordered pair of user names; iterating the dict yields its keys
user_pairs = [*combinations(ratings_dict, 2)]

print(user_pairs[0])



('Rachel', 'Larry')


In [12]:
from scipy.stats import pearsonr

# Pearson correlation for every pair of users.
#
# Ratings are paired by movie title, and only movies that BOTH users rated
# (non-null) are used. The previous version compared the raw value lists:
#   * its "at least two ratings" guard measured the dict length, which is
#     always 4, so it could never fire;
#   * a null rating (the columns are nullable) was passed to pearsonr as None,
#     which raises instead of being skipped.
# Each entry of `results` is (user1_name, user2_name, correlation, p_value).
results = []
for user1_name, user2_name in user_pairs:
    # Guard against a `user_pairs` list that is stale relative to
    # `ratings_dict` (e.g. cells executed out of order).
    if user1_name not in ratings_dict or user2_name not in ratings_dict:
        print(f"Skipping pair ({user1_name}, {user2_name}): Missing data")
        continue

    user1_by_movie = ratings_dict[user1_name]
    user2_by_movie = ratings_dict[user2_name]

    # Movies with a non-null rating from both users
    shared_movies = [
        movie
        for movie, rating in user1_by_movie.items()
        if rating is not None and user2_by_movie.get(movie) is not None
    ]

    # pearsonr needs at least two paired observations
    if len(shared_movies) < 2:
        print(f"Skipping pair ({user1_name}, {user2_name}): Not enough data points")
        continue

    ratings_user1 = [user1_by_movie[movie] for movie in shared_movies]
    ratings_user2 = [user2_by_movie[movie] for movie in shared_movies]

    correlation, p_value = pearsonr(ratings_user1, ratings_user2)
    results.append((user1_name, user2_name, correlation, p_value))

# Display results
for result in results:
    print(f"User1: {result[0]}, User2: {result[1]}, Correlation: {result[2]:.2f}, P-value: {result[3]:.5f}")


User1: Rachel, User2: Larry, Correlation: -0.99, P-value: 0.00754
User1: Rachel, User2: Edward, Correlation: 0.88, P-value: 0.11612
User1: Rachel, User2: Gary, Correlation: 0.82, P-value: 0.18025
User1: Rachel, User2: Debra, Correlation: -0.74, P-value: 0.25565
User1: Rachel, User2: Carol, Correlation: -0.11, P-value: 0.88959
User1: Rachel, User2: Kimberly, Correlation: 0.19, P-value: 0.80790
User1: Rachel, User2: Sarah, Correlation: 0.84, P-value: 0.15975
User1: Rachel, User2: Christine, Correlation: 0.20, P-value: 0.79784
User1: Rachel, User2: Matthew, Correlation: 0.13, P-value: 0.87219
User1: Rachel, User2: Nancy, Correlation: 0.92, P-value: 0.07621
User1: Rachel, User2: Margaret, Correlation: -0.98, P-value: 0.02364
User1: Rachel, User2: Richard, Correlation: 0.51, P-value: 0.49439
User1: Rachel, User2: Robert, Correlation: 0.31, P-value: 0.68560
User1: Rachel, User2: Patricia, Correlation: -0.95, P-value: 0.05093
User1: Rachel, User2: Michelle, Correlation: 0.44, P-value: 0.56495

In [13]:
# Task 3: Your Code Starts Here to print the results in the ascending order of p_values; please only print the results that have p-values less than 0.05
# Task 3: Your Code Starts Here

from scipy.stats import pearsonr

# Significant correlations only: (user1_name, user2_name, correlation, p_value)
results = []

# Generate all unique pairs of users
user_pairs = list(combinations(ratings_dict.keys(), 2))

for user1_name, user2_name in user_pairs:
    # Both names come straight from ratings_dict's keys, so no membership
    # check is needed here.
    user1_by_movie = ratings_dict[user1_name]
    user2_by_movie = ratings_dict[user2_name]

    # Pair ratings by movie title and keep only movies both users rated.
    # A null rating would otherwise reach pearsonr as None and raise.
    shared_movies = [
        movie
        for movie, rating in user1_by_movie.items()
        if rating is not None and user2_by_movie.get(movie) is not None
    ]

    # pearsonr needs at least two paired observations; skip the pair otherwise
    if len(shared_movies) < 2:
        continue

    ratings_user1 = [user1_by_movie[movie] for movie in shared_movies]
    ratings_user2 = [user2_by_movie[movie] for movie in shared_movies]
    correlation, p_value = pearsonr(ratings_user1, ratings_user2)

    # Only store results where p-value < 0.05
    # (a NaN p-value compares False and is therefore excluded as well)
    if p_value < 0.05:
        results.append((user1_name, user2_name, correlation, p_value))

# Sort results in ascending order of p-value
sorted_results = sorted(results, key=lambda x: x[3])

# Print results
print("Significant Pearson Correlations (p < 0.05):")
for user1, user2, corr, p_val in sorted_results:
    print(f"User1: {user1}, User2: {user2}, Correlation: {corr:.2f}, P-value: {p_val:.5f}")

# Task 3: Your Code Ends Here


Significant Pearson Correlations (p < 0.05):
User1: Mark, User2: Ryan, Correlation: 1.00, P-value: 0.00003
User1: Brian, User2: Karen, Correlation: 1.00, P-value: 0.00047
User1: Joseph, User2: Ruth, Correlation: 1.00, P-value: 0.00057
User1: Rachel, User2: Andrew, Correlation: 1.00, P-value: 0.00061
User1: Matthew, User2: Amy, Correlation: 1.00, P-value: 0.00088
User1: Stephanie, User2: Catherine, Correlation: 1.00, P-value: 0.00103
User1: Patricia, User2: Timothy, Correlation: -1.00, P-value: 0.00132
User1: Charles, User2: Anna, Correlation: -1.00, P-value: 0.00168
User1: Donald, User2: James, Correlation: 1.00, P-value: 0.00232
User1: Kathleen, User2: Stephanie, Correlation: 1.00, P-value: 0.00239
User1: Michelle, User2: Jonathan, Correlation: -1.00, P-value: 0.00257
User1: Angela, User2: Gregory, Correlation: -1.00, P-value: 0.00291
User1: Kathleen, User2: Ashley, Correlation: -1.00, P-value: 0.00296
User1: Ashley, User2: Catherine, Correlation: -1.00, P-value: 0.00297
User1: Janet,

In [14]:
# Task 4: Your Code Starts Here to find out if Helen and Jonathan have similar or opposite preferences to movies based on the correlation coefficient.
# Task 4: Your Code Starts Here

import math

from scipy.stats import pearsonr

# Check if both users exist in the dataset
if "Helen" in ratings_dict and "Jonathan" in ratings_dict:
    helen_by_movie = ratings_dict["Helen"]
    jonathan_by_movie = ratings_dict["Jonathan"]

    # Pair ratings by movie title and keep only movies both users rated.
    # A null rating would otherwise reach pearsonr as None and raise.
    shared_movies = [
        movie
        for movie, rating in helen_by_movie.items()
        if rating is not None and jonathan_by_movie.get(movie) is not None
    ]
    ratings_helen = [helen_by_movie[movie] for movie in shared_movies]
    ratings_jonathan = [jonathan_by_movie[movie] for movie in shared_movies]

    # Ensure both users have rated at least two movies in common
    # (the earlier check measured the dict length, which is always 4)
    if len(shared_movies) > 1:
        correlation, p_value = pearsonr(ratings_helen, ratings_jonathan)

        # Print results
        print(f"Correlation between Helen and Jonathan: {correlation:.2f} (p={p_value:.5f})")

        # Determine similarity or opposition from the sign of the coefficient
        if math.isnan(correlation):
            # NaN would otherwise fall into the final else branch and be
            # misreported as "no correlation".
            print("Correlation between Helen and Jonathan is undefined (one of them gave constant ratings).")
        elif correlation > 0:
            print("Helen and Jonathan have **similar** movie preferences.")
        elif correlation < 0:
            print("Helen and Jonathan have **opposite** movie preferences.")
        else:
            print("Helen and Jonathan have **no correlation** in their movie preferences.")
    else:
        print("Not enough data for Helen and Jonathan to compute correlation.")
else:
    print("Helen or Jonathan not found in the dataset.")

# Task 4: Your Code Ends Here


Correlation between Helen and Jonathan: -0.95 (p=0.04989)
Helen and Jonathan have **opposite** movie preferences.
