In [None]:
%matplotlib widget
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.interpolate import make_interp_spline
import pyspark.sql.functions as F

def _to_epoch_seconds(timestamps):
    """Return a datetime64 array-like as int64 seconds since the Unix epoch.

    Casting through ``datetime64[s]`` keeps the result correct whatever
    resolution the input carries, instead of assuming nanoseconds and
    dividing by ``10**9``.
    """
    return np.asarray(timestamps).astype('datetime64[s]').astype(np.int64)


def _collect_monthly_series(spark_df, value_col):
    """Collect ``review_year_month`` and *value_col* from *spark_df* into pandas.

    Returns a DataFrame with columns ``x`` (parsed timestamps) and ``y``
    (the values). Rows whose month cannot be parsed or whose value is null
    are dropped, and the result is sorted chronologically because the order
    of rows returned by ``collect()`` is not guaranteed.
    """
    rows = spark_df.select("review_year_month", value_col).collect()
    frame = pd.DataFrame({
        'x': pd.to_datetime([row["review_year_month"] for row in rows], errors='coerce'),
        'y': [row[value_col] for row in rows],
    })
    return frame.dropna().sort_values('x', kind='stable')


def _smooth_curve(x_dates, y_values, num_points=300):
    """Interpolate a smooth curve through the points ``(x_dates, y_values)``.

    Returns ``(x_seconds, y_smooth)`` where ``x_seconds`` are epoch seconds
    (convert back with ``pd.to_datetime(..., unit='s')``).

    ``make_interp_spline`` requires strictly increasing abscissae, so the
    points are sorted and duplicate timestamps are collapsed (first
    occurrence wins). The spline degree is lowered when fewer than four
    distinct points exist; with fewer than two points there is nothing to
    interpolate and the distinct points are returned as-is.
    """
    seconds = _to_epoch_seconds(x_dates)
    # np.unique returns the sorted distinct values plus the index of the
    # first occurrence of each, which picks the matching y values.
    seconds, first_idx = np.unique(seconds, return_index=True)
    values = np.asarray(y_values, dtype=float)[first_idx]

    if seconds.size < 2:
        return seconds, values

    degree = min(3, seconds.size - 1)  # a cubic spline needs >= 4 points
    grid = np.linspace(seconds[0], seconds[-1], num_points)
    return grid, make_interp_spline(seconds, values, k=degree)(grid)


def _plot_trends(series, title, y_label):
    """Draw smoothed lines and raw scatter points for several series.

    *series* is a sequence of
    ``(label, color, x_points, y_points, x_curve, y_curve)`` tuples, where
    ``x_curve`` holds epoch seconds. All lines are drawn before the scatter
    markers, matching the original draw order.
    """
    plt.figure(figsize=(12, 6))
    for label, color, _, _, x_curve, y_curve in series:
        plt.plot(pd.to_datetime(x_curve, unit='s'), y_curve, color=color, label=label, linewidth=2)
    for _, color, x_points, y_points, _, _ in series:
        plt.scatter(x_points, y_points, color=color)
    plt.title(title)
    plt.xlabel('Month/Year')
    plt.ylabel(y_label)
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid()
    plt.tight_layout()
    plt.show()


# --- Average rating per month: best-rated vs. most-rated -------------------
best_rated_df = _collect_monthly_series(df_best_rated, "avg_rating")
most_rated_df = _collect_monthly_series(df_most_rated, "avg_rating")

best_rated_x_clean = best_rated_df['x'].values
best_rated_y_clean = best_rated_df['y'].values

most_rated_x_clean = most_rated_df['x'].values
most_rated_y_clean = most_rated_df['y'].values

x_new_best, y_new_best = _smooth_curve(best_rated_x_clean, best_rated_y_clean)
x_new_most, y_new_most = _smooth_curve(most_rated_x_clean, most_rated_y_clean)

_plot_trends(
    [
        ('Best Rated', 'red', best_rated_x_clean, best_rated_y_clean, x_new_best, y_new_best),
        ('Most Rated', 'blue', most_rated_x_clean, most_rated_y_clean, x_new_most, y_new_most),
    ],
    title='Average Ratings Over Time',
    y_label='Average Rating',
)

# --- Average sentiment magnitude per month: best-rated vs. most-rated ------
best_rated_sentiment_df = _collect_monthly_series(df_best_rated_sentiment, "avg_magnitude")
most_rated_sentiment_df = _collect_monthly_series(df_most_rated_sentiment, "avg_magnitude")

best_rated_sentiment_x_clean = best_rated_sentiment_df['x'].values
best_rated_sentiment_y_clean = best_rated_sentiment_df['y'].values

most_rated_sentiment_x_clean = most_rated_sentiment_df['x'].values
most_rated_sentiment_y_clean = most_rated_sentiment_df['y'].values

x_new_best_sentiment, y_new_best_sentiment = _smooth_curve(
    best_rated_sentiment_x_clean, best_rated_sentiment_y_clean)
x_new_most_sentiment, y_new_most_sentiment = _smooth_curve(
    most_rated_sentiment_x_clean, most_rated_sentiment_y_clean)

_plot_trends(
    [
        ('Best Rated Sentiment', 'green',
         best_rated_sentiment_x_clean, best_rated_sentiment_y_clean,
         x_new_best_sentiment, y_new_best_sentiment),
        ('Most Rated Sentiment', 'orange',
         most_rated_sentiment_x_clean, most_rated_sentiment_y_clean,
         x_new_most_sentiment, y_new_most_sentiment),
    ],
    title='Sentiment Magnitude Over Time',
    y_label='Sentiment Magnitude',
)
