## Importing Libraries

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

**1341. Movie Rating (Medium)**

**Table: Movies**

| Column Name   | Type    |
|---------------|---------|
| movie_id      | int     |
| title         | varchar |

movie_id is the primary key (column with unique values) for this table.
title is the name of the movie.
 
**Table: Users**

| Column Name   | Type    |
|---------------|---------|
| user_id       | int     |
| name          | varchar |

user_id is the primary key (column with unique values) for this table.
The column 'name' has unique values.

**Table: MovieRating**

| Column Name   | Type    |
|---------------|---------|
| movie_id      | int     |
| user_id       | int     |
| rating        | int     |
| created_at    | date    |

(movie_id, user_id) is the primary key (combination of columns with unique values) for this table.
This table contains the rating of a movie by a user in their review.
created_at is the user's review date. 
 
**Write a solution to:**

- Find the name of the user who has rated the greatest number of movies. In case of a tie, return the lexicographically smaller user name.
- Find the movie name with the highest average rating in February 2020. In case of a tie, return the lexicographically smaller movie name.

The result format is in the following example.

**Example 1:**

**Input:**
**Movies table:**

| movie_id    |  title       |
|-------------|--------------|
| 1           | Avengers     |
| 2           | Frozen 2     |
| 3           | Joker        |

**Users table:**

| user_id     |  name        |
|-------------|--------------|
| 1           | Daniel       |
| 2           | Monica       |
| 3           | Maria        |
| 4           | James        |

**MovieRating table:**

| movie_id    | user_id      | rating       | created_at  |
|-------------|--------------|--------------|-------------|
| 1           | 1            | 3            | 2020-01-12  |
| 1           | 2            | 4            | 2020-02-11  |
| 1           | 3            | 2            | 2020-02-12  |
| 1           | 4            | 1            | 2020-01-01  |
| 2           | 1            | 5            | 2020-02-17  | 
| 2           | 2            | 2            | 2020-02-01  | 
| 2           | 3            | 2            | 2020-03-01  |
| 3           | 1            | 3            | 2020-02-22  | 
| 3           | 2            | 4            | 2020-02-25  | 

**Output:**

| results      |
|--------------|
| Daniel       |
| Frozen 2     |

**Explanation:**
Daniel and Monica have rated 3 movies ("Avengers", "Frozen 2" and "Joker") but Daniel is smaller lexicographically.
Frozen 2 and Joker have a rating average of 3.5 in February but Frozen 2 is smaller lexicographically.

In [0]:
# Movies fixture from Example 1: one (movie_id, title) tuple per movie.
movies_columns_1341 = ["movie_id", "title"]
movies_data_1341 = [(1, "Avengers"), (2, "Frozen 2"), (3, "Joker")]

movies_df_1341 = spark.createDataFrame(
    data=movies_data_1341,
    schema=movies_columns_1341,
)
movies_df_1341.show()

# Users fixture from Example 1: one (user_id, name) tuple per user.
users_columns_1341 = ["user_id", "name"]
users_data_1341 = [(1, "Daniel"), (2, "Monica"), (3, "Maria"), (4, "James")]

users_df_1341 = spark.createDataFrame(
    data=users_data_1341,
    schema=users_columns_1341,
)
users_df_1341.show()

# MovieRating fixture from Example 1:
# one (movie_id, user_id, rating, created_at) tuple per review.
movie_ratings_data_1341 = [
    (1, 1, 3, "2020-01-12"),
    (1, 2, 4, "2020-02-11"),
    (1, 3, 2, "2020-02-12"),
    (1, 4, 1, "2020-01-01"),
    (2, 1, 5, "2020-02-17"),
    (2, 2, 2, "2020-02-01"),
    (2, 3, 2, "2020-03-01"),
    (3, 1, 3, "2020-02-22"),
    (3, 2, 4, "2020-02-25"),
]

ratings_columns_1341 = ["movie_id", "user_id", "rating", "created_at"]

# The problem statement declares created_at as a DATE, but the literals above
# are plain strings, so the column would otherwise be a string column.
# Parse it explicitly (default yyyy-MM-dd format) so date functions applied to
# it do not depend on an implicit string-to-date cast.
ratings_df_1341 = spark.createDataFrame(
    movie_ratings_data_1341, ratings_columns_1341
).withColumn("created_at", to_date("created_at"))
ratings_df_1341.show()

+--------+--------+
|movie_id|   title|
+--------+--------+
|       1|Avengers|
|       2|Frozen 2|
|       3|   Joker|
+--------+--------+

+-------+------+
|user_id|  name|
+-------+------+
|      1|Daniel|
|      2|Monica|
|      3| Maria|
|      4| James|
+-------+------+

+--------+-------+------+----------+
|movie_id|user_id|rating|created_at|
+--------+-------+------+----------+
|       1|      1|     3|2020-01-12|
|       1|      2|     4|2020-02-11|
|       1|      3|     2|2020-02-12|
|       1|      4|     1|2020-01-01|
|       2|      1|     5|2020-02-17|
|       2|      2|     2|2020-02-01|
|       2|      3|     2|2020-03-01|
|       3|      1|     3|2020-02-22|
|       3|      2|     4|2020-02-25|
+--------+-------+------+----------+



In [0]:
# Number of rating rows per user, exposed as "rating_count".
user_ratings_df_1341 = (
    ratings_df_1341
    .groupBy("user_id")
    .count()
    .withColumnRenamed("count", "rating_count")
)

In [0]:
# Attach each user's name to their rating count (inner join on user_id).
user_with_names_df_1341 = user_ratings_df_1341.join(
    users_df_1341,
    on="user_id",
    how="inner",
)

In [0]:
# Ordering: most ratings first, then name ascending as the tie-breaker.
# No partitionBy, so the ordering spans the whole DataFrame.
user_window = Window.orderBy(
    col("rating_count").desc(),
    col("name").asc(),
)

In [0]:
# User who rated the most movies; ties go to the lexicographically smaller name.
#
# A global sort followed by limit(1) returns the same single row as ranking
# with row_number() over a partition-less window and keeping rank 1, but it
# avoids funnelling every row through one partition just to number them.
top_user_df_1341 = (
    user_with_names_df_1341
    .orderBy(desc("rating_count"), asc("name"))
    .limit(1)
    .select("name")
)



In [0]:
# Keep only the reviews whose created_at falls in February 2020.
feb_ratings_1341 = ratings_df_1341.where(
    (year(col("created_at")) == 2020) & (month(col("created_at")) == 2)
)

In [0]:
# Mean rating per movie over the February 2020 reviews.
avg_rating_per_movie_1341 = (
    feb_ratings_1341
    .groupBy("movie_id")
    .agg(avg(col("rating")).alias("avg_rating"))
)

In [0]:
# Attach each movie's title to its average rating (inner join on movie_id).
movie_with_titles_df_1341 = avg_rating_per_movie_1341.join(
    movies_df_1341,
    on="movie_id",
    how="inner",
)

In [0]:
# Ordering: highest average rating first, then title ascending as the tie-breaker.
# No partitionBy, so the ordering spans the whole DataFrame.
movie_window = Window.orderBy(
    col("avg_rating").desc(),
    col("title").asc(),
)

In [0]:
# Movie with the highest average rating; ties go to the lexicographically
# smaller title.
#
# A global sort followed by limit(1) returns the same single row as ranking
# with row_number() over a partition-less window and keeping rank 1, but it
# avoids funnelling every row through one partition just to number them.
top_movie_df_1341 = (
    movie_with_titles_df_1341
    .orderBy(desc("avg_rating"), asc("title"))
    .limit(1)
    .select("title")
)



In [0]:
# Stack the top user above the top movie and print them under a single
# "results" column. union() matches columns by position, so the combined
# frame keeps the first frame's column name until toDF renames it.
(
    top_user_df_1341
    .union(top_movie_df_1341)
    .toDF("results")
    .show()
)



+--------+
| results|
+--------+
|  Daniel|
|Frozen 2|
+--------+

