In [None]:
# Loading modules that we need
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Add your imports below this line

In [None]:
# A helper function to load a table (stored in Parquet format) from DBFS as a Spark DataFrame 
def load_df(table_name: str) -> DataFrame:
    """Load a table stored in Parquet format as a Spark DataFrame.

    Args:
        table_name: Location of the Parquet table to load, for example
            "/user/hive/warehouse/users". Despite the parameter name, the
            value is passed to the Parquet reader unchanged.

    Returns:
        The DataFrame returned by ``spark.read.parquet(table_name)``.
    """
    # `spark` is not defined in this cell; presumably it is the SparkSession
    # that the notebook runtime provides — confirm when running elsewhere.
    return spark.read.parquet(table_name)

# Read the two tables the analysis works with, users first and then posts.
users_df, posts_df = (
    load_df(table_path)
    for table_path in (
        "/user/hive/warehouse/users",
        "/user/hive/warehouse/posts",
    )
)

# Optional tables: uncomment if you need them
# comments_df = load_df("/user/hive/warehouse/comments")
# badges_df = load_df("/user/hive/warehouse/badges")

#### The problem: mining the interests of experts

The primary role of a question-and-answer platform such as Stack Exchange is to connect two types of people: people who have questions in areas such as computer science or data science, and knowledgeable people who can answer those questions reliably. Let's call the first category of people 'knowledge seekers' and the second one 'expert users', or 'experts' for short.

Here we want to use our data to answer a question about the diversity of topics that experts are interested in. We want to know whether expert users only answer questions in a specific set of topics or whether their interests span a wide variety of topics.

To answer the above question, we will compute the correlation between a user's expertise level and the diversity of topics of the questions they have answered. The first step is to define two variables (or measures): one for 'user expertise level' and one for 'user interest diversity'. Then we will use the Pearson correlation coefficient to measure the linear correlation between the two variables. We define the variables as:

   - VariableA (the measure of user expertise level). We will use the 'Reputation' column from 'users' table, which according to Stack Exchange's documentation "is a rough measurement of how much the community trusts you; it is earned by convincing your peers that you know what you're talking about" as an indicator of a user's expertise level on the platform. 

   - VariableB (the measure of user interest diversity). We measure the diversity of a user's interests as the number of distinct tags associated with the questions the user has answered, divided by the total number of unique tags, which is 638.

Compute the Pearson correlation coefficient between VariableA and VariableB, and based on the result you've got, answer the following question: 

     Do expert users have specific interests, or do they have general interests?

Please explain your reasoning on how you reached your answer.

You should use Apache Spark API for your implementation. You can use the Spark implementation of the Pearson correlation coefficient.

#### Our solution

In order to solve the task, we:
1. Filter `posts_df` into two new DataFrames `questions_df` and `answers_df` based on the post type
2. Join `answers_df` with `questions_df` on the answer's `ParentId`, which points to the parent question, to get the parent's tags for each answer
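3. Group the answers by user and reduce the tags of all the questions a user has answered to the set of unique tags
4. Divide the number of unique tags per user by the total number of unique tags (638) to get the user's Interest Diversity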
5. Join the resulting DataFrame with `users_df` to combine Interest Diversity with the user ID and Reputation
6. Calculate the Pearson correlation coefficient between Reputation and Interest Diversity with `df.corr`

A Pearson correlation coefficient of 0.722 indicates a relatively strong positive linear correlation between `Interest Diversity` and `Reputation`: users with a higher reputation tend to have answered questions that cover a larger number of unique tags. Since we use reputation as the measure of expertise, this suggests that expert users of the platform have fairly general interests rather than specific ones. Note that the coefficient is computed only over users who have answered at least one question, because users without answers are dropped by the inner join with `users_df`.

In [None]:
import pyspark.sql.functions as F
# Importing GraphFrames graph library; make sure you have GraphFrames installed on the cluster
from graphframes import *

# Total number of unique tags, as given in the problem statement; used as the
# denominator of the Interest Diversity measure.
TOTAL_UNIQUE_TAGS = 638

# Split the posts by type: PostTypeId 1 is kept as the questions, PostTypeId 2
# as the answers. The aliases let the self-join below name each side explicitly.
questions_df = posts_df.where(F.col("PostTypeId") == 1).alias("questions")
answers_df = posts_df.where(F.col("PostTypeId") == 2).alias("answers")

# Pair every answer with its parent question (answers.ParentId -> questions.Id)
# so that each answer row carries the parent question's Tags and the answer's owner.
answers_with_tags_df = answers_df.join(
    questions_df,
    on=F.col("answers.ParentId") == F.col("questions.Id"),
).select(
    F.col("answers.ParentId").alias("ParentId"),
    F.col("questions.Tags").alias("Tags"),
    F.col("answers.OwnerUserId").alias("UserId"),
)

# For every UserId: gather the Tags arrays of all answered questions, flatten them
# into one array and drop duplicates ("Unique Tags"). The Interest Diversity is the
# size of that array divided by the total number of unique tags.
interest_df = (
    answers_with_tags_df
    .groupBy("UserId")
    .agg(
        F.array_distinct(F.flatten(F.collect_list("Tags"))).alias("Unique Tags")
    )
    .withColumn(
        "Interest Diversity",
        F.size("Unique Tags") / TOTAL_UNIQUE_TAGS,
    )
)

# Attach each user's Reputation to their Interest Diversity. The join is an inner
# join, so only users that appear in interest_df are kept.
solution_df = users_df.join(
    interest_df,
    on=users_df["Id"] == interest_df["UserId"],
    how="inner",
).select("UserId", "Reputation", "Interest Diversity")

display(solution_df)

# Pearson correlation coefficient between the two measures
solution_df.stat.corr("Interest Diversity", "Reputation", method="pearson")

UserId,Reputation,Interest Diversity
9,1102,0.0094043887147335
11,213,0.0047021943573667
14,2782,0.0407523510971786
17,236,0.012539184952978
21,5904,0.2006269592476489
22,323,0.006269592476489
24,171,0.0047021943573667
26,2952,0.0376175548589341
34,173,0.0047021943573667
36,325,0.0031347962382445


Out[75]: 0.7217677648623019