#### Names of people in the group

Please write the names of the people in your group in the next cell.

Anne Torgersen

Aaryan Neupane

In [None]:
# Loading modules that we need
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Add your imports below this line

In [None]:
# Helper that reads a table (stored in Parquet format) from DBFS into a Spark DataFrame
def load_df(table_name: "name of the table to load") -> DataFrame:
    """Read the Parquet data located at ``table_name`` and return it as a DataFrame.

    Uses the notebook-level ``spark`` session, which must already exist when
    this function is called.
    """
    parquet_reader = spark.read
    return parquet_reader.parquet(table_name)

# Load the users and posts tables; every later cell works from these two frames.
users_df = load_df("/user/hive/warehouse/users")
posts_df = load_df("/user/hive/warehouse/posts")

# Optional tables: uncomment the matching line if your analysis needs them
# comments_df = load_df("/user/hive/warehouse/comments")
# badges_df = load_df("/user/hive/warehouse/badges")

### The Problem: Mining the Interests of Experts

##### User reputation

In [None]:
## Keep only the columns needed for the reputation side of the analysis
# Expose users_df to Spark SQL under the temp view name "users".
users_df.createOrReplaceTempView("users")
query1 = '''
SELECT Id, Reputation
FROM users
'''
# usr_rep carries the Id and Reputation columns of every row in the users view.
usr_rep = spark.sql(query1)

##### User expertise

In [None]:
## Collect, per user, the tag strings of the posts they responded to
# The same DataFrame is registered under two view names so the query below can
# join posts against itself (child post -> parent post).
posts_df.createOrReplaceTempView("posts1")
posts_df.createOrReplaceTempView("posts2")
# For each posts1 row with PostTypeId = 2, look up the posts2 row whose Id equals
# its ParentId and gather that row's Tags into one list per OwnerUserId.
# NOTE(review): presumably PostTypeId = 2 marks answers and ParentId points at
# the answered question -- confirm against the dataset schema.
query2 = '''
SELECT posts1.OwnerUserId, collect_list(posts2.Tags) AS Tags
FROM posts1
INNER JOIN posts2 ON posts1.ParentId = posts2.Id
WHERE posts1.PostTypeId = 2
GROUP BY posts1.OwnerUserId
'''
# usr_exp has one row per OwnerUserId with a list of raw tag strings in Tags.
usr_exp = spark.sql(query2)

In [None]:
# Helper that turns a user's raw tag strings into an interest-diversity score
def get_tag_diversity(tag_list: list, total_tags: int = 638) -> float:
    """Return the share of the tag vocabulary that appears in ``tag_list``.

    Each entry is a raw tag string such as ``"<tag1><tag2>"``; a single
    bracketed tag (``"<tag>"``) or a bare tag (``"tag"``) is accepted as well.
    Tags are de-duplicated across all entries before counting.

    Args:
        tag_list: Raw tag strings. ``None`` (for the whole list or for a single
            entry) and empty/blank entries are ignored instead of being
            counted as a tag.
        total_tags: Size of the tag vocabulary used as the denominator.
            Defaults to 638, the value that was previously hard-coded.

    Returns:
        Number of distinct tags divided by ``total_tags``.

    Raises:
        ValueError: If ``total_tags`` is not positive.
    """
    if total_tags <= 0:
        raise ValueError("total_tags must be a positive number")

    distinct_tags = set()
    for raw_tags in tag_list or ():
        if not raw_tags:
            continue
        # Removing the outer brackets first lets a single split on "><" handle
        # both the one-tag and the many-tag form.
        for piece in raw_tags.strip().strip("<>").split("><"):
            tag_name = piece.strip()
            if tag_name:
                distinct_tags.add(tag_name)
    return len(distinct_tags) / total_tags

In [None]:
# Create a new dataframe with the user id and the user's interest diversity score.
# The score is computed through a UDF inside a select, so the rows stay in Spark
# instead of being collected to the driver and looped over in Python.
tag_diversity_udf = udf(get_tag_diversity, DoubleType())

usr_diversity = usr_exp.select(
    # Renamed to Id so it lines up with the Id column of the reputation frame.
    col("OwnerUserId").alias("Id"),
    tag_diversity_udf(col("Tags")).alias("InterestDiversity"),
)

##### Create joint dataframe

In [None]:
# Creating a joint dataframe with the user's id, reputation and interest diversity score
# Both inputs are registered as temp views so they can be joined with Spark SQL.
usr_rep.createOrReplaceTempView("usr_rep")
usr_diversity.createOrReplaceTempView("usr_div")

# Inner join on Id: only ids present in both views end up in final_df.
query3 = '''
SELECT usr_div.Id, usr_rep.Reputation, usr_div.InterestDiversity
FROM usr_rep INNER JOIN usr_div ON usr_div.Id = usr_rep.Id
'''
final_df = spark.sql(query3)

##### Correlation function

In [None]:
def compute_pearsons_r(df, col1, col2):
    """Return the correlation between two columns of ``df``.

    The value is obtained from Spark's ``corr`` aggregate applied to the
    columns named ``col1`` and ``col2``; the single aggregated cell is read
    from the first row of the result.
    """
    pearson_expr = corr(col(col1), col(col2))
    aggregated_row = df.select(pearson_expr).first()
    return aggregated_row[0]

# Correlation between reputation and interest diversity over the joined users
print(compute_pearsons_r(final_df, "Reputation","InterestDiversity"))

0.7217677648622973


Do expert users have specific interests, or do they have general interests? 
Kindly explain your thought process and rationale behind arriving at your answer.

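- The correlation coefficient of 0.72 indicates a strong positive relationship between Reputation points and Interest Diversity scores among users. This suggests that as users gain more reputation points (indicating expertise), they also tend to have higher interest diversity scores, indicating a broad range of interests rather than specific ones. One caveat: the diversity score is built from the tags of the questions a user has answered, so users who simply answer more questions may score higher on both measures; part of the correlation may therefore reflect activity level rather than breadth of interest alone.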