#### Names of people in the group

Please write the names of the people in your group in the next cell.

Anne Torgersen

Aaryan Neupane

In [None]:
# Loading modules that we need
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Add your imports below this line

In [None]:
# A helper function to load a table (stored in Parquet format) from DBFS as a Spark DataFrame 
def load_df(table_name: "name of the table to load") -> DataFrame:
    return spark.read.parquet(table_name)

users_df = load_df("/user/hive/warehouse/users")
posts_df = load_df("/user/hive/warehouse/posts")

# Uncomment if you need
# comments_df = load_df("/user/hive/warehouse/comments")
# badges_df = load_df("/user/hive/warehouse/badges")

### The Problem: Mining the Interests of Experts

##### User reputation

In [None]:
## Divide the needed data into a user reputation variable
users_df.createOrReplaceTempView("users")
query1 = '''
SELECT Id, Reputation
FROM users
'''
usr_rep = spark.sql(query1)
print(usr_rep.collect())

[Row(Id=-1, Reputation='1'), Row(Id=1, Reputation='101'), Row(Id=2, Reputation='101'), Row(Id=3, Reputation='101'), Row(Id=4, Reputation='101'), Row(Id=5, Reputation='215'), Row(Id=6, Reputation='101'), Row(Id=7, Reputation='101'), Row(Id=8, Reputation='101'), Row(Id=9, Reputation='1102'), Row(Id=10, Reputation='101'), Row(Id=11, Reputation='213'), Row(Id=12, Reputation='101'), Row(Id=14, Reputation='2782'), Row(Id=15, Reputation='101'), Row(Id=16, Reputation='1'), Row(Id=17, Reputation='236'), Row(Id=18, Reputation='101'), Row(Id=19, Reputation='101'), Row(Id=20, Reputation='101'), Row(Id=21, Reputation='5904'), Row(Id=22, Reputation='323'), Row(Id=23, Reputation='101'), Row(Id=24, Reputation='171'), Row(Id=25, Reputation='101'), Row(Id=26, Reputation='2952'), Row(Id=27, Reputation='101'), Row(Id=28, Reputation='101'), Row(Id=29, Reputation='101'), Row(Id=30, Reputation='101'), Row(Id=31, Reputation='1'), Row(Id=32, Reputation='101'), Row(Id=33, Reputation='101'), Row(Id=34, Reputatio

##### User expertise

In [None]:
## Divide the needed data into a user expertise variable
posts_df.createOrReplaceTempView("posts1")
posts_df.createOrReplaceTempView("posts2")
query2 = '''
SELECT posts1.OwnerUserId, collect_list(posts2.Tags) AS Tags
FROM posts1
INNER JOIN posts2 ON posts1.ParentId = posts2.Id
WHERE posts1.PostTypeId = 2
GROUP BY posts1.OwnerUserId
'''
usr_exp = spark.sql(query2)
print(usr_exp.collect())

[Row(OwnerUserId=None, Tags=['<bigdata><scalability><efficiency><performance>', '<data-mining><clustering><octave><k-means><categorical-data>', '<machine-learning><data-mining><python><classification>', '<classification><binary><svm><random-forest><logistic-regression>', '<bigdata><scalability><distributed>', '<algorithms>', '<classification>', '<machine-learning><data-mining><r><logistic-regression><gradient-descent>', '<machine-learning><python><neural-network>', '<classification><svm>', '<classification><svm>', '<machine-learning><classification><svm><accuracy><random-forest>', '<python><tools><version-control>', '<machine-learning><time-series>', '<machine-learning><statistics><career>', '<apache-hadoop><map-reduce>', '<classification><cross-validation>', '<machine-learning><online-learning>', '<dataset>', '<python><nlp><sentiment-analysis>', '<topic-model><lda>', '<bigdata><beginner>', '<bigdata><databases><binary><version-control>', '<neural-network><image-classification><preproc

In [None]:
def get_tag_diversity(tag_list: list):
    resulting_tags = set()
    for tag in tag_list:
        if "><" in tag:
            divided_tags = tag.split("><")
            resulting_tags.update(tag.strip(">").strip("<") for tag in divided_tags)
        else:
            stripped_tag = tag.strip("<>").strip()
            resulting_tags.add(stripped_tag)
    return len(resulting_tags) / 638

li = ['<algorithms>', '<definitions><parallel><distributed>', '<machine-learning><dimensionality-reduction><python>', '<feature-selection><feature-extraction><dimensionality-reduction>', '<parallel><clustering><aws>', '<efficiency><algorithms><parameter>', '<data-cleaning>', '<statistics><dataset>', '<classification><performance>', '<machine-learning><bigdata>', '<recommender-system><similarity>', '<bigdata><data-mining>', '<machine-learning><dataset><class-imbalance>', '<bigdata><statistics><efficiency><scalability>', '<classification><cross-validation>', '<machine-learning><dataset>', '<feature-selection><featurization>']

print(get_tag_diversity(li))

0.04075235109717868


In [None]:
result = list()
for row in usr_exp.collect():
  user = row["OwnerUserId"]
  usr_tags = row["Tags"]
  interest_diversity = get_tag_diversity(usr_tags)
  result.append((user, interest_diversity))


usr_diversity = spark.createDataFrame(result, ["Id", "InterestDiversity"])
print(usr_diversity.collect())

[Row(Id=None, InterestDiversity=0.21473354231974923), Row(Id=9, InterestDiversity=0.009404388714733543), Row(Id=11, InterestDiversity=0.004702194357366771), Row(Id=14, InterestDiversity=0.04075235109717868), Row(Id=17, InterestDiversity=0.012539184952978056), Row(Id=21, InterestDiversity=0.2006269592476489), Row(Id=22, InterestDiversity=0.006269592476489028), Row(Id=24, InterestDiversity=0.004702194357366771), Row(Id=26, InterestDiversity=0.03761755485893417), Row(Id=34, InterestDiversity=0.004702194357366771), Row(Id=36, InterestDiversity=0.003134796238244514), Row(Id=43, InterestDiversity=0.003134796238244514), Row(Id=51, InterestDiversity=0.004702194357366771), Row(Id=52, InterestDiversity=0.001567398119122257), Row(Id=53, InterestDiversity=0.003134796238244514), Row(Id=59, InterestDiversity=0.02664576802507837), Row(Id=62, InterestDiversity=0.009404388714733543), Row(Id=64, InterestDiversity=0.003134796238244514), Row(Id=70, InterestDiversity=0.003134796238244514), Row(Id=75, Inter

##### Create joint dataframe

In [None]:
usr_rep.createOrReplaceTempView("usr_rep")
usr_diversity.createOrReplaceTempView("usr_div")

query3 = '''
SELECT usr_div.Id, usr_rep.Reputation, usr_div.InterestDiversity
FROM usr_rep INNER JOIN usr_div ON usr_div.Id = usr_rep.Id
'''
final_df = spark.sql(query3)

print(final_df.collect())

[Row(Id=9, Reputation='1102', InterestDiversity=0.009404388714733543), Row(Id=11, Reputation='213', InterestDiversity=0.004702194357366771), Row(Id=14, Reputation='2782', InterestDiversity=0.04075235109717868), Row(Id=17, Reputation='236', InterestDiversity=0.012539184952978056), Row(Id=21, Reputation='5904', InterestDiversity=0.2006269592476489), Row(Id=22, Reputation='323', InterestDiversity=0.006269592476489028), Row(Id=24, Reputation='171', InterestDiversity=0.004702194357366771), Row(Id=26, Reputation='2952', InterestDiversity=0.03761755485893417), Row(Id=34, Reputation='173', InterestDiversity=0.004702194357366771), Row(Id=36, Reputation='325', InterestDiversity=0.003134796238244514), Row(Id=43, Reputation='101', InterestDiversity=0.003134796238244514), Row(Id=51, Reputation='176', InterestDiversity=0.004702194357366771), Row(Id=52, Reputation='146', InterestDiversity=0.001567398119122257), Row(Id=53, Reputation='404', InterestDiversity=0.003134796238244514), Row(Id=59, Reputatio

##### Correlation function

In [None]:
def compute_pearsons_r(df: "a DataFrame", col1: "name of column A", col2: "name of column B") -> float:
    df.createOrReplaceTempView("df")
    correlation_coefficient = spark.sql(
        f"SELECT corr({col1}, {col2}) AS correlation FROM df"
    ).first()["correlation"]
    return correlation_coefficient

In [None]:
print(compute_pearsons_r(final_df, "Reputation","InterestDiversity"))

0.7217677648622973


#### Do expert users have specific interests, or do they have general interests? Kindly explain your thought process and rationale behind arriving at your answer

- Expert users likely have general interests. The Pearson correlation coefficient of 0.72 between Reputation points and Interest Diversity scores suggests a strong positive linear relationship. This indicates that as users gain more reputation points (indicating expertise), they also tend to have higher interest diversity scores, implying a broad range of interests rather than specific ones.