In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

# Switch the working directory to the project folder so that relative paths
# passed to later file reads resolve against it.
project_dir = '/home/jovyan/shared/2020_06_10_bad_reviewer'
os.chdir(project_dir)

In [2]:
# Load the data views
# The same raw id column is renamed to an *_author or *_reviewer name depending on
# whose id the frame holds, so the two roles cannot be confused in later merges.
as_author = {'uva_peer_assignments_user_id': 'uva_peer_assignments_user_id_author'}
as_reviewer = {'uva_peer_assignments_user_id': 'uva_peer_assignments_user_id_reviewer'}

# Labelled submissions (JSON lines). The raw 'id' is split on '_' and only the
# second piece is kept; that piece is then used as the author id, and 'data' becomes 'text'.
labelled_txts = (
    pd.read_json('data/labelled_subs_with_posts.jsonl', orient='records', lines=True)
    .assign(id=lambda frame: frame['id'].str.split('_').str[1])
    .rename(columns={'id': 'uva_peer_assignments_user_id_author', 'data': 'text'})
)

texts_df = pd.read_csv('data/texts_with_peer_reviews.csv', index_col=[0])
# Malformed rows in this file are reported as warnings instead of aborting the read.
data_view_df = pd.read_csv(
    'data/data_view_for_peer_review_analysis.csv',
    escapechar='\\',
    on_bad_lines='warn',
    index_col=[0],
)
reviewer_df = pd.read_csv('results/reviewer_agg_stats.csv', index_col=[0]).rename(columns=as_reviewer)
cleared_submissions = pd.read_csv('bin/cleared_submissions.csv').rename(columns=as_author)
submissions_df = pd.read_csv('bin/submissions_df.csv', index_col=[0]).rename(columns=as_author)
reviews_df = pd.read_csv('bin/reviews_df.csv', index_col=[0]).rename(columns=as_reviewer)

# Get the author id into reviews_df: look the author up through the submission id.
# The left join keeps every review row, with a missing author id where no
# cleared submission matches.
submission_authors = cleared_submissions[['peer_submission_id', 'uva_peer_assignments_user_id_author']]
reviews_df = reviews_df.merge(submission_authors, on='peer_submission_id', how='left')

# Non-null count per column. In the recorded output an author id was found for
# 864,073 of the 1,084,657 rows (1,059,044 is the number of rows with non-null
# free-response text, not the total row count).
reviews_df.count()

peer_review_id                                             1084657
uva_peer_assignments_user_id_reviewer                      1084657
peer_review_created_ts                                     1084657
peer_submission_id                                         1084657
peer_assignment_review_schema_part_prompt_score             929706
peer_assignment_review_schema_part_option_score             929706
peer_assignment_review_schema_part_prompt_free_response    1084640
peer_review_part_free_response_text                        1059044
uva_peer_assignments_user_id_author                         864073
dtype: int64

In [None]:
# Find duplicate essays
print(len(labelled_txts))
# keep=False flags every submission whose text occurs more than once,
# including the first occurrence.
submissions_df['is_duplicate'] = submissions_df.duplicated(subset=['text'], keep=False)
# Drop any 'is_duplicate' column left on labelled_txts by an earlier run of this
# cell. Without this, the merge would emit 'is_duplicate_x'/'is_duplicate_y' and
# the filter below would raise KeyError when the cell is re-run.
labelled_txts = submissions_df[['is_duplicate', 'uva_peer_assignments_user_id_author']].merge(
    labelled_txts.drop(columns=['is_duplicate'], errors='ignore'),
    on='uva_peer_assignments_user_id_author',
    how='inner',
)
# Keep only rows whose submission text is not flagged as duplicated. The explicit
# copy makes the result an independent frame, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning.
# NOTE(review): if an author has several rows in submissions_df, the inner merge
# repeats that author's labelled text once per row — confirm this is intended.
labelled_txts = labelled_txts.loc[~labelled_txts['is_duplicate']].copy()
print(len(labelled_txts))

In [None]:
# Get number of words for the labelled texts
# Word count = number of whitespace-separated tokens in 'text'. The vectorised
# .str accessor replaces a row-wise apply; a missing text yields NaN instead of
# raising. .assign builds a new frame rather than writing a column into a
# previously filtered one.
labelled_txts = labelled_txts.assign(
    num_words=labelled_txts['text'].str.split().str.len()
)

# Variance is the square of the standard deviation stored in 'total_sd'.
reviewer_df['total_variance'] = reviewer_df['total_sd'] ** 2
reviewer_df

In [None]:
### Include All Reviewers

def pruneReviewers(min_variance=0, reviewers=None):
    """Select the reviewers whose total score variance is at least ``min_variance``.

    Parameters
    ----------
    min_variance : float, default 0
        Inclusive lower bound applied to the 'total_variance' column. Rows whose
        'total_variance' is NaN never pass, because a comparison with NaN is False.
    reviewers : pandas.DataFrame, optional
        Frame holding the 'uva_peer_assignments_user_id_reviewer' and
        'total_variance' columns. When omitted, the notebook-level
        ``reviewer_df`` is used.

    Returns
    -------
    pandas.DataFrame
        A new frame with the 'uva_peer_assignments_user_id_reviewer' column of
        the selected rows plus a 'good_reviewer' column that is True on every
        row. The input frame is not modified.
    """
    if reviewers is None:
        reviewers = reviewer_df
    passes_threshold = reviewers['total_variance'] >= min_variance
    # A single .loc selection followed by .assign returns a fresh frame, so no
    # column is written into a chained-indexing result.
    return (
        reviewers
        .loc[passes_threshold, ['uva_peer_assignments_user_id_reviewer']]
        .assign(good_reviewer=True)
    )

# Column names used repeatedly in this cell.
author_col = 'uva_peer_assignments_user_id_author'
reviewer_col = 'uva_peer_assignments_user_id_reviewer'
submission_col = 'peer_submission_id'
score_col = 'peer_assignment_review_schema_part_option_score'

# Attach the review rows to each labelled text; the left join keeps texts
# that have no matching review.
labelled_data = pd.merge(labelled_txts, reviews_df, on=author_col, how='left')
print(len(labelled_txts), len(set(labelled_data[author_col])))

# essay_scores holds reviewer id, submission id, author id and the summed
# option score for each (reviewer, submission, author) combination.
essay_scores = (
    labelled_data
    .groupby([reviewer_col, submission_col, author_col])
    .agg({score_col: 'sum'})
    .reset_index()
)
# Quick sanity checks on the aggregated frame.
print('checking for non-null values\n' , essay_scores.count(), sep='')
print('num of reviews:', len(essay_scores))
print('num of authors:', essay_scores[author_col].nunique())

good_reviewers = pruneReviewers()
# The right join keeps every review row; rows whose reviewer is not in
# good_reviewers get a missing 'good_reviewer' flag and are filtered out next.
essay_scores_good_reviewers = pd.merge(good_reviewers, essay_scores, on=reviewer_col, how='right')
essay_scores_good_only = essay_scores_good_reviewers.loc[
    essay_scores_good_reviewers['good_reviewer'].eq(True)
]

# Count the good reviews per submission and keep the submissions with at least two.
good_reviewers_df = (
    essay_scores_good_only
    .groupby(submission_col)
    .agg({'good_reviewer': 'sum'})
    .reset_index()
)
good_reviewers_df = (
    good_reviewers_df
    .loc[good_reviewers_df['good_reviewer'] >= 2]
    .rename(columns={'good_reviewer': 'num_reviews'})
)

# Restrict the good-reviewer scores to those qualifying submissions.
essay_scores_good_labelled = pd.merge(
    essay_scores_good_only,
    good_reviewers_df,
    on=submission_col,
    how='right',
)

# Mean score per (submission, author) over the remaining reviews.
essay_scores_final = (
    essay_scores_good_labelled
    .groupby([submission_col, author_col])
    .agg({score_col: 'mean'})
    .reset_index()
)

# Author-level table pairing the mean score with the essay word count.
scores_per_author = essay_scores_final[[author_col, score_col]].drop_duplicates()
words_per_author = labelled_txts[[author_col, 'num_words']].drop_duplicates()
corr_df = scores_per_author.merge(words_per_author, on=author_col, how='left')

# Final (text, labels) frame, where 'labels' is the mean score.
all_reviewers = (
    essay_scores_final
    .merge(labelled_txts, on=author_col)
    [['text', score_col]]
    .rename(columns={score_col: 'labels'})
)
all_reviewers