In [1]:
import os
import re
import glob
import pickle
import pandas as pd

In [3]:
# Get all posts within the data directory
posts = glob.glob('data/posts/*.p')
posts

['data/posts/dupreezd@live.unc.edu_posts_jz8ejj1lawb5st.p',
 'data/posts/dupreezd@live.unc.edu_posts_jqnyuvgzug4p3.p',
 'data/posts/lexokan@live.unc.edu_posts_j5wwaj87hvu6af.p',
 'data/posts/kimia@live.unc.edu_posts_iqv0bsb3p2i3ch.p',
 'data/posts/dupreezd@live.unc.edu_posts_jkws0l0gvcr7it.p']

In [10]:
def validate_instructor_counts(df):
    '''
    Confirm that a post is either tagged as `instructor` or `student` but not both.
    '''
    is_instructor_counts = df['is_instructor'].value_counts()
    is_student_counts = df['is_student'].value_counts()

#     assert is_student_counts[0] == is_instructor_counts[1]
#     assert is_student_counts[1] == is_instructor_counts[0]
    
def num_nested_dicts(d: dict, column: str):
    '''
    Fuction that will send the number of nested dictionaries with a specified key back to the caller.
    Used with len(list(.)) later on in the data pipeline.
    '''
    if column in d:
        yield d['created']
    for k in d:
        if isinstance(d[k], list) and k == 'column':
            for i in d[k]:
                for j in num_nested_dicts(i):
                    yield j

# Iterate over all posts within a class
for fp in posts:
    # Load each post into a DataFrame and store its networkid
    df = pd.DataFrame(pickle.load(open(fp, "rb")))
    network_id = re.search("posts_(.*).p", fp).group(1)
    
    # Compute different metrics about the class
    df['created'] = pd.to_datetime(df['created'])
    df['num_revisions'] = df['history'].apply(lambda x: len(x))
    df['subject'] = df['history'].apply(lambda x: x[0]['subject'])
    df['is_student'] = df['tags'].apply(lambda x: 'student' in x)
    df['is_instructor'] = df['tags'].apply(lambda x: 'instructor-note' in x)
    df['is_announcement'] = df['config'].apply(lambda x: 1 if 'is_announcement' in x else 0)
    df['num_children'] = df['children'].apply(lambda x: len(list(num_nested_dicts(x[0], 'children'))) if len(x) > 0 else 0)

    # Remove HTML from text column
    df['text'] = df['history'].apply(lambda x: re.sub('<[^<]+?>|\n', ' ', x[0]['content']))
    
    validate_instructor_counts(df)
    
    # Reorder the columns
    df = df[['id', 'created', 'type', 'folders', 'tags', 'is_announcement', 'history', 'children', 'tag_good', 'is_student', 'no_answer', 'num_children', 'num_favorites', 'num_revisions', 'unique_views', 'subject','text']]
    
    # Pickle the transformed DataFrame and save
    with open(f"data/dataframes/{fp[11:]}_dataframe_{network_id}.p", 'wb') as f:
            pickle.dump(df, f)

dupreezd@live.unc.edu_posts_jz8ejj1lawb5st.p
dupreezd@live.unc.edu_posts_jqnyuvgzug4p3.p
lexokan@live.unc.edu_posts_j5wwaj87hvu6af.p
kimia@live.unc.edu_posts_iqv0bsb3p2i3ch.p
dupreezd@live.unc.edu_posts_jkws0l0gvcr7it.p


In [None]:
# Remap to have the correct email in front of each 