In [1]:
import json
import ijson
import pandas as pd
from collections import defaultdict


data_path = 'data/events.json'

# Only these event types are kept for the analysis.
wanted_events = ('problem_passed', 'problem_run')

# Stream the top-level JSON array one element at a time so the whole file is
# never held in memory. The file is opened in binary mode: ijson parses bytes
# itself, whereas a text-mode handle would decode using the platform's default
# encoding and makes ijson emit a deprecation warning.
with open(data_path, 'rb') as f:
    filtered_entries = [
        obj
        for obj in ijson.items(f, 'item')
        # Skip array elements that are not JSON objects instead of failing on them.
        if isinstance(obj, dict) and obj.get('event_name') in wanted_events
    ]

# Convert the filtered entries into a DataFrame
df = pd.DataFrame(filtered_entries)

In [2]:
import json
import re

def extract_challenge_problem(event_data_str):
    """Extract the challenge and problem path segments from an event payload.

    The payload is decoded as JSON and its ``"url"`` value is searched for a
    ``learn/<challenge>/<problem>`` fragment.

    Args:
        event_data_str: JSON-encoded text expected to decode to an object
            holding a ``"url"`` key.

    Returns:
        A ``(challenge, problem)`` tuple. An element is ``None`` when the
        corresponding segment is not present in the URL. Both are ``None``
        when the payload is not a string, is not valid JSON, does not decode
        to an object, or carries a non-string ``"url"``.
    """
    try:
        event_data = json.loads(event_data_str)
        url = event_data.get("url", "")
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        # TypeError: non-string input, e.g. None or NaN from a missing cell.
        # AttributeError: valid JSON that is not an object (no .get method).
        return None, None

    if not isinstance(url, str):
        # e.g. {"url": null}; re.search would raise on a non-string.
        return None, None

    # Extract using regex
    challenge_match = re.search(r'learn\/([^\/]+)', url)
    problem_match = re.search(r'learn\/[^\/]+\/([^\/]+)', url)

    challenge = challenge_match.group(1) if challenge_match else None
    problem = problem_match.group(1) if problem_match else None

    return challenge, problem

# Run the extractor over every event payload, then split the resulting
# (challenge, problem) pairs into two parallel sequences.
extracted_pairs = df['event_data'].apply(extract_challenge_problem)
challenge_values, problem_values = zip(*extracted_pairs)
df['challenge'] = challenge_values
df['problem'] = problem_values

# Preview the payload next to the two derived columns.
preview_columns = ['event_data', 'challenge', 'problem']
print(df[preview_columns].head())


# Persist the enriched frame as CSV, leaving out the row index.
df.to_csv('data/processed_data.csv', index=False)


                                          event_data               challenge  \
0  {"problem_status":null,"url":"https:\/\/grokle...  challenge-newbies-2018   
1  {"problem_status":0,"url":"https:\/\/groklearn...  challenge-newbies-2018   
2  {"problem_status":null,"url":"https:\/\/grokle...  challenge-newbies-2018   
3  {"problem_status":1,"url":"https:\/\/groklearn...  challenge-newbies-2018   
4  {"problem_status":1,"url":"https:\/\/groklearn...  challenge-newbies-2018   

  problem  
0    w1p2  
1    w1p2  
2    w1p2  
3    w1p2  
4    w1p2  
