# Analyze revisions from 1 bz2

In [1]:
import json
import random

def load_random_jsonl_lines(path, n_lines):
    reservoir = []
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i < n_lines:
                reservoir.append(line)
            else:
                # Randomly replace elements with decreasing probability
                j = random.randint(0, i)
                if j < n_lines:
                    reservoir[j] = line
    return [json.loads(line) for line in reservoir]

In [21]:
import pandas as pd

file_path = "../data/wikidata_dumps_20250601/snapshots/wikidatawiki-20250601-pages-meta-history1-p367p411_snapshot.jsonl"

sample = load_random_jsonl_lines(file_path, 1000)

# Suppose sample is a list of dicts
def keep_snapshot_nested(data):
    for item in data:
        # Extract snapshot dict
        snapshot = item.pop('snapshot', None)
        # Flatten the rest
        flat_item = pd.json_normalize(item).iloc[0].to_dict()
        # Put snapshot back as nested dict
        flat_item['snapshot'] = snapshot
        yield flat_item

flat_data = list(keep_snapshot_nested(sample))

df = pd.DataFrame(flat_data)

print(df.head()[0:1])

   revision_id             timestamp    user_id  \
0   1263752288  2020-08-23T12:31:28Z  3077762.0   

                                             comment  \
0  /* wbsetclaim-create:2||1 */ [[Property:P1549]...   

                                            snapshot entity_id  
0  {'type': 'item', 'id': 'Q239', 'labels': {'en'...       NaN  


In [None]:
# Look for reverted edits to see how the items look removed
# check comments that mention "restore" to see if this are the ones that restore an entity that was completely removed
restore_comments = df[df['comment'].str.contains('create', case=False, na=False)]
restore_comments.head()

Unnamed: 0,revision_id,timestamp,user_id,comment,snapshot,entity_id
23,48870819,2013-06-01T06:46:49Z,53779.0,Reverted edits by [[Special:Contributions/Haza...,"{'type': 'item', 'id': 'Q237', 'labels': {'en'...",Q237
100,1002456073,2019-08-25T20:58:02Z,268661.0,Reverted edits by [[Special:Contributions/191....,"{'type': 'item', 'id': 'Q255', 'labels': {'en'...",
129,943026682,2019-05-16T14:13:00Z,3052725.0,/* undo:0||941925935|Jaijal */ Revert Wordlift...,"{'type': 'item', 'id': 'Q225', 'labels': {'en'...",Q225
156,18085362,2013-04-01T13:04:46Z,5138.0,Reverted edits by [[Special:Contributions/Serg...,"{'type': 'item', 'id': 'Q240', 'labels': {'en'...",
187,412895380,2016-12-01T12:55:59Z,8831.0,Reverted edits by [[Special:Contributions/190....,"{'type': 'item', 'id': 'Q254', 'labels': {'en'...",Q254


In [24]:
from datetime import datetime

def find_previous_revision_in_large_file(file_path, target_entity_id, target_revision_id):
    relevant_revisions = []
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            # Check if entity_id matches
            if data.get('entity_id') == target_entity_id:
                relevant_revisions.append(data)
    
    # If no revisions found for the entity
    if not relevant_revisions:
        print("No revisions found for entity", target_entity_id)
        return None
    
    # Convert to DataFrame to sort
    df = pd.DataFrame(relevant_revisions)
    
    # Make sure timestamps are datetime for sorting
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values('timestamp')
    
    # Find the index of the target revision
    target_idx = df.index[df['revision_id'] == target_revision_id].tolist()
    if not target_idx:
        print("Target revision not found")
        return None
    target_idx = target_idx[0]
    
    # Get all revisions before target
    prev_revs = df.loc[:target_idx]
    if prev_revs.empty:
        print("No previous revision found")
        return None
    
    # Return the last revision before target
    prev_revisions = prev_revs.tail(15).to_dict(orient='records')

    for rev in prev_revisions:
        if isinstance(rev.get("timestamp"), pd.Timestamp):
            rev["timestamp"] = rev["timestamp"].isoformat()

    with open("prev_revisions.json", "w", encoding="utf-8") as f:
        json.dump(prev_revisions, f, ensure_ascii=False, indent=2)
    return prev_revisions

# Example usage:
target_entity_id = 'Q225'
target_revision_id = 943026682


prev_rev = find_previous_revision_in_large_file(file_path, target_entity_id, target_revision_id)

print(prev_rev)

[{'revision_id': 939322254, 'entity_id': 'Q225', 'timestamp': '2019-05-11T13:59:41+00:00', 'user_id': 3030127.0, 'comment': '/* wbsetqualifier-add:1| */ [[Property:P585]]: 2012, #quickstatements; [[:toollabs:quickstatements/#/batch/12629|batch #12629]] by [[User:813gan|]]', 'snapshot': {'type': 'item', 'id': 'Q225', 'labels': {'en': {'language': 'en', 'value': 'Bosnia and Herzegovina'}}, 'descriptions': {'en': {'language': 'en', 'value': 'republic in Southeast Europe'}}, 'claims': {'P2924': [{'mainsnak': {'snaktype': 'value', 'property': 'P2924', 'hash': '9b2c11b88806538139da4252cce0f1116133bfa2', 'datavalue': {'value': '4095034', 'type': 'string'}}, 'type': 'statement', 'id': 'Q225$feb9efce-4b3a-d745-c702-1ac4063c60ce', 'rank': 'normal'}], 'P1082': [{'mainsnak': {'snaktype': 'value', 'property': 'P1082', 'hash': '8de2cddac382997e3596ce2b52857deafab84b77', 'datavalue': {'value': {'amount': '+3531159', 'unit': '1'}, 'type': 'quantity'}}, 'type': 'statement', 'qualifiers': {'P585': [{'sn