## Converting XML to Light Dump format

This noteboook contains code that converts XML to light dump. 

In [9]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

In [7]:
content = []
with open("../data/raw/testdata/enwiki-20200101-page-meta-history2.xml-ptest", encoding = 'utf8') as file:
    
    content = file.readlines()
    content = "".join(content)
    soup = BeautifulSoup(content, "xml")


In [8]:
pages = soup.findAll("page")

In [9]:
data = {}
for page in pages:
    title = page.title.text
    revisions = page.findAll("revision")
    print(title)
    
    for revision in revisions:
        r_id = revision.id.text 
        time = revision.timestamp.text
        try:
            username = revision.contributor.username.text
        except: 
            username = revision.contributor.ip.text
        text = revision.format.next_sibling.next_sibling.text
        if title in data:
            data[title].append([title, r_id, time, username, text])
        else:
            data[title] = [[title, r_id, time, username, text]]

data['Emma of Normandy'][0]

Emma of Normandy
Geosynchronous orbit


['Emma of Normandy',
 '15604',
 '2002-02-21T15:11:49Z',
 'David Parker',
 "[[Emma]] (c.982-1052), daughter of [[Robert I, Duke of Normandy|Robert I]], , duke of [[Normandy]], was twice queen of [[England]], by marriage first (1002-1016) to king [[Ethelred the Unready]] and then (1017-1035) to [[Canute]], king also of [[Denmark]] and [[Norway]]. \n\nUpon the Danish invasion of England in 1013, Emma took her sons by Ethelred, Alfred and [[Edward the Confessor|Edward]] to Normandy, where they remained upon her return to England to marry Canute, now king of England following the death of Ethelred, with whom he had agreed to divide the realm. \n\nFollowing Canute's death, Alfred and Edward returned in 1036 to overthrow Canute's illegitimate son [[Harold Harefoot]], who had established himself as ruler in the absence of [[Harthacanute]], son of Canute and Emma. Alfred was captured and killed, while Edward escaped to Normandy, followed by his mother. \n\nThe death of Harold (1040) and the acc

In [10]:
dframes = []
for page in data:

    df = pd.DataFrame(data[page], columns = ['title', 'id', 'time', 'username', 'text'])
    
    hist = [] #history of text
    version = [] #edit version
    username = []
    revert = [] #0 or 1
    curr = 1 #to keep track of version
    
    for idx, row in df.iterrows():
        if row.text not in hist: # not a revert
            hist.append(row.text)
            version.append(curr)
            username.append(row.username)
            revert.append('0')
            curr += 1
        else: #is revert
            temp = hist.index(row.text)
            version.append(version[temp])
            username.append(row.username)
            
            #if self revert
            if row.username == username[version[temp]]:
                revert.append('0')
            else:
                revert.append('1')
            
    
    df['version'] = version
    df['revert'] = revert
    dframes.append(df)

for df in dframes:
    display(df.iloc[64:69])

Unnamed: 0,title,id,time,username,text,version,revert
64,Emma of Normandy,98345559,2007-01-04T04:47:49Z,Ekotkie,[[Image:British.Library.MS.Add.33241.jpg|right...,65,0
65,Emma of Normandy,102514893,2007-01-22T21:18:48Z,DBD,[[Image:British.Library.MS.Add.33241.jpg|right...,66,0
66,Emma of Normandy,107675506,2007-02-12T22:46:12Z,74.241.147.48,ǖ[[Image:British.Library.MS.Add.33241.jpg|righ...,67,0
67,Emma of Normandy,107680159,2007-02-12T23:05:20Z,Henrygb,[[Image:British.Library.MS.Add.33241.jpg|right...,66,1
68,Emma of Normandy,117011387,2007-03-22T12:14:54Z,Dearagon,[[Image:British.Library.MS.Add.33241.jpg|right...,68,0


Unnamed: 0,title,id,time,username,text,version,revert
64,Geosynchronous orbit,12625020,2005-04-10T19:14:29Z,Urhixidur,A '''geosynchronous orbit''' is a geocentric [...,61,0
65,Geosynchronous orbit,12726241,2005-04-21T19:15:08Z,Hackwrench,A '''geosynchronous orbit''' is a geocentric [...,62,0
66,Geosynchronous orbit,12854697,2005-04-23T22:29:46Z,217.86.49.66,A '''geosynchronous orbit''' is a geocentric [...,63,0
67,Geosynchronous orbit,12944263,2005-04-26T21:53:50Z,209.66.200.61,A '''geosynchronous orbit''' is a geocentric [...,64,0
68,Geosynchronous orbit,12944311,2005-04-28T21:29:46Z,209.42.180.197,A '''geosynchronous orbit''' is a geocentric [...,65,0


In [11]:
light_dump = ''
for df in dframes:
    title = df.title[0]
    light_dump = light_dump + title + '\n'
    for idx, row in df.iterrows():
        line = '^^^_' + row.time + ' ' + row.revert + ' ' + str(row.version) + ' ' + row.username
        light_dump = light_dump + line + '\n'



light_dump[:500]

'Emma of Normandy\n^^^_2002-02-21T15:11:49Z 0 1 David Parker\n^^^_2002-02-21T15:14:16Z 0 2 Vicki Rosenzweig\n^^^_2002-02-21T16:06:19Z 0 3 David Parker\n^^^_2002-02-24T08:19:24Z 0 4 David Parker\n^^^_2002-02-25T15:51:15Z 0 5 David Parker\n^^^_2002-03-22T11:35:18Z 0 6 Enchanter\n^^^_2002-06-20T19:28:35Z 0 7 Zoe\n^^^_2003-03-11T06:21:04Z 0 8 203.102.233.212\n^^^_2004-03-30T13:19:17Z 0 9 62.103.214.152\n^^^_2004-07-19T22:42:16Z 0 10 Muriel Gottrop~enwiki\n^^^_2004-08-01T00:06:38Z 0 11 Henrygb\n^^^_2004-08-19T16:'

In [13]:
import os
outpath = 'data/unzipped'
if not os.path.exists(outpath):
    os.makedirs(outpath)
for df in dframes:
    title = df.title[0]
    light_dump = light_dump + title + '\n'
    for idx, row in df.iterrows():
        line = '^^^_' + row.time + ' ' + row.revert + ' ' + str(row.version) + ' ' + row.username
        light_dump = light_dump + line + '\n'
outfile = os.path.join(outpath, 'light_dump.txt')
with open(outfile, 'w') as f:
    f.write(light_dump)
repo = 'XML Converted to light dump at ' + outfile
print(repo)

XML Converted to light dump at data/unzipped/light_dump.txt


In [10]:
light_dump = []
naruto = False
with open("../data/raw/testdata/en_wiki_test.txt", encoding = 'utf8') as file:
    for line in file:
#         print(line)
        if (line[0]!= "^") & naruto:
            break
        
        if line.strip() == 'Emma_of_Normandy':
            naruto = True
            continue
            
        if naruto:
            light_dump.append(line)
            
light_dump = [x[4:-1].split(' ') for x in light_dump]
light_dump.reverse()

In [11]:
light_dump[:5]

[['2002-02-21T15:11:49Z', '0', '1', 'David_Parker'],
 ['2002-02-21T15:14:16Z', '0', '2', 'Vicki_Rosenzweig'],
 ['2002-02-21T16:06:19Z', '0', '3', 'David_Parker'],
 ['2002-02-24T08:19:24Z', '0', '4', 'David_Parker'],
 ['2002-02-25T15:51:15Z', '0', '5', 'David_Parker']]