In [1]:
import sys
import os
import traceback
import pickle
import requests


sys.path.append("../")

import pandas as pd
import numpy as np

# from scripts.wiki import Wiki,Revision

In [2]:
class Wiki:
    '''
    MAIN CLASS TO store all revisions for a wiki along with editors and timestamp.
    '''
    def __init__(self,id,title, revs, all_tokens=[]):
        self.id = id
        self.title = title
        self.revisions = revs
        self.add_all_token(all_tokens)
        

           
    def add_all_token(self, all_tokens):
        
        for token in all_tokens:
            self.revisions.loc[token["o_rev_id"]].added.add(token["token_id"])
            for in_revision in token["in"]:
                self.revisions.loc[in_revision].added.add(token["token_id"])
            for out_revision in token["out"]:
                self.revisions.loc[out_revision].removed.add(token["token_id"])
                
    def create_change(self, from_rev_id, to_rev_id, to_rev_content, epsilon_size):
        try:
            from_rev = self.revisions[from_rev_id]
            to_rev = self.revisions[to_rev_id]
            from_rev.deleted(to_rev)
            to_rev.content = to_rev_content
            to_rev.inserted_continuous_pos()
            to_rev.inserted_neighbours()
            from_rev.create_change_object(to_rev)
            from_rev.append_neighbour_vec(to_rev, epsilon_size)
        except:
            print("exception occurred in calculating change object",traceback.format_exc())
            print("problem in ", to_rev_content.keys() )

In [3]:
class Revision:
    def __init__(self, id, timestamp,editor):
        self.id = id
        self.timestamp = timestamp
        self.editor = editor
        self.added = set()
        self.removed = set()   
        
    def deleted(self, to_rev):
        self.content["removed"] = pd.Series(np.isin( self.content["token_id"].values, list(to_rev.removed), assume_unique= True ))
        end_pos = np.argwhere(np.ediff1d(np.pad(self.content["removed"].astype(np.int), (1,1), mode="constant", constant_values=0)) == -1) -1 
        start_pos = np.argwhere(np.ediff1d(np.pad(self.content["removed"].astype(np.int), (1,1), mode="constant", constant_values=0)) == 1)
        start_neighbour = start_pos - 1
        end_neighbour = end_pos + 1
        self.deleted_object = pd.DataFrame(np.c_[ start_pos, end_pos, start_neighbour, end_neighbour ],
                                       columns=[ "del_start_pos", "del_end_pos", "left_neigh", "right_neigh",])
    
    def inserted_continuous_pos(self):
        self.content["added"] = pd.Series(np.isin( self.content["token_id"].values, list(self.added), assume_unique= True))
        end_pos = np.argwhere(np.ediff1d(np.pad(self.content["added"].astype(np.int), (1,1), mode="constant", constant_values=0)) == -1) -1 
        start_pos = np.argwhere(np.ediff1d(np.pad(self.content["added"].astype(np.int), (1,1), mode="constant", constant_values=0)) == 1)
        self.added_pos = np.c_[start_pos, end_pos]

    def inserted_neighbours(self):
        start_token_pos = self.added_pos[:,0] - 1
        end_token_pos = self.added_pos[:,1] + 1
        self.start_token_id = self.content["token_id"].values[start_token_pos]
        self.end_token_id = self.content["token_id"].values[end_token_pos]
    
    def create_change_object(self, to_rev):
        self.ins_left = np.argwhere(np.isin(self.content.token_id.values, to_rev.start_token_id, assume_unique= True))
        self.ins_right = np.argwhere(np.isin(self.content.token_id.values, to_rev.end_token_id, assume_unique= True))
        self.inserted_object = pd.DataFrame(np.concatenate([to_rev.added_pos, self.ins_left, self.ins_right], axis=1),
                                       columns=["ins_start_pos", "ins_end_pos", "left_neigh", "right_neigh" ])

        self.change = pd.merge(self.inserted_object, self.deleted_object,how="outer", on=["left_neigh", "right_neigh"])
        self.change.fillna(-1, inplace=True)
        
    def append_neighbour_vec(self, to_rev, epsilon_size):
        self.wiki_who_tokens = self.content.token_id.values
        del self.content
        neighbour_df = self.change.apply(find_tokens, axis=1, args=(self, to_rev, epsilon_size))
        neighbour_df.columns= ["ins_tokens", "del_tokens", "left_neigh_slice", "right_neigh_slice", "left_token", "right_token"]
        self.change_df = pd.concat([self.change, neighbour_df], sort=False, axis=1)
        



In [4]:
def find_tokens(change, revision, to_rev, epsilon_size):
    start_left = (int(change["left_neigh"]) - epsilon_size)
    if start_left <0:
        start_left = 0
    left_neigh = slice( start_left, int(change["left_neigh"]) + 1)
    
    end_right = (int(change["right_neigh"]) + epsilon_size+1)
    if end_right >= revision.wiki_who_tokens.size:
        end_right = revision.wiki_who_tokens.size - 1
    right_neigh = slice(int(change["right_neigh"]), end_right )
    if(change["ins_start_pos"]==-1):
        ins_tokens = []
    else:
        ins_slice = slice(int(change["ins_start_pos"]), int(change["ins_end_pos"]+1) )
        ins_tokens = to_rev.content.token_id.values[ins_slice]
    if(change["del_start_pos"] == -1):
        del_tokens = []
    else:
        del_slice = slice(int(change["del_start_pos"]), int(change["del_end_pos"]+1) )
        del_tokens = revision.wiki_who_tokens[del_slice]
    left_token = revision.wiki_who_tokens[left_neigh]
    right_token = revision.wiki_who_tokens[right_neigh]
    return pd.Series([tuple(ins_tokens), tuple(del_tokens), left_neigh, right_neigh, tuple(left_token), tuple(right_token)])

In [5]:
baseurl = "https://api.wikiwho.net/en/api/v1.0.0-beta/"
article_name = "John_Logie_Baird"
filename = article_name + ".h5"
content_dir = "../data/content/"
change_object_dir =  "../data/change objects/"
filepath = os.path.join(content_dir, filename)

epsilon_size = 30

In [6]:
len_file = article_name + "_rev_len.h5"
len_file_path = os.path.join(content_dir, len_file)

with pd.HDFStore(filepath, 'r') as store:
    #retrieving all rev list and change object from file
    rev_list = store.get("rev_list")["id"].values.tolist()
    keys = ["r" +  str(rev) for rev in rev_list]
    rev_len_list = [store.get(key).shape[0] for key in keys]
rev_len_df = pd.DataFrame({"rev_id":rev_list[:-1], "length": rev_len_list[:-1]})

rev_len_df.to_hdf(len_file_path, "rev_len")

In [None]:
# %%time
# with pd.HDFStore(filepath, 'r') as store:
#     #retrieving all rev list and change object from file
#     rev_list = store.get("rev_list")
#     all_rev = store.get("all_tokens")
#     all_tokens = all_rev.to_dict(orient="records")
#     #making revision objects
#     revs = rev_list.apply(lambda rev: Revision(rev["id"],rev["timestamp"], rev["editor"]),axis=1)
#     revs.index = rev_list.id
#     from_rev_id = revs.index[0]
    
#     wiki = Wiki(2345, content, revs, all_tokens)
#     wiki.revisions.iloc[0].content = store["r"+str(from_rev_id)] 
#     for to_rev_id in list(revs.index[1:]):
#         key="r"+str(to_rev_id)
#         to_rev_content = store[key]
#         wiki
#         wiki.create_change(from_rev_id, to_rev_id, to_rev_content, epsilon_size)
#         from_rev_id = to_rev_id

In [None]:
# save_filepath = os.path.join(change_object_dir, content+".pkl")
# with open(save_filepath, "wb") as file:
#     pickle.dump(wiki, file)

### saving change object for all the articles in the list

In [7]:
def create_change_object(article_name, content_dir = "../data/content/", 
                            change_object_dir =  "../data/change objects/", epsilon_size=30, save=False):
    
    content_filepath = os.path.join(content_dir, article_name+".h5")
    change_object_filepath = os.path.join(change_object_dir, article_name+".pkl")
    
    with pd.HDFStore(content_filepath, 'r') as store:
        #retrieving all rev list and change object from file
        rev_list = store.get("rev_list")
        all_rev = store.get("all_tokens")
        all_tokens = all_rev.to_dict(orient="records")
        
        #making revision objects
        revs = rev_list.apply(lambda rev: Revision(rev["id"],rev["timestamp"], rev["editor"]),axis=1)
        revs.index = rev_list.id
        
        # Getting first revision object and adding content ot it
        from_rev_id = revs.index[0]
        wiki = Wiki(2345, article_name, revs, all_tokens)
        wiki.revisions.iloc[0].content = store["r"+str(from_rev_id)] 
        # adding content to all other revision and finding change object between them.
        
        for to_rev_id in list(revs.index[1:]):
            key="r"+str(to_rev_id)
            to_rev_content = store[key]
            wiki.create_change(from_rev_id, to_rev_id, to_rev_content, epsilon_size)
            from_rev_id = to_rev_id
         
    if save:
        with open(change_object_filepath, "wb") as file:
            pickle.dump(wiki, file)
        
    return wiki

    

In [None]:
article_series=pd.read_csv("../conflicted_article.csv")["articles"]

In [8]:
%%time
wiki = create_change_object(article_name, save=False)

CPU times: user 59.4 s, sys: 4.95 s, total: 1min 4s
Wall time: 1min 23s


In [None]:
# for article in article_series[19:]:
#     print(article)
#     create_change_object(article)


### Saving change_object as dataframe

In [9]:
change_objects = []
wiki.revisions.iloc[:-1].apply(lambda revision: change_objects.append(revision.change_df))
# change_index = [ rev.id for rev in  wiki.revisions[1:].tolist()]
# change_df = pd.concat(change_objects, sort=False, keys=change_index, axis=)


timestamp_s = pd.to_datetime([ rev.timestamp for rev in  wiki.revisions.values.ravel().tolist()])
time_gap = pd.to_timedelta(timestamp_s[1:]-timestamp_s[:-1])

rev_ids = [ rev.id for rev in  wiki.revisions.tolist()]
from_rev_ids = rev_ids[:-1]
to_rev_ids= rev_ids[1:]

editor_s = [ rev.editor for rev in  wiki.revisions.tolist()]

index = list(zip(*[from_rev_ids, to_rev_ids, timestamp_s.tolist()[1:], time_gap, editor_s[1:]]))
change_df = pd.concat(change_objects, sort=False, keys=index, names=["from revision id", "to revision id", "timestamp", "timegap", "editor"])

In [10]:
# %%time
# change_object_dir =  "../data/change objects/"
# change_dataframe_path = os.path.join(change_object_dir, article_name+"_change.pkl")
# a=change_df.to_pickle(change_dataframe_path)

In [11]:
# %%time
# change_object_dir =  "../data/change objects/"
# change_dataframe_path = os.path.join(change_object_dir, article_name+"_change.pkl")
# a=pd.read_pickle(change_dataframe_path)

In [12]:
change_dataframe_path = os.path.join(change_object_dir, article_name+"_change.h5")
change_df.to_hdf(change_dataframe_path, key="data")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['ins_tokens', 'del_tokens', 'left_neigh_slice', 'right_neigh_slice', 'left_token', 'right_token']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [13]:
change_df.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,ins_start_pos,ins_end_pos,left_neigh,right_neigh,del_start_pos,del_end_pos,ins_tokens,del_tokens,left_neigh_slice,right_neigh_slice,left_token,right_token
from revision id,to revision id,timestamp,timegap,editor,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
203693,203699,2002-09-08 14:05:32+00:00,194 days 22:14:17,3646,0,10.0,30.0,9,10,-1.0,-1.0,"(41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 5...",(),"slice(0, 10, None)","slice(10, 41, None)","(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8)","(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20..."
203693,203699,2002-09-08 14:05:32+00:00,194 days 22:14:17,3646,1,32.0,32.0,10,11,-1.0,-1.0,"(62,)",(),"slice(0, 11, None)","slice(11, 42, None)","(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)","(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 2..."
203693,203699,2002-09-08 14:05:32+00:00,194 days 22:14:17,3646,2,34.0,34.0,11,12,-1.0,-1.0,"(63,)",(),"slice(0, 12, None)","slice(12, 42, None)","(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)","(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 2..."
203693,203699,2002-09-08 14:05:32+00:00,194 days 22:14:17,3646,3,50.0,55.0,26,32,27.0,31.0,"(64, 65, 66, 67, 68, 69)","(26, 27, 28, 29, 30)","slice(0, 27, None)","slice(32, 42, None)","(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,...","(31, 32, 33, 34, 35, 36, 37, 38, 39, 40)"
203693,203699,2002-09-08 14:05:32+00:00,194 days 22:14:17,3646,4,57.0,73.0,32,36,33.0,35.0,"(70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","(32, 33, 34)","slice(2, 33, None)","slice(36, 42, None)","(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","(35, 36, 37, 38, 39, 40)"
203693,203699,2002-09-08 14:05:32+00:00,194 days 22:14:17,3646,5,-1.0,-1.0,38,42,39.0,41.0,(),"(38, 39, 40)","slice(8, 39, None)","slice(42, 42, None)","(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ...",()
203699,368628,2002-09-08 14:09:31+00:00,0 days 00:03:59,3646,0,15.0,15.0,14,16,15.0,15.0,"(87,)","(46,)","slice(0, 15, None)","slice(16, 47, None)","(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 41, 42, 43, 44...","(47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 5..."
368628,590892,2002-10-19 09:44:19+00:00,40 days 19:34:48,0|217.168.172.202,0,65.0,66.0,64,65,-1.0,-1.0,"(88, 89)",(),"slice(34, 65, None)","slice(65, 77, None)","(63, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 2...","(78, 79, 80, 81, 82, 83, 84, 85, 86, 35, 36, 37)"
368628,590892,2002-10-19 09:44:19+00:00,40 days 19:34:48,0|217.168.172.202,1,71.0,72.0,68,69,-1.0,-1.0,"(90, 91)",(),"slice(38, 69, None)","slice(69, 77, None)","(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 2...","(82, 83, 84, 85, 86, 35, 36, 37)"
590892,638549,2003-01-16 13:19:38+00:00,89 days 03:35:19,3295,0,59.0,59.0,58,60,59.0,59.0,"(92,)","(72,)","slice(28, 59, None)","slice(60, 81, None)","(59, 60, 61, 9, 62, 10, 63, 11, 12, 13, 14, 15...","(73, 74, 75, 76, 77, 88, 89, 78, 79, 80, 81, 9..."
