In [6]:
import sys
import os
import traceback
import pickle
import requests


sys.path.append("../")

import pandas as pd
import numpy as np

from scripts.wiki import Wiki,Revision

In [7]:
class Wiki:
    '''
    Holds every revision of one wiki article and drives the computation of
    change objects between consecutive revisions.

    Attributes:
        id: identifier given by the caller.
        title: article title given by the caller.
        revisions: pandas Series of revision objects indexed by revision id.
    '''
    def __init__(self,id,title, revs, all_tokens=None):
        '''
        Args:
            id: identifier of the wiki article.
            title: title of the wiki article.
            revs: Series of revision objects indexed by revision id; each
                object must expose ``added`` and ``removed`` sets.
            all_tokens: optional iterable of token records (see
                ``add_all_token``). ``None`` means "no tokens".
        '''
        self.id = id
        self.title = title
        self.revisions = revs
        # A None default replaces the shared mutable `[]` default.
        self.add_all_token(all_tokens if all_tokens is not None else [])

    def add_all_token(self, all_tokens):
        '''
        Record, per revision, which token ids were added and which removed.

        Each record must provide ``token_id``, ``o_rev_id`` (revision where the
        token first appears), ``in`` (revisions that re-insert it) and ``out``
        (revisions that remove it).
        '''
        for token in all_tokens:
            token_id = token["token_id"]
            self.revisions.loc[token["o_rev_id"]].added.add(token_id)
            for in_revision in token["in"]:
                self.revisions.loc[in_revision].added.add(token_id)
            for out_revision in token["out"]:
                self.revisions.loc[out_revision].removed.add(token_id)

    def create_change(self, from_rev_id, to_rev_id, to_rev_content, epsilon_size):
        '''
        Compute the change object between two revisions and store it on the
        "from" revision.

        Args:
            from_rev_id: id of the earlier revision; its ``content`` must
                already be set.
            to_rev_id: id of the later revision.
            to_rev_content: token table of the later revision; it is attached
                to that revision as ``content``.
            epsilon_size: number of context tokens passed on to
                ``append_neighbour_vec``.

        Ordinary errors are reported on stdout and swallowed so a batch run can
        continue with the next revision pair. KeyboardInterrupt and SystemExit
        are NOT swallowed, so an interrupted kernel really stops.
        '''
        try:
            # Label-based lookup, consistent with add_all_token.
            from_rev = self.revisions.loc[from_rev_id]
            to_rev = self.revisions.loc[to_rev_id]
            from_rev.deleted(to_rev)
            to_rev.content = to_rev_content
            to_rev.inserted_continuous_pos()
            to_rev.inserted_neighbours()
            from_rev.create_change_object(to_rev)
            from_rev.append_neighbour_vec(to_rev, epsilon_size)
        except Exception:
            print("exception occurred in calculating change object",traceback.format_exc())
            print("problem in ", to_rev_content.keys() )

In [8]:
class Revision:
    """
    One revision of a wiki article.

    Attributes set in ``__init__``: ``id``, ``timestamp``, ``editor`` and the
    sets ``added`` / ``removed`` of token ids inserted / deleted by this
    revision. ``content`` (a DataFrame with at least ``token_id`` and ``str``
    columns) is attached from outside before the methods below are called.
    """
    def __init__(self, id, timestamp,editor):
        self.id = id
        self.timestamp = timestamp
        self.editor = editor
        self.added = set()
        self.removed = set()

    @staticmethod
    def _run_bounds(mask):
        """
        Return ``(start_pos, end_pos)`` of every run of True values in ``mask``.

        Both results are integer arrays of shape (k, 1); end positions are
        inclusive.
        """
        # Pad with zeros so runs touching either end still produce an edge.
        # Builtin `int` replaces `np.int`, which newer NumPy no longer provides.
        flags = np.pad(np.asarray(mask).astype(int), (1, 1), mode="constant", constant_values=0)
        steps = np.ediff1d(flags)
        start_pos = np.argwhere(steps == 1)
        end_pos = np.argwhere(steps == -1) - 1
        return start_pos, end_pos

    def deleted(self, to_rev):
        """
        Locate the contiguous token runs of this revision that ``to_rev`` removes.

        Adds a boolean ``removed`` column to ``self.content`` and stores
        ``self.deleted_object``, one row per run, with the run's start/end
        position and the positions just before and after it.
        """
        removed_mask = np.isin(self.content["token_id"].values, list(to_rev.removed), assume_unique= True)
        # Assign the array positionally so the result does not depend on index alignment.
        self.content["removed"] = removed_mask
        start_pos, end_pos = self._run_bounds(removed_mask)
        start_neighbour = start_pos - 1
        end_neighbour = end_pos + 1
        self.deleted_object = pd.DataFrame(np.c_[ start_pos, end_pos, start_neighbour, end_neighbour ],
                                       columns=[ "del_start_pos", "del_end_pos", "left_neigh", "right_neigh",])

    def inserted_continuous_pos(self):
        """
        Locate the contiguous token runs that this revision added.

        Adds a boolean ``added`` column to ``self.content`` and stores
        ``self.added_pos`` as a (k, 2) array of [start, end] positions.
        """
        added_mask = np.isin(self.content["token_id"].values, list(self.added), assume_unique= True)
        self.content["added"] = added_mask
        start_pos, end_pos = self._run_bounds(added_mask)
        self.added_pos = np.c_[start_pos, end_pos]

    def inserted_neighbours(self):
        """
        Store the token ids directly left and right of each added run as
        ``self.start_token_id`` and ``self.end_token_id``.
        """
        # NOTE(review): assumes content is wrapped in start/end sentinel tokens,
        # so pos-1 / pos+1 never fall outside the table; confirm for real data.
        start_token_pos = self.added_pos[:,0] - 1
        end_token_pos = self.added_pos[:,1] + 1
        token_ids = self.content["token_id"].values
        self.start_token_id = token_ids[start_token_pos]
        self.end_token_id = token_ids[end_token_pos]

    def create_change_object(self, to_rev):
        """
        Pair the insertions of ``to_rev`` with the deletions found by ``deleted``.

        The neighbours of each inserted run are looked up by token id in this
        revision's content. Insertions and deletions sharing the same
        (left_neigh, right_neigh) become one row of ``self.change``; unmatched
        sides are filled with -1.
        """
        own_token_ids = self.content.token_id.values
        self.ins_left = np.argwhere(np.isin(own_token_ids, to_rev.start_token_id, assume_unique= True))
        self.ins_right = np.argwhere(np.isin(own_token_ids, to_rev.end_token_id, assume_unique= True))
        self.inserted_object = pd.DataFrame(np.concatenate([to_rev.added_pos, self.ins_left, self.ins_right], axis=1),
                                       columns=["ins_start_pos", "ins_end_pos", "left_neigh", "right_neigh" ])

        merged = pd.merge(self.inserted_object, self.deleted_object,how="outer", on=["left_neigh", "right_neigh"])
        self.change = merged.fillna(-1)

    def append_neighbour_vec(self, to_rev, epsilon_size):
        """
        Extend every row of ``self.change`` with the inserted, deleted and
        context tokens and store the result as ``self.change_df``.

        ``self.content`` is replaced by the bare token-string array
        ``self.content_str_vec`` and then deleted.
        """
        neighbour_columns = ["ins_tokens", "del_tokens", "left_neigh_slice", "right_neigh_slice", "left_token", "right_token"]
        self.content_str_vec = self.content.str.values
        del self.content
        if self.change.empty:
            # Nothing changed between the two revisions: apply() on an empty
            # frame would not yield the six expected columns, so build them.
            neighbour_df = pd.DataFrame(columns=neighbour_columns, index=self.change.index)
        else:
            neighbour_df = self.change.apply(find_tokens, axis=1, args=(self, to_rev, epsilon_size))
            neighbour_df.columns = neighbour_columns
        self.change_df = pd.concat([self.change, neighbour_df], sort=False, axis=1)
        



In [9]:
def find_tokens(change, revision, to_rev, epsilon_size):
    """
    Collect the tokens involved in one change row plus their surrounding context.

    Args:
        change: one row of a revision's ``change`` table with the keys
            ``ins_start_pos``, ``ins_end_pos``, ``del_start_pos``,
            ``del_end_pos``, ``left_neigh`` and ``right_neigh``
            (-1 marks a missing insertion or deletion).
        revision: the "from" revision; must expose ``content_str_vec``.
        to_rev: the "to" revision; must expose ``content`` with a ``str`` column.
        epsilon_size: how many extra context tokens to take on each side.

    Returns:
        pd.Series of six items: inserted tokens, deleted tokens, the left
        context slice, the right context slice, the left context tokens and
        the right context tokens (token groups are tuples).
    """
    from_tokens = revision.content_str_vec
    left_pos = int(change["left_neigh"])
    right_pos = int(change["right_neigh"])

    # Left context: up to epsilon_size tokens before the left neighbour, inclusive.
    left_neigh = slice(max(left_pos - epsilon_size, 0), left_pos + 1)

    # Right context: the right neighbour plus up to epsilon_size tokens after it.
    # Slice stops are exclusive, so the clamp is `size`, not `size - 1`;
    # clamping to `size - 1` silently dropped the final token.
    right_neigh = slice(right_pos, min(right_pos + epsilon_size + 1, from_tokens.size))

    if change["ins_start_pos"] == -1:
        ins_tokens = []
    else:
        ins_slice = slice(int(change["ins_start_pos"]), int(change["ins_end_pos"] + 1))
        ins_tokens = to_rev.content.str.values[ins_slice]

    if change["del_start_pos"] == -1:
        del_tokens = []
    else:
        del_slice = slice(int(change["del_start_pos"]), int(change["del_end_pos"] + 1))
        del_tokens = from_tokens[del_slice]

    left_token = from_tokens[left_neigh]
    right_token = from_tokens[right_neigh]
    return pd.Series([tuple(ins_tokens), tuple(del_tokens), left_neigh, right_neigh, tuple(left_token), tuple(right_token)])

In [11]:
# Notebook configuration: which article to process and where its files live.
# WikiWho API base URL (not referenced again in the cells visible here).
baseurl = "https://api.wikiwho.net/en/api/v1.0.0-beta/"
article_name = "Violence_against_Muslims_in_India"
# HDF5 file holding the article's revision list, token records and per-revision content.
filename = article_name + ".h5"
content_dir = "../data/content/"
change_object_dir =  "../data/change objects/"
filepath = os.path.join(content_dir, filename)

# Number of context tokens kept on each side of a change (see find_tokens).
epsilon_size = 30
# Echo the value as the cell output.
epsilon_size

30

In [None]:
%%time
# Build the change objects for `article_name` straight from its HDF5 store.
with pd.HDFStore(filepath, 'r') as store:
    # Revision list and token records saved for this article.
    rev_list = store.get("rev_list")
    all_rev = store.get("all_tokens")
    all_tokens = all_rev.to_dict(orient="records")
    # One Revision object per row, indexed by revision id.
    revs = rev_list.apply(lambda rev: Revision(rev["id"],rev["timestamp"], rev["editor"]),axis=1)
    revs.index = rev_list.id
    from_rev_id = revs.index[0]

    # The title was previously the undefined name `content` (NameError on a
    # fresh kernel); the article name is what the function version uses too.
    wiki = Wiki(2345, article_name, revs, all_tokens)
    # The first revision needs its content before the first comparison.
    wiki.revisions.iloc[0].content = store["r"+str(from_rev_id)]
    # Walk consecutive revision pairs; each revision's content is stored under "r<id>".
    for to_rev_id in list(revs.index[1:]):
        key="r"+str(to_rev_id)
        to_rev_content = store[key]
        wiki.create_change(from_rev_id, to_rev_id, to_rev_content, epsilon_size)
        from_rev_id = to_rev_id

In [62]:
# Pickle the whole Wiki object next to the other change objects.
# `content` was never defined; the file is named after the article instead.
save_filepath = os.path.join(change_object_dir, article_name+".pkl")
with open(save_filepath, "wb") as file:
    pickle.dump(wiki, file)

### Saving change objects for all the articles in the list

In [12]:
def create_change_object(article_name, content_dir = "../data/content/", 
                            change_object_dir =  "../data/change objects/", epsilon_size=30, save=False):
    """
    Build the Wiki object, with all change objects computed, for one article.

    Args:
        article_name: article whose ``<article_name>.h5`` store is read.
        content_dir: directory containing the HDF5 content stores.
        change_object_dir: directory the pickle is written to when ``save`` is True.
        epsilon_size: number of context tokens forwarded to ``Wiki.create_change``.
        save: when True, pickle the resulting Wiki to ``<article_name>.pkl``.

    Returns:
        The populated Wiki object.
    """
    source_path = os.path.join(content_dir, article_name+".h5")
    pickle_path = os.path.join(change_object_dir, article_name+".pkl")

    def _to_revision(row):
        # One row of the revision list becomes one Revision object.
        return Revision(row["id"], row["timestamp"], row["editor"])

    with pd.HDFStore(source_path, 'r') as store:
        # Revision list and token records stored for the article.
        revision_table = store.get("rev_list")
        token_records = store.get("all_tokens").to_dict(orient="records")

        revs = revision_table.apply(_to_revision, axis=1)
        revs.index = revision_table.id

        wiki = Wiki(2345, article_name, revs, token_records)

        # Seed the first revision with its content, then compare each
        # revision with its successor in order.
        previous_id = revs.index[0]
        wiki.revisions.iloc[0].content = store["r"+str(previous_id)]
        for current_id in list(revs.index[1:]):
            current_content = store["r"+str(current_id)]
            wiki.create_change(previous_id, current_id, current_content, epsilon_size)
            previous_id = current_id

    if save:
        with open(pickle_path, "wb") as file:
            pickle.dump(wiki, file)

    return wiki

    

In [6]:
# Article names to process, taken from the "articles" column of the CSV.
article_series=pd.read_csv("../conflicted_article.csv")["articles"]

In [13]:
%%time
# Build the change objects for the configured article; save=False skips the pickle.
wiki = create_change_object(article_name, save=False)

CPU times: user 54 s, sys: 10.2 s, total: 1min 4s
Wall time: 2min 55s


In [None]:
# for article in article_series[19:]:
#     print(article)
#     create_change_object(article)


### Saving change objects as a DataFrame

In [14]:
%%time
# Gather the change table stored on every "from" revision. The last revision
# is never compared with a successor, so it is left out.
change_objects = [revision.change_df for revision in wiki.revisions.iloc[:-1]]


CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 4.68 ms


In [15]:
# Per-revision metadata, in revision order.
rev_ids = [rev.id for rev in wiki.revisions.tolist()]
editor_s = [rev.editor for rev in wiki.revisions.tolist()]
timestamp_s = pd.to_datetime([rev.timestamp for rev in wiki.revisions.tolist()])

# Each change object links a revision to its successor.
from_rev_ids, to_rev_ids = rev_ids[:-1], rev_ids[1:]

# Time elapsed between the two revisions of every pair.
time_gap = pd.to_timedelta(timestamp_s[1:]-timestamp_s[:-1])

# One key per change object; timestamp and editor are those of the "to" revision.
index = list(zip(from_rev_ids, to_rev_ids, timestamp_s.tolist()[1:], time_gap, editor_s[1:]))
change_df = pd.concat(
    change_objects,
    sort=False,
    keys=index,
    names=["from revision id", "to revision id", "timestamp", "timegap", "editor"],
)

In [16]:
%%time
# Write the combined change table as a pickle.
change_object_dir =  "../data/change objects/"
change_dataframe_path = os.path.join(change_object_dir, article_name+"_change.pkl")
# to_pickle returns None, so its result is no longer bound to a name.
change_df.to_pickle(change_dataframe_path)

469 ms ± 199 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%time
# Read the pickled change table back to time the load.
# NOTE(review): only unpickle files this notebook wrote itself; pickle is unsafe on untrusted input.
change_object_dir =  "../data/change objects/"
change_dataframe_path = os.path.join(change_object_dir, article_name+"_change.pkl")
a=pd.read_pickle(change_dataframe_path)

In [17]:
%%time
# Write the same change table as HDF5 under the key "data" to compare with the pickle.
change_dataframe_path = os.path.join(change_object_dir, article_name+"_change.h5")
change_df.to_hdf(change_dataframe_path, key="data")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['ins_tokens', 'del_tokens', 'left_neigh_slice', 'right_neigh_slice', 'left_token', 'right_token']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


223 ms ± 103 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit
# Time reading the HDF5 copy of the change table (key "data").
change_dataframe_path = os.path.join(change_object_dir, article_name+"_change.h5")


a=pd.read_hdf(change_dataframe_path, key="data")

### Test

In [None]:
def tokens_to_df(tokens):
    """
    Turn a list of token records into a content DataFrame framed by sentinels.

    A start sentinel (token_id -1, "{st@rt}") is placed before the tokens and
    an end sentinel (token_id -2, "{$nd}") after them.

    Args:
        tokens: iterable of dicts with ``token_id`` and ``str`` keys. It is not
            modified, so the same list can be reused.

    Returns:
        pd.DataFrame with one row per token, sentinels included.
    """
    start_marker = {'token_id':-1, 'str':  "{st@rt}"}
    end_marker = {'token_id':-2, 'str': "{$nd}"}
    # Build a new list rather than insert/append on the caller's list.
    return pd.DataFrame([start_marker, *tokens, end_marker])

In [None]:
# Hand-written token records for the test. Each record names the revision where
# the token first appears (o_rev_id), the revisions that re-insert it ('in')
# and the revisions that remove it ('out'), as consumed by Wiki.add_all_token.
all_tokens = [{'o_rev_id': 558137654,
  'str': 'contemporary',
  'token_id': 994,
  'in': [561887510],
  'out': [561887480]},
 {'o_rev_id': 558137654,
  'str': 'india',
  'token_id': 995,
  'in': [561887510,561887510],
  'out': [561887480]},
 {'o_rev_id': 561887480, 'str': ':', 'token_id': 996, 'in': [561887510], 'out': [561887490]},
 {'o_rev_id': 561887480, 'str': '|', 'token_id': 8976, 'in': [561887510], 'out': [561887490]},
 {'o_rev_id': 558137654,
  'str': 'hefner',
  'token_id': 9876,
  'in': [],
  'out': []},
 {'o_rev_id': 558137654, 'str': '_', 'token_id': 1023, 'in': [561887510], 'out': [561887480]}]

In [None]:
# Four test revisions, all by the same editor.
rev_list = pd.DataFrame({'id': [558137654, 561887480,561887490,561887510],
 'editor': ['14904681', '14904681','14904681', '14904681'],
 'timestamp': ['2013-06-03T14:57:37Z', '2013-06-03T15:00:42Z', '2013-06-04T15:00:42Z', '2013-06-05T15:00:42Z']})
# One Revision object per row, indexed by revision id.
revs = rev_list.apply(lambda rev: Revision(rev["id"],rev["timestamp"], rev["editor"]),axis=1)
revs.index = rev_list.id

In [None]:
# Content of the first test revision, framed by the start/end sentinel tokens.
first_tokens = tokens_to_df( [
 {'str': 'contemporary','token_id': 994},
 {'str': 'india','token_id': 995},
 {'str': 'hefner','token_id': 9876},
 {'str': '_', 'token_id': 1023}])
# Show the revision text without the two sentinel rows.
first_tokens["str"][1:-1].str.cat(sep= " ")

In [None]:
# Content of the second test revision, framed by the start/end sentinel tokens.
second_tokens = tokens_to_df( [
 {'str': ':', 'token_id': 996},
 {'str': '|', 'token_id': 8976},
 {'str': 'hefner','token_id': 9876}
])
# Show the revision text without the two sentinel rows.
second_tokens["str"][1:-1].str.cat(sep= " ")

In [None]:
# Content of the third test revision: a single token between the sentinels.
third_tokens = tokens_to_df( [
 {'str': 'hefner','token_id': 9876}
])
# Show the revision text without the two sentinel rows.
third_tokens["str"][1:-1].str.cat(sep= " ")

In [None]:
# Content of the fourth test revision, framed by the start/end sentinel tokens.
fourth_tokens = tokens_to_df( [
 {'str': 'contemporary','token_id': 994},
 {'str': ':', 'token_id': 996},
 {'str': '|', 'token_id': 8976},
 {'str': 'india','token_id': 995},
 {'str': 'hefner','token_id': 9876},
 {'str': '_', 'token_id': 1023}])
# Show the revision text without the two sentinel rows.
fourth_tokens["str"][1:-1].str.cat(sep= " ")

In [None]:
# Run the change-object pipeline on the four hand-written test revisions.
test_wiki = Wiki(1234, "test",revs, all_tokens)

# The first revision needs its content before the first comparison.
test_wiki.revisions.iloc[0].content = first_tokens


first_rev_id = revs.index[0]
second_rev_id = revs.index[1]

# create_change takes (from_rev_id, to_rev_id, to_rev_content, epsilon_size);
# the extra, undefined `vocab` argument has been dropped.
test_wiki.create_change(first_rev_id, second_rev_id, second_tokens, 6)

third_rev_id = revs.index[2]

test_wiki.create_change(second_rev_id, third_rev_id, third_tokens, 6)

fourth_rev_id = revs.index[3]

test_wiki.create_change(third_rev_id, fourth_rev_id, fourth_tokens, 6)

# Every "from" revision stores its result as `change_df`; the last revision has none.
change_objects = [revision.change_df for revision in test_wiki.revisions.iloc[:-1]]
# Key each change table by the id of the "from" revision in the test wiki.
change_df = pd.concat(change_objects, sort=False, keys=test_wiki.revisions.index[:-1])

In [None]:
# Display the change table built from the test revisions.
change_df