In [3]:
from gensim.models import KeyedVectors
from gensim.models.fasttext import FastText as FT_gensim
import pandas as pd
import numpy as np
import requests
import pickle
import os
import traceback
from scripts.wiki import Wiki,Revision

In [133]:
# FT_gensim.load('../wordvectors/wiki-news-300d-1M-subword.vec.zip')
wiki_vec = KeyedVectors.load_word2vec_format('../wordvectors/wiki.en.vec', binary=False, limit=5000)
vocab = set(wiki_vec.vocab)

In [311]:
class Wiki:
    '''
    MAIN CLASS TO store all revisions for a wiki along with editors and timestamp.
    '''
    def __init__(self,id,title, revs, all_tokens=[]):
        self.id = id
        self.title = title
        self.revisions = revs
        self.add_all_token(all_tokens)
        
#     def init_revisions(self, revisions):
#           self.revisions = pd.Series( {revision["id"] : 
#                                        Revision(revision["id"],revision["timestamp"], revision["editor"]) for revision in revisions} )

 
           
    def add_all_token(self, all_tokens):
        for token in all_tokens:
            self.revisions.loc[token["o_rev_id"]].added.add(token["token_id"])
            for in_revision in token["in"]:
                self.revisions.loc[in_revision].added.add(token["token_id"])
            for out_revision in token["out"]:
                self.revisions.loc[out_revision].removed.add(token["token_id"])
                
    def create_change(self, from_rev_id, to_rev_id, to_rev_content, vocab, epsilon_size):
        try:
            from_rev = self.revisions[from_rev_id]
            to_rev = self.revisions[to_rev_id]
            from_rev.deleted(to_rev)
            from_rev.content["invocab"] = from_rev.content["str"].isin(vocab)
            to_rev.content = to_rev_content
            to_rev.inserted_continuous_pos()
            to_rev.inserted_neighbours()
            from_rev.create_change_object(to_rev)
            from_rev.append_neighbour_vec(to_rev, epsilon_size)
        except:
            print("exception occurred in calculating change object",traceback.format_exc())
            print("problem in ", to_rev_content.keys() )

In [312]:
class Revision:
    def __init__(self, id, timestamp,editor):
        self.id = id
        self.timestamp = timestamp
        self.editor = editor
        self.added = set()
        self.removed = set()   
        
    def deleted(self, to_rev):
        self.content["removed"] = pd.Series(np.isin( self.content["token_id"].values, list(to_rev.removed), assume_unique= True ))
        end_pos = np.argwhere(np.ediff1d(np.pad(self.content["removed"].astype(np.int), (1,1), mode="constant", constant_values=0)) == -1) -1 
        start_pos = np.argwhere(np.ediff1d(np.pad(self.content["removed"].astype(np.int), (1,1), mode="constant", constant_values=0)) == 1)
        start_neighbour = start_pos - 1
        end_neighbour = end_pos + 1
        self.deleted_object = pd.DataFrame(np.c_[ start_pos, end_pos, start_neighbour, end_neighbour ],
                                       columns=[ "del_start_pos", "del_end_pos", "left_neigh", "right_neigh",])
    
    def inserted_continuous_pos(self):
        self.content["added"] = pd.Series(np.isin( self.content["token_id"].values, list(self.added), assume_unique= True))
        end_pos = np.argwhere(np.ediff1d(np.pad(self.content["added"].astype(np.int), (1,1), mode="constant", constant_values=0)) == -1) -1 
        start_pos = np.argwhere(np.ediff1d(np.pad(self.content["added"].astype(np.int), (1,1), mode="constant", constant_values=0)) == 1)
        self.added_pos = np.c_[start_pos, end_pos]

    def inserted_neighbours(self):
        start_token_pos = self.added_pos[:,0] - 1
        end_token_pos = self.added_pos[:,1] + 1
        self.start_token_id = self.content["token_id"].values[start_token_pos]
        self.end_token_id = self.content["token_id"].values[end_token_pos]
    
    def create_change_object(self, to_rev):
        self.ins_left = np.argwhere(np.isin(self.content.token_id.values, to_rev.start_token_id, assume_unique= True))
        self.ins_right = np.argwhere(np.isin(self.content.token_id.values, to_rev.end_token_id, assume_unique= True))
        self.inserted_object = pd.DataFrame(np.concatenate([to_rev.added_pos, self.ins_left, self.ins_right], axis=1),
                                       columns=["ins_start_pos", "ins_end_pos", "left_neigh", "right_neigh", ])

        self.change = pd.merge(self.inserted_object, self.deleted_object,how="outer", on=["left_neigh", "right_neigh"])
        self.change.fillna(-1, inplace=True)
        
    def append_neighbour_vec(self, to_rev, epsilon_size):
        self.vocabs_pos = np.argwhere( self.content["invocab"].values)
        self.content_str_vec = self.content.str.values
        del self.content
        neighbour_df = self.change.apply(find_tokens, axis=1, args=(self,to_rev, epsilon_size))
        neighbour_df.columns= ["ins_tokens", "del_tokens", "left_neigh", "right_neigh", "left_token", "right_token"]
        self.neighbour = neighbour_df
        self.change_df = pd.concat([self.change, neighbour_df], sort=False, axis=1)
        



In [313]:
def find_tokens(change, revision, to_rev, epsilon_size):
    left_neigh = revision.vocabs_pos[revision.vocabs_pos <= change["left_neigh"]][-epsilon_size:]
    right_neigh = revision.vocabs_pos[revision.vocabs_pos >= change["right_neigh"]][:epsilon_size]
    if(change["ins_start_pos"]==-1):
        ins_tokens = []
    else:
        ins_slice = slice(int(change["ins_start_pos"]), int(change["ins_end_pos"]+1) )
        ins_tokens = to_rev.content.str.values[ins_slice]
    if(change["del_start_pos"] == -1):
        del_tokens = []
    else:
        del_slice = slice(int(change["del_start_pos"]), int(change["del_end_pos"]+1) )
        del_tokens = revision.content_str_vec[del_slice]
    left_token = revision.content_str_vec[left_neigh]
    right_token = revision.content_str_vec[right_neigh]
    return pd.Series([tuple(ins_tokens), tuple(del_tokens), tuple(left_neigh), tuple(right_neigh), tuple(left_token), tuple(right_token)])

In [4]:
baseurl = "https://api.wikiwho.net/en/api/v1.0.0-beta/"
content = "Yugoslavia"
filename = content + ".h5"
save_dir = "../data/content"
save_path = os.path.join(save_dir, filename)

epsilon_size = 6


In [18]:
all_content_url = os.path.join(baseurl, "all_content", content +"/")
params = { "o_rev_id": "true", "editor": "false", "token_id": "true", "in": "true", "out": "true" }
all_rev_data = requests.get(all_content_url, params= params)
all_tokens = all_rev_data.json()["all_tokens"]
all_df = pd.DataFrame(all_tokens)[["str","token_id"]].set_index("token_id")

In [137]:
%%time
with pd.HDFStore(filename, 'r') as store:
    rev_list = store.get("rev_list")
    revs = rev_list.apply(lambda rev: Revision(rev["id"],rev["timestamp"], rev["editor"]),axis=1)
    revs.index = rev_list.id
    from_rev_id = revs.index[0]
    wiki = Wiki(2345, content, revs, all_tokens)
    wiki.revisions.iloc[0].content = store["r"+str(from_rev_id)]   
    for to_rev_id in list(revs.index[1:]):
        key="r"+str(to_rev_id)
        to_rev_content = store[key]
        wiki.create_change(from_rev_id, to_rev_id, to_rev_content, vocab, epsilon_size)
        from_rev_id = to_rev_id

CPU times: user 34.6 s, sys: 1.37 s, total: 36 s
Wall time: 36 s


In [117]:
with open(content+".pkl", "wb") as file:
    pickle.dump(wiki, file)

### reading the change object and clustering.

In [118]:
with open(content+".pkl", "rb") as file:
    wiki = pickle.load(file)

In [138]:
change_objects = []
wiki.revisions[:-1].apply(lambda revision: change_objects.append(revision.neighbour))
change_df = pd.concat(change_objects, sort=False, keys=wiki.revisions.index)

In [132]:
def get_word_vecs(tokens):
    in_vocab_tokens = set(tokens) & set(wiki_vec.vocab)
    if in_vocab_tokens:
        return wiki_vec[in_vocab_tokens].sum(axis=0, keepdims=True)
    else:
        return np.zeros((1, wiki_vec.vector_size))

In [139]:
%%time
change_vecs_list = []
change_token_s = change_df["ins_tokens"] + change_df["del_tokens"]
change_token_s.apply(lambda token_set: change_vecs_list.append(get_word_vecs(token_set)))

change_matrix = np.concatenate(change_vecs_list, axis=0)


CPU times: user 2.47 s, sys: 0 ns, total: 2.47 s
Wall time: 2.47 s


In [85]:
%%time
neigh_vecs_list = []
neighbour_s = change_df['left_token'] + change_df['right_token']
neighbour_s.apply(lambda token_set: neigh_vecs_list.append(get_word_vecs(token_set)))

neighbour_matrix = np.concatenate(neigh_vecs_list, axis=0)

CPU times: user 1 s, sys: 8 ms, total: 1.01 s
Wall time: 1.01 s


### Do clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
km = KMeans(n_clusters= 30)
clusters = km.fit(neighbour_matrix)

In [None]:
cluster_s = pd.Series(clusters.labels_, index= change_df.index)
change_df["cluster"] = cluster_s
change_grouped = change_df.groupby("cluster")

In [None]:
change_grouped.size()

In [None]:
change_grouped.get_group(1)[["ins_tokens", "del_tokens", "left_neigh", "right_neigh"]].head(20)

### Test

In [402]:
def tokens_to_df(tokens):
    tokens.insert(0, {'token_id':-1, 'str':  "{st@rt}"})
    tokens.append({'token_id':-2, 'str': "{$nd}"})
    return pd.DataFrame(tokens)

In [400]:
all_tokens = [{'o_rev_id': 558137654,
  'str': 'contemporary',
  'token_id': 994,
  'in': [561887510],
  'out': [561887480]},
 {'o_rev_id': 558137654,
  'str': 'india',
  'token_id': 995,
  'in': [561887510,561887510],
  'out': [561887480]},
 {'o_rev_id': 561887480, 'str': ':', 'token_id': 996, 'in': [561887510], 'out': [561887490]},
 {'o_rev_id': 561887480, 'str': '|', 'token_id': 8976, 'in': [561887510], 'out': [561887490]},
 {'o_rev_id': 558137654,
  'str': 'hefner',
  'token_id': 9876,
  'in': [],
  'out': []},
 {'o_rev_id': 558137654, 'str': '_', 'token_id': 1023, 'in': [561887510], 'out': [561887480]}]

In [401]:
rev_list = pd.DataFrame({'id': [558137654, 561887480,561887490,561887510],
 'editor': ['14904681', '14904681','14904681', '14904681'],
 'timestamp': ['2013-06-03T14:57:37Z', '2013-06-03T15:00:42Z', '2013-06-04T15:00:42Z', '2013-06-05T15:00:42Z']})
revs = rev_list.apply(lambda rev: Revision(rev["id"],rev["timestamp"], rev["editor"]),axis=1)
revs.index = rev_list.id

In [403]:
first_tokens = tokens_to_df( [
 {'str': 'contemporary','token_id': 994},
 {'str': 'india','token_id': 995},
 {'str': 'hefner','token_id': 9876},
 {'str': '_', 'token_id': 1023}])
first_tokens["str"][1:-1].str.cat(sep= " ")

'contemporary india hefner _'

In [404]:
second_tokens = tokens_to_df( [
 {'str': ':', 'token_id': 996},
 {'str': '|', 'token_id': 8976},
 {'str': 'hefner','token_id': 9876}
])
second_tokens["str"][1:-1].str.cat(sep= " ")

': | hefner'

In [414]:
third_tokens = tokens_to_df( [
 {'str': 'hefner','token_id': 9876}
])
third_tokens["str"][1:-1].str.cat(sep= " ")

'hefner'

In [415]:
fourth_tokens = tokens_to_df( [
 {'str': 'contemporary','token_id': 994},
 {'str': ':', 'token_id': 996},
 {'str': '|', 'token_id': 8976},
 {'str': 'india','token_id': 995},
 {'str': 'hefner','token_id': 9876},
 {'str': '_', 'token_id': 1023}])
fourth_tokens["str"][1:-1].str.cat(sep= " ")

'contemporary : | india hefner _'

In [412]:
test_wiki = Wiki(1234, "test",revs, all_tokens)

test_wiki.revisions.iloc[0].content = first_tokens


first_rev_id = revs.index[0]
second_rev_id = revs.index[1]

test_wiki.create_change(first_rev_id, second_rev_id, second_tokens, vocab, 6)

third_rev_id = revs.index[2]

test_wiki.create_change(second_rev_id, third_rev_id, third_tokens, vocab, 6)

fourth_rev_id = revs.index[3]

test_wiki.create_change(third_rev_id, fourth_rev_id, fourth_tokens, vocab, 6)

change_objects = []
test_wiki.revisions[:-1].apply(lambda revision: change_objects.append(revision.neighbour))
change_df = pd.concat(change_objects, sort=False, keys=wiki.revisions.index)

In [413]:
change_df

Unnamed: 0_level_0,Unnamed: 1_level_0,ins_tokens,del_tokens,left_neigh,right_neigh,left_token,right_token
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
558137654,0,"(:, |)","(contemporary, india)",(),"(4,)",(),"(_,)"
558137654,1,(),"(_,)","(1, 2)",(),"(contemporary, india)",()
558137760,0,(),"(:, |)",(),(),(),()
561887480,0,"(contemporary, :, |, india)",(),(),(),(),()
561887480,1,"(_,)",(),(),(),(),()
