In [2]:
import gensim
from gensim.models import KeyedVectors
from gensim.models.fasttext import FastText as FT_gensim
import pandas as pd
import numpy as np
import requests
import pickle
import os

In [3]:
# FT_gensim.load('../wordvectors/wiki-news-300d-1M-subword.vec.zip')
wiki_vec = KeyedVectors.load_word2vec_format('../wordvectors/wiki.en.vec', binary=False, limit=5000)
vocab = set(wiki_vec.vocab)

In [4]:
class Wiki:
    def __init__(self,id,title, all_tokens=[]):
        self.id = id
        self.title = title
        self.all_tokens = all_tokens
        
    def init_revisions(self, revisions):
#         self.revisions = {revision["id"] : Revision(revision["id"],revision["timestamp"], revision["editor"]) for revision in revisions}
          self.revisions = pd.Series( {revision["id"] : 
                                       Revision(revision["id"],revision["timestamp"], revision["editor"]) for revision in revisions} )

 
    def add_all_token(self, all_tokens):
        self.all_tokens = all_tokens
        for token in self.all_tokens:
            self.revisions.loc[token["o_rev_id"]].added.add(token["token_id"])
            for in_revision in token["in"]:
                self.revisions.loc[in_revision].added.add(token["token_id"])
            for out_revision in token["out"]:
                self.revisions.loc[out_revision].removed.add(token["token_id"])
class Revision:
    def __init__(self, id, timestamp,editor):
        self.id = id
        self.timestamp = timestamp
        self.editor = editor
        self.added = set()
        self.removed = set()   
        
    def deleted(self, to_rev):
        self.content["removed"] = pd.Series(np.isin( self.content["token_id"].values, list(t_rev.removed), assume_unique= True ))
        end_pos = np.argwhere(np.ediff1d(np.pad(self.content["removed"].astype(np.int), (1,1), mode="constant", constant_values=0)) == -1) -1 
        start_pos = np.argwhere(np.ediff1d(np.pad(self.content["removed"].astype(np.int), (1,1), mode="constant", constant_values=0)) == 1)
        start_neighbour = start_pos - 1
        end_neighbour = end_pos + 1
#         self.first_last_token_del = self.content["removed"].values[[0,-1]]
#         if self.first_last_token_del[0]:
#             start_pos[0] = end_pos[0]
#             start_neighbour[0] = end_neighbour[0]
#         if self.first_last_token_del[1]:
#             end_pos[-1] = start_pos[-1]
#             start_neighbour[-1] = end_neighbour[-1]
        self.deleted_object = pd.DataFrame(np.c_[ start_pos, end_pos, start_neighbour, end_neighbour ],
                                       columns=[ "del_start_pos", "del_end_pos", "left_neigh", "right_neigh",])
    
    def inserted_continuous_pos(self):
        self.content["added"] =  pd.Series(np.isin( self.content["token_id"].values, list(self.added), assume_unique= True))
        end_pos = np.argwhere(np.ediff1d(np.pad(self.content["added"].astype(np.int), (1,1), mode="constant", constant_values=0)) == -1) -1 
        start_pos = np.argwhere(np.ediff1d(np.pad(self.content["added"].astype(np.int), (1,1), mode="constant", constant_values=0)) == 1)
        self.added_pos = np.c_[start_pos, end_pos]

    def inserted_neighbours(self):
        start_token_pos = self.added_pos[:,0] - 1
        end_token_pos = self.added_pos[:,1] + 1
#         self.first_last_token = self.content["added"].values[[0,-1]]
#         if self.first_last_token[0]:
#             start_token_pos[0] = end_token_pos[0]
#         if self.first_last_token[1]:
#             end_token_pos[-1] = start_token_pos[-1]
        self.start_token_id = self.content["token_id"].values[start_token_pos]
        self.end_token_id = self.content["token_id"].values[end_token_pos]
    
    def create_change_object(self, to_rev):
        self.ins_left = np.argwhere(np.isin(self.content.token_id.values, to_rev.start_token_id, assume_unique= True))
        self.ins_right = np.argwhere(np.isin(self.content.token_id.values, to_rev.end_token_id, assume_unique= True))
        self.inserted_object = pd.DataFrame(np.concatenate([to_rev.added_pos, self.ins_left, self.ins_right], axis=1),
                                       columns=["ins_start_pos", "ins_end_pos", "left_neigh", "right_neigh", ])

        self.change = pd.merge(self.inserted_object, self.deleted_object,how="outer", on=["left_neigh", "right_neigh"])
        self.change.fillna(0, inplace=True)
        
#     def find_tokens(self, epsilon_size):
#         self.vocabs_pos = np.argwhere( self.content["invocab"].values)
#         self.content_str_vec = self.content.str.values
#         del self.content


#         return pd.Series([tuple(ins_tokens), tuple(del_tokens), tuple(left_neigh), tuple(right_neigh), tuple(left_token), tuple(right_token)])
    
#     def find_neighbour_tokens(self, change_neigh, epsilon_size):
#         left_neigh_start = change_neigh[0]
#         right_neigh_start = change_neigh[1]
#         left_neigh = self.vocabs_pos[self.vocabs_pos <= left_neigh_start][-epsilon_size:]
#         right_neigh = self.vocabs_pos[self.vocabs_pos >= right_neigh_start][:epsilon_size]
#         left_tokens = tuple(self.content_str_vec[left_neigh])
#         right_tokens = tuple(self.content_str_vec[right_neigh])
#         return np.c_[left_tokens, right_tokens]
        
#     def find_changed_tokens(self, change_pos, epsilon_size):
#         ins_start_pos = change_pos[0]
#         ins_end_pos = change_pos[1]
#         del_start_pos = change_pos[0]
#         del_end_pos = change_pos[1]
#         ins_slice = slice(int(ins_start_pos), int(ins_end_pos)+1 )
#         del_slice = slice(int(del_start_pos), int(del_end_pos)+1 )
#         ins_tokens = tuple(self.content_str_vec[ins_slice])
#         del_tokens = tuple(self.content_str_vec[del_slice])
#         return np.c_[[ins_tokens], [del_tokens]]
    
#     def find_tokens(change, revision, epsilon_size):
#         left_neigh = revision.vocabs_pos[revision.vocabs_pos <= change["left_neigh"]][-epsilon_size:]
#         right_neigh = revision.vocabs_pos[revision.vocabs_pos >= change["right_neigh"]][:epsilon_size]
#         ins_slice = slice(int(change["ins_start_pos"]), int(change["ins_end_pos"]+1) )
#         del_slice = slice(int(change["del_start_pos"]), int(change["del_end_pos"]+1) )
#         left_token = revision.content_str_vec[left_neigh]
#         right_token = revision.content_str_vec[right_neigh]
#         ins_tokens = revision.content_str_vec[ins_slice]
#         del_tokens = revision.content_str_vec[del_slice]
#         return pd.Series([tuple(ins_tokens), tuple(del_tokens), tuple(left_neigh), tuple(right_neigh), tuple(left_token), tuple(right_token)])
        
    def append_neighbour_vec(self, epsilon_size):
        self.vocabs_pos = np.argwhere( self.content["invocab"].values)
        self.content_str_vec = self.content.str.values
#         del self.content
        neighbour_df = self.change.apply(find_tokens, axis=1, args=(self, epsilon_size))
        neighbour_df.columns= ["ins_tokens", "del_tokens", "left_neigh", "right_neigh", "left_token", "right_token"]
        self.neighbour = neighbour_df
        self.change_df = pd.concat([self.change, neighbour_df], sort=False, axis=1)
        
class Change:
    def __init__(self, token, start, end, left_context, right_context):
        self.token = token
        self.start = start
        self.end = end
        self.left = left_context
        self.right = right_context
        
def find_tokens(change, revision, epsilon_size):
    left_neigh = revision.vocabs_pos[revision.vocabs_pos <= change["left_neigh"]][-epsilon_size:]
    right_neigh = revision.vocabs_pos[revision.vocabs_pos >= change["right_neigh"]][:epsilon_size]
    ins_slice = slice(int(change["ins_start_pos"]), int(change["ins_end_pos"]+1) )
    del_slice = slice(int(change["del_start_pos"]), int(change["del_end_pos"]+1) )
    left_token = revision.content_str_vec[left_neigh]
    right_token = revision.content_str_vec[right_neigh]
    ins_tokens = revision.content_str_vec[ins_slice]
    del_tokens = revision.content_str_vec[del_slice]
    return pd.Series([tuple(ins_tokens), tuple(del_tokens), tuple(left_neigh), tuple(right_neigh), tuple(left_token), tuple(right_token)])


In [5]:
baseurl = "https://api.wikiwho.net/en/api/v1.0.0-beta/"
content = "Violence_against_Muslims_in_India"
revisions_url = os.path.join( baseurl, "rev_ids", content+"/")
params = {"editor": "true", "timestamp": "true"}
response = requests.get(revisions_url, params= params)
revisons_list = response.json()["revisions"]

In [6]:
all_content_url = os.path.join(baseurl, "all_content", content +"/")
params = { "o_rev_id": "true", "editor": "false", "token_id": "true", "in": "true", "out": "true" }
all_rev_data = requests.get(all_content_url, params= params)
all_tokens_mama = all_rev_data.json()["all_tokens"]

In [7]:
%%time
test_wiki = Wiki(2345, "a test wiki", all_tokens=4)
test_wiki.init_revisions(revisons_list)
test_wiki.add_all_token(all_tokens_mama) 
epsilon_size = 6
# del all_tokens_mama

CPU times: user 7.49 s, sys: 16 ms, total: 7.5 s
Wall time: 7.5 s


In [8]:
def get_contents(baseurl, content, start_rev_id, end_rev_id=""):
    content_url = os.path.join(baseurl, "rev_content", content, str(start_rev_id)+"/")
    if end_rev_id:
        content_url = os.path.join(content_url, str(end_rev_id)+"/")
    params = { "o_rev_id": "false", "editor": "false", "token_id": "true", "in": "false", "out": "false" }
    rev_contents = requests.get(content_url, params= params).json()["revisions"]
    return rev_contents

In [9]:
# %%time
# epsilon_size = 6
# test_wiki.revisions.iloc[0].content = pd.DataFrame(list(rev_contents[0].values())[0]["tokens"])
# from_index = 0
# for rev_content in rev_contents[1:]:
#     from_rev = test_wiki.revisions.iloc[from_index]
#     to_rev = test_wiki.revisions.iloc[from_index+1]
#     from_index += 1
#     from_rev.deleted(to_rev)
#     from_rev.content["invocab"] = from_rev.content["str"].isin(vocab)
#     to_rev.content = pd.DataFrame(list(rev_content.values())[0]["tokens"])
#     to_rev.inserted_continuous_pos()
#     to_rev.inserted_neighbours()
#     from_rev.create_change_object(to_rev)
#     from_rev.append_neighbour_vec(epsilon_size)

In [10]:
def create_change(wiki, rev_contents, from_index, vocab, epsilon_size):
    for rev_content in rev_contents:
        try:
            f_rev = wiki.revisions.iloc[from_index]
            t_rev = wiki.revisions.iloc[from_index+1]
            from_index += 1
            f_rev.deleted(t_rev)
            f_rev.content["invocab"] = f_rev.content["str"].isin(vocab)
            tokens = list(rev_content.values())[0]["tokens"]
            tokens.insert(0, {'token_id':-1, 'str':  "{st@rt}"})
            tokens.append({'token_id':-2, 'str': "{$nd}"})
            t_rev.content = pd.DataFrame(tokens)
            t_rev.inserted_continuous_pos()
            t_rev.inserted_neighbours()
            f_rev.create_change_object(t_rev)
            f_rev.append_neighbour_vec(epsilon_size)
        except:
            print("problem in ", rev_content.keys() )
    return from_index
        

In [11]:
# %%time
# rev_contents = get_contents(baseurl, content, str(revisons_list[0]["id"]), str(revisons_list[1]["id"]))

# epsilon_size = 6
# test_wiki.revisions.iloc[0].content = pd.DataFrame(list(rev_contents[0].values())[0]["tokens"])
# from_index = 0

In [12]:
%%time
rev_contents = get_contents(baseurl, content, str(revisons_list[0]["id"]), str(revisons_list[1]["id"]))
epsilon_size = 6
tokens = list(rev_contents[0].values())[0]["tokens"]
tokens.insert(0, {'token_id':-1, 'str':  "{st@rt}"})
tokens.append({'token_id':-2, 'str': "{$nd}"})
test_wiki.revisions.iloc[0].content = pd.DataFrame(tokens)
from_index = 0
if len(revisons_list) > 200:
    step = 200
else:
    step = len(revisons_list)
start_index = from_index + 1
end_index = len(revisons_list)
for to_index in  range(start_index, end_index, step):
    try:
        rev_contents = get_contents(baseurl, content, str(revisons_list[(from_index+1)]["id"]), str(revisons_list[to_index]["id"]))
        create_change(test_wiki, rev_contents, from_index, vocab, epsilon_size)
        print("ran till", to_index)
        from_index = to_index - 1
    except:
        print("problem ", from_index)
to_index = from_index + (end_index-1)%step
rev_contents = get_contents(baseurl, content, str(revisons_list[(from_index+1)]["id"]), str(revisons_list[to_index]["id"]))
create_change(test_wiki, rev_contents, from_index, vocab, epsilon_size)
from_index = to_index - 1
rev_contents = get_contents(baseurl, content, str(revisons_list[(from_index+1)]["id"]), "")
create_change(test_wiki, rev_contents, from_index, vocab, epsilon_size)


ran till 1
problem in  dict_keys(['558137760'])
problem in  dict_keys(['558138007'])
problem in  dict_keys(['558138375'])
problem in  dict_keys(['558140223'])
problem in  dict_keys(['558140376'])
problem in  dict_keys(['558140784'])
problem in  dict_keys(['558155934'])
problem in  dict_keys(['558238335'])
problem in  dict_keys(['558244871'])
problem in  dict_keys(['558249320'])
problem in  dict_keys(['558256275'])
problem in  dict_keys(['558256307'])
problem in  dict_keys(['558397658'])
problem in  dict_keys(['558531547'])
problem in  dict_keys(['558603673'])
problem in  dict_keys(['558605394'])
problem in  dict_keys(['558605478'])
problem in  dict_keys(['558605732'])
problem in  dict_keys(['558605803'])
problem in  dict_keys(['558608390'])
problem in  dict_keys(['558610600'])
problem in  dict_keys(['558610905'])
problem in  dict_keys(['558613621'])
problem in  dict_keys(['558639241'])
problem in  dict_keys(['558640600'])
problem in  dict_keys(['558640982'])
problem in  dict_keys(['558

In [None]:
with open(content+".pkl", "wb") as file:
    pickle.dump(test_wiki, file)

### reading the change object and clustering.

In [None]:
with open(content+".pkl", "rb") as file:
    wiki = pickle.load(file)

In [None]:
change_objects = []
wiki.revisions[:99].apply(lambda revision: change_objects.append(revision.change))
change_df = pd.concat(change_objects, sort=False, keys=wiki.revisions.index)

In [None]:
def get_word_vecs(tokens):
    in_vocab_tokens = set(tokens) & set(wiki_vec.vocab)
    if in_vocab_tokens:
        return wiki_vec[in_vocab_tokens].sum(axis=0, keepdims=True)
    else:
        return np.zeros((1, wiki_vec.vector_size))

In [None]:
%%time
change_vecs_list = []
change_token_s = change_df["ins_tokens"] + change_df["del_tokens"]
change_token_s.apply(lambda token_set: change_vecs_list.append(get_word_vecs(token_set)))

change_matrix = np.concatenate(change_vecs_list, axis=0)


In [None]:
%%time
neigh_vecs_list = []
neighbour_s = change_df['left_token'] + change_df['right_token']
neighbour_s.apply(lambda token_set: neigh_vecs_list.append(get_word_vecs(token_set)))

neighbour_matrix = np.concatenate(neigh_vecs_list, axis=0)

In [None]:
from_rev = wiki.revisions.iloc[0] 
to_rev = wiki.revisions.iloc[1]


In [None]:
# def neighbours(change, revision, wiki_vec, epsilon_size):
#     vocabs_pos = np.argwhere( revision.content["invocab"].values)
#     left = vocabs_pos[vocabs_pos < change["ins_starting_pos"]][:epsilon_size]
#     right = vocabs_pos[vocabs_pos > change["ins_end_pos"]][:epsilon_size]
#     neighbor_tokens = from_rev.content.str.values[np.r_[left,right]]
#     neighbour_vec = wiki_vec[neighbor_tokens].sum(axis=0)
#     combined_neighbour_vec = np.r_[left, right, neighbor_tokens, neighbour_vec]
#     return combined_neighbour_vec  
# added_pos = test_wiki.revisions[3317921].added_pos
# start_token_id = test_wiki.revisions[3317921].content["token_id"].values[added_pos[:,0]-1]
# end_token_id = test_wiki.revisions[3317921].content["token_id"].values[added_pos[:,1]+1]
# first_last_token = test_wiki.revisions[3317921].content["added"].values[[0,-1]]

# test_wiki.revisions[3317921].content["str"].values[np.s_[start_pos,end_pos[1:]]]

# test_wiki.revisions[3317921].content["str"].values[start_pos[0][0]:end_pos[1][0]+1]

# np.compress(test_wiki.revisions[3317921].content["removed"].values, test_wiki.revisions[3317921].content["str"].values)

### Do clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
km = KMeans(n_clusters= 30)
clusters = km.fit(neighbour_matrix)

In [None]:
cluster_s = pd.Series(clusters.labels_, index= change_df.index)
change_df["cluster"] = cluster_s
change_grouped = change_df.groupby("cluster")

In [None]:
change_grouped.size()

In [None]:
change_grouped.get_group(1)[["ins_tokens", "del_tokens", "left_neigh", "right_neigh"]].head(20)