In [None]:
import numpy as np
import pandas as pd
import re

def _parse_edit_line(line):
    """Parse one line of the edit-history dump into a record dict.

    Fields are extracted purely by position after splitting the line on
    parentheses, on '?' characters and on the '. . ' separator, so the
    slicing offsets below are tied to the exact layout of the input file.

    Parameters
    ----------
    line : str
        One raw line from the data file (including its trailing newline).

    Returns
    -------
    dict
        Keys: timestamp, user, minorEdit, pageLength, editDiff, comment,
        tags, entireEdit.

    Raises
    ------
    IndexError
        If the line has fewer parenthesised groups than expected.
    ValueError
        If a numeric field cannot be converted to int.
    """
    # Pieces of the line between and around every '(' and ')'.
    token = re.split(r'\(|\)', line)

    # token[2] is split on '?': the first piece is the timestamp, the
    # remaining pieces are glued back together and trimmed by one character
    # on each side to give the user name.
    stamp_and_user = token[2].split('?')
    timestamp = pd.to_datetime(stamp_and_user[0])
    user = ''.join(stamp_and_user[1:])[1:-1]

    # An edit is flagged minor only when the text before the first '.' in
    # token[4] is exactly '? m '.
    minor_edit = token[4].split('.')[0] == '? m '

    # Numeric fields carry thousands separators, e.g. "12,345".
    page_length = int(token[5].split(' ')[0].replace(',', ''))
    edit_diff = int(token[7].replace(',', ''))

    # Everything after the last '. . ' is the comment, optionally followed
    # by "(undo)" and a tag section.
    tag_comment = re.split(r'\. \. ', line)[-1].split('(undo)')
    comment = tag_comment[0][1:-2]
    if len(tag_comment) > 1:
        tag = tag_comment[1][2:-2].split(':')
        if len(tag) > 1:
            tag = tag[1][1:].split(',')
        if tag[0] == '':
            tag = []
    else:
        # NOTE(review): this branch yields '' while the other yields a list;
        # kept as-is so existing consumers of the 'tags' column see the same
        # values — consider normalising to [] once they are checked.
        tag = ''

    return {
        'timestamp': timestamp,
        'user': user,
        'minorEdit': minor_edit,
        'pageLength': page_length,
        'editDiff': edit_diff,
        'comment': comment,
        'tags': tag,
        'entireEdit': line,
    }


# Read the whole file and release the handle straight away.
with open('data/wiki sample data.txt','r') as file:
    lines = file.readlines()

# Build all records first and create the frame once; appending with
# df.loc[i] inside the loop re-allocates the frame on every row.
# Whitespace-only lines are skipped, they carry no edit to parse.
edit_columns = ['timestamp','user','minorEdit','pageLength','editDiff','comment','tags','entireEdit']
edit_records = [_parse_edit_line(line) for line in lines if line.strip()]
df = pd.DataFrame(edit_records, columns=edit_columns)

print (df.shape)
print (df.head())

In [None]:
import nltk
import re
# The 'comment' column of the parsed edits; this is the text corpus that is
# tokenized and vectorized in the rest of this cell.
commentsDf = df['comment']

from nltk.stem.snowball import SnowballStemmer
# Shared English Snowball stemmer, used by tokenizeAndStem below.
stemmer = SnowballStemmer("english")

def tokenizeAndStem(text):
    """Tokenize ``text`` and return the stems of its word-like tokens.

    The text is split with ``nltk.word_tokenize``; tokens that contain no
    ASCII letter (pure punctuation, pure numbers) are discarded and the
    remaining tokens are stemmed with the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list
        One stem per kept token, in the original token order.
    """
    has_letter = re.compile('[a-zA-Z]')
    word_tokens = [tok for tok in nltk.word_tokenize(text) if has_letter.search(tok)]
    return [stemmer.stem(tok) for tok in word_tokens]

# Flat list of every stem produced from every comment (with repeats); only
# its length is reported here.
totalvocab_stemmed = [stem for text in commentsDf for stem in tokenizeAndStem(text)]

print (len(totalvocab_stemmed))

from sklearn.feature_extraction.text import TfidfVectorizer
# min_df=1 keeps every term, exactly as the integer 0 did (a term in the
# vocabulary always occurs in at least one document), but unlike 0 it is
# accepted by the parameter validation of recent scikit-learn releases.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8,
                                 min_df=1, stop_words='english',
                                 use_idf=True,tokenizer=tokenizeAndStem, ngram_range=(1,3))

# Rows are comments, columns are 1- to 3-gram terms.
tfidf_matrix = tfidf_vectorizer.fit_transform(commentsDf)
print (tfidf_matrix.shape)

# get_feature_names() no longer exists in current scikit-learn; prefer its
# replacement and fall back to the old name on installations that predate it.
# Either way `terms` ends up as a plain list of strings.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()



In [None]:
from sklearn.cluster import KMeans

num_clusters = 5

# Cluster the comments on their TF-IDF vectors. The seed is fixed so the
# cluster labels stored in df are the same on every run.
km = KMeans(n_clusters=num_clusters, random_state=1)
km.fit(tfidf_matrix)

# One cluster label per comment, in row order.
clusters = km.labels_.tolist()
df['clusters'] = clusters
print (df['clusters'].value_counts())

from sklearn.manifold import MDS

# Reduce to two components because the points are plotted in a plane.
# MDS receives the dense TF-IDF rows themselves (no `dissimilarity` argument
# is passed, so this is not a precomputed distance matrix).
# `random_state` is fixed so the layout is reproducible.
mds = MDS(n_components=2, random_state=1)

pos = mds.fit_transform(tfidf_matrix.toarray())  # shape (n_samples, n_components)

xs, ys = pos[:, 0], pos[:, 1]

# Store each edit's 2-D coordinates next to its other fields.
df['xs'] = xs
df['ys'] = ys

In [None]:
# Distinct user names that appear in the edit table.
userList = list(set(df['user']))
print (len(userList))

# One row per user, positioned at the mean of that user's edit coordinates.
# A single groupby aggregation replaces appending rows one at a time with
# .loc, which re-allocates the frame on every insert. Rows come out sorted by
# user name with a fresh 0..n-1 index, the same order and labels as before.
userNetworkDf = (
    df.groupby('user')[['xs', 'ys']]
      .mean()
      .rename(columns={'xs': 'posX', 'ys': 'posY'})
      .reset_index()
)



In [None]:
# Map each user name to its row label in userNetworkDf once, instead of
# filtering the frame for every match. setdefault keeps the first label for
# a name, the same row a "first matching row" lookup would return.
node_index = {}
for idx, user_name in zip(userNetworkDf.index, userNetworkDf['user']):
    node_index.setdefault(user_name, idx)

# Empty or whitespace-only names are dropped: '' is a substring of every
# comment and would otherwise link every edit to that "user".
candidate_names = [name for name in userList if name.strip()]

# An edge author -> mentioned user is added for every other user whose name
# occurs as a substring of the edit comment.
edge_records = []
for _, row in df.iterrows():
    author = row['user']
    comment = row['comment']
    for name in candidate_names:
        if name != author and name in comment:
            edge_records.append((node_index[author], node_index[name]))

# 'from' / 'to' hold row labels of userNetworkDf, not user names.
edges = pd.DataFrame(edge_records, columns=['from', 'to'])

     

In [None]:
# output_notebook, figure and show are used below but were never imported in
# this notebook, which raises NameError on a fresh kernel.
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import GraphRenderer, StaticLayoutProvider, Oval
from bokeh.models import Plot, Range1d, MultiLine, Circle, HoverTool, TapTool, BoxSelectTool
from bokeh.models.graphs import from_networkx, NodesAndLinkedEdges, EdgesAndLinkedNodes
output_notebook()

# One graph node per row of userNetworkDf, identified by its row position.
node_indices = list(range(0,len(userNetworkDf)))
x = list(userNetworkDf['posX'])
y=list(userNetworkDf['posY'])

# Hover tool without tooltips: hovering only triggers the hover glyphs.
hover = HoverTool(tooltips=None)
# Axis ranges are padded by 0.1 around the node coordinates.
plot = figure(title="Graph Layout Demonstration", tools="",x_range = (min(x)-0.1,max(x)+0.1),
              y_range=(min(y)-0.1,max(y)+0.1),
              toolbar_location=None)
plot.add_tools(hover, TapTool(), BoxSelectTool())

# Graph renderer holding both the node and the edge glyphs.
graph = GraphRenderer()

# Nodes: red ovals that turn into blue circles when selected or hovered.
# NOTE(review): Oval appears to be unavailable in recent Bokeh releases —
# confirm against the installed version before upgrading.
graph.node_renderer.glyph = Oval(height=0.03, width=0.06,fill_color='firebrick')
graph.node_renderer.selection_glyph = Circle(size=15, fill_color='blue')
graph.node_renderer.hover_glyph = Circle(size=15, fill_color='blue')
graph.node_renderer.tags = list(userNetworkDf['user'])

graph.node_renderer.data_source.data = dict(
    index=node_indices,
    user = list(userNetworkDf['user']))

# Edges: start/end are node indices taken from the `edges` frame; selected
# or hovered edges are drawn as thick orange lines.
graph.edge_renderer.data_source.data = dict(start=list(edges['from']),
                                            end=list(edges['to']))
graph.edge_renderer.selection_glyph = MultiLine(line_color='orange', line_width=5)
graph.edge_renderer.hover_glyph = MultiLine(line_color='orange', line_width=5)

# Layout: every node is pinned at its (posX, posY) coordinates.
graph_layout = dict(zip(node_indices, zip(x, y)))
graph.layout_provider = StaticLayoutProvider(graph_layout=graph_layout)

# Selecting a node also selects its edges; inspecting (hovering) an edge also
# highlights the nodes it connects.
graph.selection_policy = NodesAndLinkedEdges()
graph.inspection_policy = EdgesAndLinkedNodes()

plot.renderers.append(graph)

show(plot)