In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from lxml import etree
from tqdm import tqdm

def stemmer_xml2df2(fname):
    """Read stemmer output XML into a DataFrame with one row per ``<word>``.

    Parameters
    ----------
    fname : str or file-like
        XML source handed straight to ``etree.iterparse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` (the element's ``value`` attribute) and
        ``proposed_root`` (the ``stem`` attribute of the element's last
        ``<analysis>`` child, or ``None`` when it has no such child).
        Both columns exist even when the file contains no ``<word>`` at all.

    Raises
    ------
    KeyError
        If a ``<word>`` has no ``value`` attribute or an ``<analysis>`` has
        no ``stem`` attribute.
    """
    result = []

    # Only 'end' events of <word> elements are reported, so the children of
    # each element are complete by the time it is handled here.
    context = etree.iterparse(fname, events=('end', ), tag='word')
    for _event, elem in context:
        stem = None
        # Iterate the element itself; getchildren() is deprecated in lxml.
        for child in elem:
            if child.tag == 'analysis':
                stem = child.attrib['stem']
        result.append({'word': elem.attrib['value'], 'proposed_root': stem})

        # make iteration over context fast and consume less memory
        # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    # Naming the columns keeps the frame's layout stable for empty input, so
    # callers can still select data['word'] / data['proposed_root'].
    return pd.DataFrame(result, columns=['word', 'proposed_root'])

def analyzer_xml2df2(fname):
    """Read analyzer output XML into a DataFrame with one row per ``<word>``.

    Words whose ``value`` attribute is the empty string are skipped.

    Parameters
    ----------
    fname : str or file-like
        XML source handed straight to ``etree.iterparse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` (the element's ``value`` attribute) and
        ``proposed_root``: the distinct ``root`` attributes of the word's
        ``<analysis>`` children, in order of first appearance, joined with a
        backslash; ``'NOANALYSIS'`` when no child supplies a root.
        Both columns exist even when no row is produced.

    Raises
    ------
    KeyError
        If a ``<word>`` has no ``value`` attribute.
    """
    result = []

    # Only 'end' events of <word> elements are reported, so the children of
    # each element are complete by the time it is handled here.
    context = etree.iterparse(fname, events=('end', ), tag='word')
    for _event, elem in context:
        word = elem.attrib['value']
        if word != '':
            # De-duplicate while keeping first-seen order. Going through a
            # set would make the joined order depend on string hashing and
            # therefore differ from run to run.
            roots = []
            # Iterate the element itself; getchildren() is deprecated in lxml.
            for child in elem:
                if child.tag != 'analysis':
                    continue
                # An <analysis> without a root attribute contributes nothing.
                root = child.attrib.get('root')
                if root is not None and root not in roots:
                    roots.append(root)
            if not roots:
                roots.append('NOANALYSIS')
            result.append({'word': word, 'proposed_root': '\\'.join(roots)})

        # make iteration over context fast and consume less memory
        # https://www.ibm.com/developerworks/xml/library/x-hiperfparse
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    # Naming the columns keeps the frame's layout stable for empty input, so
    # callers can still select data['word'] / data['proposed_root'].
    return pd.DataFrame(result, columns=['word', 'proposed_root'])

In [None]:
def read_file_analyzer(in_file):
    """Parse an analyzer XML file and return its ``word`` column as a list."""
    frame = analyzer_xml2df2(in_file)
    words = frame['word']
    return list(words)

def read_file_stemmer(in_file):
    """Parse a stemmer XML file and return its ``proposed_root`` column as a list."""
    frame = stemmer_xml2df2(in_file)
    stems = frame['proposed_root']
    return list(stems)

def corpus(in_files, analyzer=True):
    """Lazily yield one list of terms per input file.

    With ``analyzer`` truthy each file is read with ``read_file_analyzer``;
    otherwise with ``read_file_stemmer``. Files are read one at a time, as
    the generator is consumed.
    """
    for path in in_files:
        yield read_file_analyzer(path) if analyzer else read_file_stemmer(path)

In [None]:
# Build a term/document matrix for one book from Fiqh, with one document per chapter

import glob

# Collect the chapter files in sorted order so document indices are stable.
book_files = sorted(glob.glob('/home/jvdzwaan/data/tmp/adh/2018-10-23-Fiqh-stemmed-chapters/*.xml'))
print(len(book_files))
print(book_files[0])

In [None]:
%%time
c = corpus(book_files, analyzer=False)
data = [' '.join(list(terms)) for terms in c]

In [None]:
# Peek at the joined terms of the second document.
data[1]

In [None]:
%%time
from sklearn.feature_extraction.text import CountVectorizer

def split(string):
    """Split ``string`` on runs of whitespace and return the list of tokens."""
    tokens = string.split()
    return tokens

# Count term occurrences per document. The documents are already strings of
# whitespace-separated terms, so the custom `split` tokenizer is used and
# case is left untouched (lowercase=False).
count_vect = CountVectorizer(input='content', lowercase=False, tokenizer=split)
x = count_vect.fit_transform(data)

In [None]:
# Dimensions of the count matrix — presumably (documents, vocabulary terms).
x.shape

In [None]:
# Look up the feature index assigned to this term; .get returns None if the
# term is not in the fitted vocabulary.
count_vect.vocabulary_.get(u'غرب')

In [None]:
%%time
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts with tf-idf.
# NOTE(review): this rebinds `x`, so re-running this cell without first
# re-running the CountVectorizer cell feeds already-weighted values back
# into the transformer.
tfidf_transformer = TfidfTransformer()
x = tfidf_transformer.fit_transform(x)
print(x.shape)

In [None]:
%%time
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of x (one row per document).
result = cosine_similarity(x, x)

In [None]:
# Show the similarity matrix (numpy abbreviates large arrays when printing).
print(result)

In [None]:
%%time

# cluster chapters based on affinity propagation
from sklearn.cluster import AffinityPropagation

af = AffinityPropagation().fit(result)

In [None]:
from sklearn import metrics

# Pull the cluster-centre indices and the per-document labels off the fitted model.
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

# The number of cluster centres is taken as the number of clusters.
n_clusters_ = len(cluster_centers_indices)

print('Estimated number of clusters: %d' % n_clusters_)
# NOTE(review): the evaluation metrics below stay disabled; they reference
# `labels_true` and `X`, neither of which is defined in the cells visible here.
#print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
#print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
#print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
#print("Adjusted Rand Index: %0.3f"
#      % metrics.adjusted_rand_score(labels_true, labels))
#print("Adjusted Mutual Information: %0.3f"
#      % metrics.adjusted_mutual_info_score(labels_true, labels))
#print("Silhouette Coefficient: %0.3f"
#      % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

In [None]:
import matplotlib.pyplot as plt
from itertools import cycle

# Start from a clean figure so re-running the cell does not draw on top of
# an earlier plot.
plt.close('all')
plt.figure(1)
plt.clf()

# One colour per cluster; cycle() repeats the palette if there are more
# clusters than colour codes.
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k
    cluster_center = result[cluster_centers_indices[k]]
    # NOTE(review): the plotted coordinates are the first two columns of the
    # similarity matrix, i.e. similarity to documents 0 and 1 — confirm that
    # this projection is the intended one.
    plt.plot(result[class_members, 0], result[class_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
    # The loop variable must not be called `x`: the notebook-level `x` holds
    # the tf-idf matrix that later cells (find_similar, t-SNE) read, and
    # rebinding it here would leave it pointing at a single row of `result`.
    for member in result[class_members]:
        plt.plot([cluster_center[0], member[0]], [cluster_center[1], member[1]], col)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

In [None]:
import os

from sklearn.metrics.pairwise import linear_kernel

def find_similar(tfidf_matrix, index, top_n = 5):
    """Return the ``top_n`` rows most similar to row ``index``.

    Similarity is the ``linear_kernel`` (dot product) between row ``index``
    and every row of ``tfidf_matrix``; this equals cosine similarity only if
    the rows are L2-normalised — TODO confirm for the matrix passed in.

    Returns a list of ``(row_index, score)`` tuples ordered from highest to
    lowest score, with row ``index`` itself left out.
    """
    scores = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    ranked = []
    # argsort is ascending, so walk it backwards for best-first order.
    for doc in scores.argsort()[::-1]:
        if doc != index:
            ranked.append((doc, scores[doc]))
    return ranked[:top_n]

def find_similar_threshold(tfidf_matrix, index, threshold=0.5):
    """Return every row whose similarity to row ``index`` exceeds ``threshold``.

    Similarity is the ``linear_kernel`` (dot product) between row ``index``
    and every row of ``tfidf_matrix``; this equals cosine similarity only if
    the rows are L2-normalised — TODO confirm for the matrix passed in.

    Returns a list of ``(row_index, score)`` tuples ordered from highest to
    lowest score. Row ``index`` itself is left out, and the comparison with
    ``threshold`` is strict.
    """
    scores = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    # argsort is ascending, so reverse it for best-first order.
    ranked = scores.argsort()[::-1]
    return [(doc, scores[doc]) for doc in ranked
            if doc != index and scores[doc] > threshold]

# List the 20 documents most similar to document 1, by file name.
for idx, sim in find_similar(x, 1, top_n=20):
    print(sim, os.path.basename(book_files[idx]))

In [None]:
# Same query, but keep every document above the default threshold (0.5) and
# print the document index instead of the file name.
for idx, sim in find_similar_threshold(x, 1):
    print(sim, idx)

In [None]:
# Look up the file behind one document index.
# NOTE(review): 8001 is hard-coded — presumably taken from the output above; confirm.
book_files[8001]

In [None]:
%%time
from sklearn.metrics.pairwise import cosine_distances

from sklearn.manifold import TSNE

dist = cosine_distances(x, x)

X_embedded = TSNE(n_components=2, metric='precomputed').fit_transform(dist)
print(X_embedded.shape)

In [None]:
# First embedding coordinate of every document.
X_embedded[:,0]

In [None]:
# Scatter plot of the two embedding coordinates, one point per row of X_embedded.
plt.scatter(X_embedded[:,0], X_embedded[:,1])
plt.show()