# **Requirements**

In [None]:
# Install packages
!pip install -q gensim
!pip install -q nltk
!pip install -q sklearn

# **Dataset imports**

In [None]:
from google.colab import files

# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

In [None]:
!ls "/content/drive/My Drive/Bibliscope/"

In [1]:
# Load bible
# coding: utf-8

import json

with open('.data/bible.json', 'rb') as f:
  bible = json.load(f)
  
list(bible.items())[0][1]['text_bj']

u'Donne \xe0 ton serviteur un c\u0153ur plein de jugement pour gouverner ton peuple, pour discerner entre le bien et le mal, car qui pourrait gouverner ton peuple, qui est si grand ? "'

# **Bible objects**

# **Pre-processing**

In [2]:
import nltk
nltk.download('perluniprops')
from nltk import word_tokenize as nltk_word_tokenize, sent_tokenize as nltk_sent_tokenize

from nltk.corpus import stopwords
nltk.download('stopwords')
spw = stopwords.words('french')
spw += ['', '"', "''", "``"]
print(spw)

from nltk.stem.snowball import FrenchStemmer
stemmer = FrenchStemmer()

import string
pct = list(string.punctuation)
print(pct)

[nltk_data] Downloading package perluniprops to
[nltk_data]     /Users/antoinerose/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/antoinerose/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[u'au', u'aux', u'avec', u'ce', u'ces', u'dans', u'de', u'des', u'du', u'elle', u'en', u'et', u'eux', u'il', u'je', u'la', u'le', u'leur', u'lui', u'ma', u'mais', u'me', u'm\xeame', u'mes', u'moi', u'mon', u'ne', u'nos', u'notre', u'nous', u'on', u'ou', u'par', u'pas', u'pour', u'qu', u'que', u'qui', u'sa', u'se', u'ses', u'son', u'sur', u'ta', u'te', u'tes', u'toi', u'ton', u'tu', u'un', u'une', u'vos', u'votre', u'vous', u'c', u'd', u'j', u'l', u'\xe0', u'm', u'n', u's', u't', u'y', u'\xe9t\xe9', u'\xe9t\xe9e', u'\xe9t\xe9es', u'\xe9t\xe9s', u'\xe9tant', u'\xe9tante', u'\xe9tants', u'\xe9tantes', u'suis', u'es', u'est', u'sommes', u'\xeates', u'sont', u'serai', u'seras', u'sera', u'seron

### Utils (tokenizers...)

In [3]:
import re

def word_tokenize(text, clean=True):
    tokens = nltk_word_tokenize(text, language='french')

    return list(tokens)


def sent_tokenize(text):
    sentences = nltk_sent_tokenize(text, language='french')

    # Complement tokenizer
    sentences_bis = []
    for sentence in sentences:
        new_split = re.split(r"[!.?]", sentence)
        new_split = filter(lambda e: len(e), new_split)
        sentences_bis += new_split

    return list(sentences_bis)


def clean_tokens(tokens, stem=True, remove_stopwords=True):
    # We remove d', l', qu'il etc. but not aujourd'hui
    text = "_".join(tokens)
    text = re.sub("\S{1,2}'", '_', text)
    tokens = text.split("_")

    # Stemmer
    if stem:
        for index, token in enumerate(tokens):
            if token and not token[0].isupper():
                tokens[index] = stemmer.stem(token)


    # We remove stopwords
    if remove_stopwords:
        tokens = list(filter(lambda e: e not in spw, tokens))

    # We remove punctuation
    tokens = list(filter(lambda e: e not in pct, tokens))

    # We lower tokens
    tokens = list(map(lambda e: e.lower(), tokens))

    # Final cleaning
    tokens = [token.replace(".", "") for token in tokens]

    return tokens

# Tests
for i in range(10):
    verse = list(bible.items())[i][1].get("text_bj")
    tokens = list(map(word_tokenize, sent_tokenize(verse)))
    tokens = list(map(clean_tokens, tokens))
    print(verse)
    print(tokens)

Donne à ton serviteur un cœur plein de jugement pour gouverner ton peuple, pour discerner entre le bien et le mal, car qui pourrait gouverner ton peuple, qui est si grand ? "
[[u'donne', u'serviteur', u'c\u0153ur', u'plein', u'jug', u'gouvern', u'peupl', u'discern', u'entre', u'bien', u'mal', u'car', u'pourr', u'gouvern', u'peupl', u'si', u'grand'], []]
Ton serviteur est au milieu du peuple que tu as élu, un peuple nombreux, si nombreux qu'on ne peut le compter ni le recenser.
[[u'ton', u'serviteur', u'milieu', u'peupl', u'\xe9lu', u'peupl', u'nombreux', u'si', u'nombreux', u'peut', u'compt', u'ni', u'recens']]
Salomon devint le gendre de Pharaon, le roi d'Égypte ; il prit pour femme la fille de Pharaon et l'introduisit dans la cité de David, en attendant d'avoir achevé de construire son palais, le Temple de Yahvé et le rempart de Jérusalem.
[[u'salomon', u'devint', u'gendr', u'pharaon', u'roi', u'\xe9gypte', u'prit', u'femm', u'fill', u'pharaon', u'introduis', u'dan', u'cit', u'david'

# **N-grams Approach - Stylistic statistical clustering**


### Histogram words per chapter

# **Word Embedding - Text Clustering**

### Creating Word Embedding Model

In [None]:
from textblob import TextBlob
import numpy as np
from gensim.models import Word2Vec, KeyedVectors


# Prepare sentences
print("[*] Preparing sentences...")
sentences = []
for ref, verse in bible.items():
  text = verse.get("text_bj")
  tokens = list(map(word_tokenize, sent_tokenize(text)))
  tokens = list(map(clean_tokens, tokens))
  sentences += tokens
print("[*] Sentences ready!")

vocab = {}
for sen in sentences:
    for word in sen:
        if not vocab.get(word, None):
            vocab[word] = 0
        vocab[word] += 1

mean_count = sum([f for w, f in vocab.items()]) / len(vocab)
std_count = np.std([f for w, f in vocab.items()]) / len(vocab)
print("[*] Word Mean Count is {}".format(mean_count))
print("[*] Word Count Standard Deviation is {}".format(std_count))

# Set values for various parameters
num_features = 500  # Word vector dimensionality
min_word_count = 10  # Minimum word count
print("[*] A word has to appear at least {} times to be counted".format(min_word_count))
num_workers = 4  # Number of threads to run in parallel
window = 10  # The maximum distance between the current and predicted word within a sentence
downsampling = 1e-3  # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
model = Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=window,
    sample=downsampling
)
model.train(
    sentences,
    total_examples=len(sentences),
    epochs=10)

print("[*] Saving model...")
model.save("word2vec_model")
print("[*] Model saved!")


In [None]:
if not model:
  model = Word2Vec.load("word2vec_model")

model.wv.most_similar("dieu")

### Clustering Vocab

In [None]:
# Clustering vocab
from sklearn.cluster import KMeans

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster
word_vectors = model.wv.syn0
num_clusters = 150  # word_vectors.shape[0] / 7
print("[*] Working on {} clusters...".format(num_clusters))

# Initalize a k-means object and use it to extract centroids
kmeans_clustering = KMeans(n_clusters=num_clusters)
idx = kmeans_clustering.fit_predict(word_vectors)

# Create a Word / Index dictionary, mapping each vocabulary word to
word_centroid_map = dict(zip(model.wv.index2word, idx))

clusters = []
for cluster in range(num_clusters):
    print("\nCluster %d" % cluster)
    # Find all of the words for that cluster number, and print them out
    words = []
    for i in range(len(word_centroid_map.values())):
        if list(word_centroid_map.values())[i] == cluster:
            words.append(list(word_centroid_map.keys())[i])
    clusters.append(words)
    print(words)

In [None]:
lenghts = map(len, sentences)
max_len = max(lenghts)
print(max_len)

### Clustering texts with word embedding approach

In [None]:
# Prepare verses matrix
words = list(model.wv.vocab.keys())

verse_len = 50
verse_vector_len = verse_len * num_features

vectors_list = []
refs_list = []
for index, (ref, verse) in enumerate(bible.items()):
  text = verse.get("text_bj")
  tokens = list(map(clean_tokens, word_tokenize(text)))

  vector_list = []
  for index, token in enumerate(tokens):
    if index < verse_len:
      try:
        vector = model.wv[token][0]
        vector_list.append(vector)
      except:
        pass
  
  if not vector_list:
        continue
  verse_vector = np.concatenate(vector_list)
  refs_list.append(ref)
  
  # Prepare vector with the right length filled with zeros at the end
  full_verse_vector = np.zeros(verse_vector_len)
  full_verse_vector[:verse_vector.shape[0]] = verse_vector
              
  vectors_list.append(full_verse_vector)


In [18]:
# Cluster with KMeans

from sklearn.cluster import KMeans


num_clusters = 150
print("[*] Working on {} clusters...".format(num_clusters))

# Initalize a k-means object
kmeans_clustering = KMeans(
    n_clusters=num_clusters, 
    n_jobs=-1,  # all processors working
)
model = kmeans_clustering.fit(vectors_list)

targets = model.labels_
for i in range(100):
    if targets[i] == 0:
        ref = refs_list[i]
        verse = bible[ref]["text_bj"]
        print(verse)


[*] Working on 150 clusters...


KeyboardInterrupt: 

### Clustering texts with TF-IDF

In [8]:
from math import log
import numpy as np

verse_len = 50
tokenized_verses = list()
refs_list = list()
corpus_dict = dict()

# We first tokenize all verses and build corpus from all tokens
print("[*] Tokenizing all verses...")
current = 0
for i, (ref, verse) in enumerate(bible.items()):
    if i // (len(bible) / 10) > current:
        current = i // (len(bible) / 10)
        print("{}%".format(current * 10))
    text = verse.get("text_bj")
    tokenized_verse = clean_tokens(word_tokenize(text))
    if not tokenized_verse or not len(tokenized_verse):
        continue
    
    tokenized_verses.append(tokenized_verse)
    refs_list.append(ref)
    
    for token in tokenized_verse:
        if not corpus_dict.get(token, None):
            corpus_dict[token] = 0
        corpus_dict[token] += 1

print("[*] Preparing corpus dict...")
corpus_index_dict = dict()
for index, (key, value) in enumerate(corpus_dict.items()):
    corpus_index_dict[key] = {
        "index": index,
        "inverted_freq": log(len(refs_list) / value),
    }

# We build matrix
print("[*] Building bible matrix...")
X = np.empty((0, len(corpus_dict)))
current = 0
for i, tokenized_verse in enumerate(tokenized_verses):
    if i // (len(tokenized_verses) / 100) > current:
        current = i // (len(tokenized_verses) / 100)
        print("{}%".format(current))
    vector = np.zeros(len(corpus_dict))
    for token in tokenized_verse:
        try:
            index = corpus_index_dict[token]["index"]
            inverted_freq = corpus_index_dict[token]["inverted_freq"]
            
            vector[index] += 1 * inverted_freq
        except:
             pass
    X = np.concatenate([X, [vector]])

print(X)

[*] Tokenizing all verses...
10%
20%
30%
40%
50%
60%
70%
80%
90%
[*] Preparing corpus dict...
[*] Building bible matrix...
1%
2%
3%
4%
5%
6%
7%
8%
9%
10%
11%
12%
13%
14%
15%
16%
17%
18%
19%
20%
21%
22%
23%
24%
25%
26%
27%
28%
29%
30%
31%
32%
33%
34%
35%
36%
37%
38%
39%
40%
41%
42%
43%


KeyboardInterrupt: 

In [10]:
import pickle

with open("./tfidf_bible.pkl", "w") as f:
    pickle.dump([X, refs_list], f)
print("[*] Bible TFIDF matrix saved")

[*] Bible TFIDF matrix saved


In [42]:
import pickle
with open("./tfidf_bible.pkl", "r") as f:
    [X, refs_list] = pickle.load(f)
    
nb_of_verses = 5000
X = X[:nb_of_verses]
refs_list = refs_list[:nb_of_verses]

In [None]:
# Cluster with KMeans
from sklearn.cluster import KMeans

num_clusters = len(refs_list) / 10
print("[*] Working on {} clusters...".format(num_clusters))

# Initalize a k-means object
kmeans_clustering = KMeans(
    n_clusters=num_clusters, 
    n_jobs=-1,  # all processors working
)
model = kmeans_clustering.fit(X)

targets = model.labels_
print("[*] Clustering done!")

[*] Working on 500 clusters...


In [11]:
import pickle

with open("./tfidf_bible_model.pkl", "w") as f:
    pickle.dump(model, f)
print("[*] Bible TFIDF model saved")

[*] Bible TFIDF model saved


In [41]:
# Group clusters 
clusters = {i: [] for i in range(num_clusters)}
for i in range(nb_of_verses):
    ref = refs_list[i]
    verse = bible[ref]["text_bj"]
    clusters[targets[i]].append([ref + " " + verse])

for index, cluster in clusters.items():
    #if len(cluster) < 3:
     #   continue
    print("*"*50)
    print("\nCluster {}, length: {}\n".format(index, len(cluster)))
    for verse in cluster:
        print("_"*100)
        print(u"{}".format(verse[0]))
        print("\n")

**************************************************

Cluster 0, length: 1

____________________________________________________________________________________________________
sir_39_33 "Les œuvres du Seigneur sont toutes bonnes, il donne sa faveur à qui en a besoin, à l'heure propice.


**************************************************

Cluster 1, length: 1

____________________________________________________________________________________________________
2ki_15_14 Menahem fils de Gadi monta de Tirça, entra à Samarie, y frappa à mort Shallum fils de Yabesh et devint roi à sa place.


**************************************************

Cluster 2, length: 1

____________________________________________________________________________________________________
job_35_4 Eh bien! moi, je te répondrai, et à tes amis en même temps.


**************************************************

Cluster 3, length: 1

______________________________________________________________________________________

psa_63_12 Et le roi se réjouira en Dieu; qui jure par lui en tirera louange quand les menteurs auront la bouche fermée.


____________________________________________________________________________________________________
gen_4_3 Le temps passa et il advint que Caïn présenta des produits du sol en offrande à Yahvé,


____________________________________________________________________________________________________
job_30_29 Je suis devenu le frère des chacals et le compagnon des autruches.


____________________________________________________________________________________________________
job_30_28 Je marche, assombri, sans soleil, si je me dresse dans l'assemblée, c'est pour crier.


____________________________________________________________________________________________________
job_30_27 Mes entrailles bouillonnent sans relâche, les jours d'affliction m'ont atteint.


____________________________________________________________________________________________________
mat_26_

rut_2_23 Et elle resta parmi les servantes de Booz pour glaner jusqu'à la fin de la moisson des orges et de la moisson des blés, et elle habitait avec sa belle-mère.


**************************************************

Cluster 134, length: 1

____________________________________________________________________________________________________
2ch_13_9 N'avez-vous pas expulsé les prêtres de Yahvé, fils d'Aaron, et les lévites, pour vous faire des prêtres comme s'en font les peuples des pays : quiconque vient avec un taureau et sept béliers pour se faire donner l'investiture, peut devenir prêtre de ce qui n'est point Dieu!


**************************************************

Cluster 135, length: 1

____________________________________________________________________________________________________
num_11_16 Yahvé dit à Moïse : " Rassemble-moi soixante-dix des anciens d'Israël, que tu sais être des anciens et des scribes du peuple. Tu les amèneras à la Tente du Rendez-vous, où ils se tie

# ***Neural Network - Author prediction***