In [1]:
import getpass
import operator
import os
import pickle
import string
import warnings

# Silence library warnings before the third-party imports below are executed.
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
from gensim.models import KeyedVectors, FastText
from sklearn.manifold import TSNE

# import natural language toolkit
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation #, strip_numbers
from nltk.corpus   import stopwords
from nltk.tokenize import word_tokenize

In [2]:
from modules.library.postgresql import PostgresQL
from modules.library.word_models import WordModels
from modules.library.document_similarity import DocumentSimilarity
from modules.library.document_models import DocumentModels

In [3]:
# When True, cells restore previously saved variables from pickle files and skip
# the guarded loading/training steps.
load_from_pickle = True
# When True, guarded cells write their results to pickle files.
# Note: the last cell of this section sets this flag to True right before saving.
dump_to_pickle = False

In [4]:
if load_from_pickle:
    # Restore the cached analysis state. The pickle stores a positional sequence,
    # so the order of the names below must match the order used when it was written.
    # NOTE: pickle.load can run arbitrary code - only open files you produced yourself.
    with open("document_analysis_variables.pkl", "rb") as f:
        e = pickle.load(f)
    (documents, sample_model, document_texts, document_model, embedding,
     u, s, vh, document_embedding, ds, list_of_neighbors) = e[:11]

# Accessing the database

Using the module PostgresQL we will load the documents from our database and store them in a list 'documents'.

In [5]:
# connect to the database
# The password is no longer stored in the notebook: it is read from the
# EURLEX_DB_PASSWORD environment variable, and the user is prompted for it
# when that variable is missing or empty.
pg = PostgresQL()
db_password = os.environ.get("EURLEX_DB_PASSWORD") or getpass.getpass("PostgreSQL password: ")
pg.connect(database="eurlex_environment_only", user="postgres", password=db_password)

In [6]:
if not load_from_pickle:
    # Pull every row (all columns) of the documents table into memory.
    select_all_documents = """
        SELECT * FROM documents;
    """
    documents = pg.execute(select_all_documents)

# Word embedding model

In this section we will show how to use the WordModels module to load or train a word embedding model. To avoid running time-consuming commands, we will comment them out and instead use pickle to store and access models that have already been loaded or trained.

## Training a word embedding model

In case we want to use pre-trained models to train another model on our data, we can use a method in module WordModels called 'train' in the following way:

In [7]:
# Toy corpus used to demonstrate training: six short, stripped strings.
texts = [
    "dogs like chasing cats",
    "cats like chasing mice",
    "mice eat cheese",
    "cheese has holes",
    "earth flat",
    "nobody knows working",
]

if not load_from_pickle:
    # texts is a list of stripped strings
    # NOTE(review): the vocabulary displayed for this model in the next cell consists
    # of single characters rather than words - check whether train() expects each
    # text to be tokenized into a list of words first.
    sample_model = WordModels()
    sample_model.train(
        texts,
        size=300,
        window=1,
        min_count=1,
        epochs=10,
    )

In [8]:
# Show the vocabulary keys of the sample model's embedding.
sample_model.get_embedding().vocab.keys()

dict_keys(['d', 'o', 'g', 's', ' ', 'l', 'i', 'k', 'e', 'c', 'h', 'a', 'n', 't', 'm', 'r', 'f', 'b', 'y', 'w'])

In [9]:
# Class of the wrapper object itself.
type(sample_model)

modules.library.word_models.WordModels

In [10]:
# Class of the embedding object exposed by the wrapper.
type(sample_model.get_embedding())

gensim.models.keyedvectors.FastTextKeyedVectors

## Loading a word embedding model

If a model has already been trained, either by the user or by another source, the WordModels module enables us to load it and use it.

In [11]:
if not load_from_pickle:
    # Read the word vectors stored in the wiki.en.align.vec file into a fresh WordModels wrapper.
    wiki_en_model = WordModels()
    wiki_en_path = '../data/fasttext/wiki.en.align.vec'
    wiki_en_model.load(wiki_en_path)

In [12]:
if load_from_pickle:
    # Restore the previously saved word model from its pickle file.
    # NOTE: pickle.load can run arbitrary code - only open files you produced yourself.
    with open("wwe_word_models.pkl", "rb") as model_file:
        wiki_en_model = pickle.load(model_file)

In [13]:
if dump_to_pickle:
    # Save the word model so later runs can restore it from the pickle file.
    with open("wwe_word_models.pkl", "wb") as model_file:
        # A negative protocol selects the highest one available, i.e. HIGHEST_PROTOCOL.
        pickle.dump(wiki_en_model, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# if not load_from_pickle:
#     wiki_bin_path ='../data/fasttext/wiki.en.bin' 
#     wiki_bin_model = WordModels()
#     wiki_bin_model.load(wiki_bin_path, model_type='fasttext')

In [15]:
# path_law = '../data/Law2Vec/Law2Vec.200d.txt'
# law_model = WordModels()

In [16]:
# law_model.load(path_law)

In [17]:
# with open("law_we.pkl", "wb") as f:
#     pickle.dump(law_model, f, protocol = -1)

In [18]:
# with open("law_we.pkl", "rb") as f:
#     law_model = pickle.load(f)

## Embedding documents

We have some documents saved in a list 'documents'. Let's embed them using the DocumentModels module.

In [19]:
if not load_from_pickle:
    # Keep only the non-empty texts. Positions in document_texts therefore differ
    # from positions in documents as soon as one document has an empty text.
    all_texts = (doc['document_text'] for doc in documents)
    document_texts = [text for text in all_texts if len(text) > 0]

In [32]:
# Tokens to ignore: the English stop word list plus every character of string.punctuation.
stop_words = [*stopwords.words('english'), *string.punctuation]
if not load_from_pickle:
    # Embed the documents using the word vectors of the loaded model, with TF-IDF weighting enabled.
    word_vectors = wiki_en_model.get_model().wv
    document_model = DocumentModels(word_vectors, document_texts, stop_words)
    document_model.embed_documents(use_tfidf_weighing=True)

In [33]:
if not load_from_pickle:
    # Keep the embedding produced by the document model under its own name.
    embedding = document_model.get_embedding()

In [22]:
# if not load_from_pickle:
#     u, s, vh = np.linalg.svd(embedding, full_matrices=True)

In [23]:
# first_singular_vector = vh[0,:]

In [24]:
# if not load_from_pickle:
#     for i in range(len(document_texts)):
#         embedding[i, :] = embedding[i, :] - first_singular_vector

## Document Similarity Analysis

In order to do analysis on a corpus of documents we will use module 'DocumentSimilarity'. Below are some examples of use:

In [34]:
if not load_from_pickle:
    # Fetch the document embedding and hand it to DocumentSimilarity for the queries below.
    document_embedding = document_model.get_embedding()
    ds = DocumentSimilarity(document_embedding)

In [35]:
# Compare the first two document vectors with the Euclidean measure.
first_vector, second_vector = document_embedding[0], document_embedding[1]
ds.euclid_similarity(first_vector, second_vector)

1.3100564

In [36]:
if not load_from_pickle:
    # Row of the document embedding used as the query.
    query_index = 43877
    query_vector = document_embedding[query_index]
    list_of_neighbors = ds.k_nearest_neighbors(query_vector)

In [37]:
# Show the neighbors found for the query document.
list_of_neighbors

[43877, 44089, 41353, 42629, 46425, 47842, 53465, 34301, 51042, 32548]

In [38]:
# Neighbor indices refer to rows of the document embedding, which is built from the
# documents with non-empty text only. Apply that same filter before looking up the
# titles; indexing the unfiltered `documents` list would show titles of the wrong
# documents as soon as one document with empty text precedes a neighbor.
# NOTE(review): assumes the embedding rows follow the order of the filtered texts -
# confirm against DocumentModels.
embedded_documents = [doc for doc in documents if len(doc['document_text']) > 0]
for i in list_of_neighbors:
    print(embedded_documents[i]['document_title'])
    print(" ")

Written Question No 945/88 by Mr Luc Beyer de Ryke to the Commission: Gas imports
 
Written Question No 1613/88 by Mr Bryan Cassidy to the Commission: Distillation into alcohol of surplus apples
 
WRITTEN QUESTION NO 2564/86 BY MR BARRY SEAL TO THE COMMISSION: GENERAL AVIATION LICENCES
 
WRITTEN QUESTION NO 1482/87 BY MR POL MARCK TO THE COMMISSION: PORTUGUESE CEREAL IMPORTS
 
WRITTEN QUESTION No. 2649/90 by Mr Mark KILLILEA to the Commission. Radon gas levels in the west of Ireland
 
WRITTEN QUESTION No. 1966/91 by Mr Wilfried TELKÄMPER to the Commission. Transposition into German law of the EIA directive
 
QUESTION No 80 (H-0467/94) by Petrus CORNELISSEN to the Commission. Transport of animals
 
WRITTEN QUESTION NO 219/81 BY MR PROVAN TO THE COMMISSION: TAXATION DIFFERENCES AFFECTING AGRICULTURE AND FORESTRY
 
WRITTEN QUESTION No. 1154/93 by Paul STAES to the Council. The ozone layer
 
QUESTION ECRITE NO 208/79 DE M. ANSQUER A LA COMMISSION: PRIX DES PRODUITS PETROLIERS
 


In [39]:
# NOTE(review): this override makes the cell save regardless of the dump_to_pickle
# value chosen in the configuration cell - confirm that this is intended.
dump_to_pickle = True
if dump_to_pickle:
    # u, s and vh are only assigned when the state was restored from the pickle (the
    # SVD computation is commented out), so fall back to None instead of raising
    # NameError on a run with load_from_pickle = False. Their slots are kept so the
    # positional layout expected when the file is read back stays the same.
    svd_u, svd_s, svd_vh = (globals().get(name) for name in ("u", "s", "vh"))
    # Build the payload before opening the file: opening with "wb" truncates the
    # existing cache, so any failure must happen before that point.
    analysis_state = [documents, sample_model, document_texts, document_model, embedding,
                      svd_u, svd_s, svd_vh, document_embedding, ds, list_of_neighbors]
    with open("document_analysis_variables.pkl", "wb") as f:
        pickle.dump(analysis_state, f, protocol=-1)