# Similarity by BM25
**Suárez Pérez Juan Pablo**

## Text Processing 

In [1]:
# Import the libraries needed
from nlp_functions import text_processing as tp
from nlp_functions import word_association as wa
from nlp_functions import article_segmentation as arse
from nlp_functions import mining_topics as mt

In [2]:
# Clean the raw corpus located under ./../EXCELSIOR_100_files/.
# The second argument is presumably the output directory for the cleaned
# files (the next cell reads its input from ./new_corpus/) — TODO confirm.
# NOTE(review): the meaning of the trailing 1 is not visible in this
# notebook — confirm against nlp_functions.text_processing.clean_corpus.
tp.clean_corpus('./../EXCELSIOR_100_files/', './new_corpus/', 1)

In [3]:
# Load the cleaned corpus text.
# Per the original notes, the cleaning stage:
#   - deleted HTML tags
#   - lowercased the words
text = tp.get_clean_text('./new_corpus/clean_e960402_mod_corpus.txt')

In [4]:
# Sentence segmentation.
# Per the original notes, this step splits the text into sentences and
# deletes special characters.
sents = tp.sentence_tokenize(text)

In [5]:
# Delete stop words from the tokenized sentences, using the stop-word list
# stored in stopwords_es.txt.
clean_sents = tp.delete_stop_words_sents(sents, './nlp_functions/stopwords_and_lemmas/stopwords_es.txt')

In [6]:
# Lemmatize the stop-word-free sentences.
# generate.txt is presumably the lemma dictionary — TODO confirm.
lemmatize_sents = tp.lemmatize_sents(clean_sents, './nlp_functions/stopwords_and_lemmas/generate.txt')

In [7]:
# Build the vocabulary from the lemmatized sentences.
# The call apparently reports the vocabulary size itself (see the cell output).
vocabulary = tp.get_vocabulary_by_sents(lemmatize_sents)

Vocabulary size: 4770


## Mining Articles and Titles

In [8]:
# Extract the article titles from the raw corpus directory.
# NOTE(review): './titles/' looks like an output directory and the trailing 1
# is undocumented here — confirm against article_segmentation.get_titles.
titles = arse.get_titles('./../EXCELSIOR_100_files/', './titles/', 1)

In [9]:
# Extract the articles from the same raw corpus directory used for the titles.
# NOTE(review): the trailing 1 is undocumented here — confirm against
# article_segmentation.get_articles.
articles = arse.get_articles('./../EXCELSIOR_100_files/', 1)

### Normalize articles

In [10]:
# Run sentence tokenization on every article; results keep the order of
# `articles`, one entry per article.
tokenize_articles = [tp.sentence_tokenize(raw_article) for raw_article in articles]

In [11]:
# Delete stop words from every tokenized article; results keep the order of
# `tokenize_articles`, one entry per article.
clean_articles = [
    tp.delete_stop_words_sents(article_sents, './nlp_functions/stopwords_and_lemmas/stopwords_es.txt')
    for article_sents in tokenize_articles
]

In [12]:
# Lemmatize every stop-word-free article; results keep the order of
# `clean_articles`, one entry per article.
lemmatize_articles = [
    tp.lemmatize_sents(article_sents, './nlp_functions/stopwords_and_lemmas/generate.txt')
    for article_sents in clean_articles
]

## Mining Topics

In [13]:
# Most frequent words computed from the lemmatized sentences and vocabulary.
# NOTE(review): the "_prob" suffix suggests each word comes with a
# probability — confirm against mining_topics.get_most_freq_words_prob.
most_freq_words_prob = mt.get_most_freq_words_prob(lemmatize_sents, vocabulary)

In [14]:
# Most frequent words computed from the same lemmatized sentences and
# vocabulary (the variant without the "_prob" suffix).
most_freq_words = mt.get_most_freq_words(lemmatize_sents, vocabulary) 

In [15]:
# Build the contexts for the vocabulary words from the lemmatized sentences,
# turn those contexts into vectors, and derive IDF values from the vectors.
# NOTE(review): the name bm25_vectors suggests BM25 weighting is applied
# inside wa.get_vectors — confirm against nlp_functions.word_association.
context = wa.get_contexts_sents(vocabulary, lemmatize_sents)
bm25_vectors = wa.get_vectors(vocabulary, context)
idf = wa.get_idf(bm25_vectors)

In [16]:
# Combine the IDF values with the most-frequent-word probabilities over the
# vocabulary; presumably yields a TF-IDF ranking of the words — TODO confirm.
most_freq_words_tf_idf = mt.get_most_freq_words_tf_idf(idf, most_freq_words_prob, vocabulary)

## Distribution of Topics in Documents

In [17]:
# Topic words whose distribution is measured in the articles.
topics = ['méxico', 'gobierno', 'internet', 'política', 'dólar']
# Distribution of the topics computed from the titles and lemmatized articles.
# The result is consumed below as a mapping: title -> {topic: value}.
dist_topics = mt.get_distribution(titles, lemmatize_articles, topics)

In [18]:
# Print every title followed by its per-topic distribution, one topic per
# tab-indented line.
for title, topic_dist in dist_topics.items():
    # Titles may end with a line break. The earlier check compared the
    # two-character slice title[-2:] with the one-character string '\n', which
    # never matches a real title, so the line break was printed and a blank
    # line followed every title. rstrip removes only trailing line-break
    # characters and leaves titles without one unchanged.
    print(title.rstrip('\r\n'))
    for topic, value in topic_dist.items():
        print('\t', topic, ':', value)

semana doliente

	 méxico : 0.5
	 gobierno : 0.0
	 internet : 0.0
	 política : 0.5
	 dólar : 0.0
desde la medicina

	 méxico : 0.5
	 gobierno : 0.0
	 internet : 0.0
	 política : 0.0
	 dólar : 0.5
la nueva realidad

	 méxico : 0.6666666666666666
	 gobierno : 0.3333333333333333
	 internet : 0.0
	 política : 0.0
	 dólar : 0.0
hace 25 años

	 méxico : 0.0
	 gobierno : 1.0
	 internet : 0.0
	 política : 0.0
	 dólar : 0.0
hipótesis y dudas

	 méxico : 0.5
	 gobierno : 0.0
	 internet : 0.0
	 política : 0.5
	 dólar : 0.0
un lenguaje de papel

	 méxico : 0
	 gobierno : 0
	 internet : 0
	 política : 0
	 dólar : 0
hace 50 años

	 méxico : 0.0
	 gobierno : 0.0
	 internet : 0.0
	 política : 0.0
	 dólar : 1.0
¿amos más tolerantes?

	 méxico : 0.0
	 gobierno : 0.8333333333333334
	 internet : 0.0
	 política : 0.0
	 dólar : 0.16666666666666666
equilibrio doctrina-acción

	 méxico : 0.0
	 gobierno : 1.0
	 internet : 0.0
	 política : 0.0
	 dólar : 0.0
estamos mal, pero pagamos

	 méxico : 1.0
	 gobierno :