In [1]:
from pymongo.mongo_client import MongoClient
from pymongo import TEXT
import pymongo
import string
from itertools import chain
import nltk
import re
import time
from Levenshtein import distance

## CONNECTION CHECK

In [2]:
uri = "mongodb+srv://duskmane:MQlA2Nm3dMxGhO3c@ads-cluster.tfi3k0w.mongodb.net/"
client = MongoClient(uri)

try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [3]:
client.list_database_names()

['ex_words_01', 'admin', 'local']

In [4]:
ex_words = client.ex_words_01.ex_words_01

In [5]:
ex_words.index_information()

{'_id_': {'v': 2, 'key': [('_id', 1)]},
 'word_1': {'v': 2, 'key': [('word', 1)]},
 'word_text': {'v': 2,
  'key': [('_fts', 'text'), ('_ftsx', 1)],
  'weights': SON([('word', 1)]),
  'default_language': 'english',
  'language_override': 'language',
  'textIndexVersion': 3}}

### Adding indexes

In [6]:
#ex_words.create_index('word')
#ex_words.create_index([('word', TEXT)])

### Adding multiwords inside the database since there is non at the beginning

In [7]:
# multi_words = ['future generation', 'greenhouse gas', 'economic landscape', 'environmental policy', 'Soil pollution', 'economic growth', 'sustainable lifestyle', 'environmental education', 'collective responsibility', 'critical thinking', 'printing press', 'scientific methods', 'European culture', 'ancient civilizations', 'sustainable energy', 'national parks', 'urban life', 'parliamentary system', 'positive change', 'international cooperation']
# for multi_word in multi_words:
#     ex_words.insert_one({"word": multi_word})

## DATA PREPROCESS

In [8]:
def get_words_from_file(file_name):
    with open(file_name) as f:
        lines = f.readlines()
    
    new_l = ''

    for line in lines:
        if line.strip():
            new_l += ' ' + (line.lower().strip())
            
    punc_removed = re.sub(r'[^\w\s-]','',new_l)
    tokenized_text = nltk.word_tokenize(punc_removed)
    return tokenized_text

In [9]:
def get_multiwords4(words, skip):
    hexagon_w = [f"{words[i]} {words[i+1]} {words[i+2]} {words[i+3]}" 
                 for i in range(0, len(words), (5 - skip)) if i < len(words) - (6 - skip)]
    
    return list(chain(hexagon_w))

In [10]:
def get_multiwords5(words, skip):
    hexagon_w = [f"{words[i]} {words[i+1]} {words[i+2]} {words[i+3]} {words[i+4]}" 
                 for i in range(0, len(words), (6 - skip)) if i < len(words) - (7 - skip)]
    
    return list(chain(hexagon_w))

In [11]:
def get_multiwords6(words, skip):
    hexagon_w = [f"{words[i]} {words[i+1]} {words[i+2]} {words[i+3]} {words[i+4]} {words[i+5]}" 
                 for i in range(0, len(words), (7 - skip)) if i < len(words) - (8 - skip)]
    
    return list(chain(hexagon_w))

In [12]:
def get_multiwords8(words, skip):
    hexagon_w = [f"{words[i]} {words[i+1]} {words[i+2]} {words[i+3]} {words[i+4]} {words[i+5]} {words[i+6]} {words[i+7]}" 
                 for i in range(0, len(words), (9 - skip)) if i < len(words) - (10 - skip)]
    
    return list(chain(hexagon_w))

In [13]:
def get_multiwords10(words, skip):
    hexagon_w = [f"{words[i]} {words[i+1]} {words[i+2]} {words[i+3]} {words[i+4]} {words[i+5]} {words[i+6]} {words[i+7]} {words[i+8]} {words[i+9]} " 
                 for i in range(0, len(words), (11 - skip)) if i < len(words) - (12 - skip)]
    
    return list(chain(hexagon_w))

In [14]:
file_name = 'environment_2k.txt'
max_multi_words = 3
words = get_words_from_file(file_name)
multiwords = get_multiwords5(words,max_multi_words)

In [15]:
print(f'Text length: {len(words)} words')
print(f'Unique words: {len(set(words))}')
print(f'Multiwords length: {len(multiwords)}')

Text length: 1999 words
Unique words: 794
Multiwords length: 665


## QUERY TESTING

**Using $in**

In [16]:
words_query = {"word": {"$in": words}}
results = list(ex_words.find(words_query))
results = [item['word'] for item in results]

In [17]:
len(results)

780

In [18]:
#results

In [19]:
words_not_found = []
for word in words:
    if word not in results:
        words_not_found.append(word)
        
words_not_found = list(set(words_not_found))

In [20]:
print(words_not_found)

['17th', 'multi-faceted', 'year-round', 'well-being', '19th', 'gameits', 'far-reaching', 'non-governmental', 'eco-friendly', 'human-induced', '1980', 'self-improvement', '14th', 'multi-billion-dollar']


In [21]:
len(words_not_found)

14

**Using $in for multiwords**

In [22]:
multiwords_query = {"word": {"$in": multiwords}}
results_mw = list(ex_words.find(multiwords_query))
results_mw = [item['word'] for item in results_mw]

In [23]:
len(results_mw)

0

In [24]:
results_mw

[]

## EXPLAIN

In [25]:
explain_result = ex_words.find({"word": {"$in": words}}).explain()
print(explain_result['executionStats']['executionTimeMillis'])
explain_result = ex_words.find(multiwords_query).explain()
print(explain_result['executionStats']['executionTimeMillis'])

5
1


## SINGLE WORDS COULD NOT BE FOUND

In [26]:
startt = time.time()

levenshtein_results = {}
levenshtein_threshold = 3

for word in words_not_found: 
    
    levenshtein_candidates = ex_words.find({"$text": {"$search": word}})

    levenshtein_candidates = [candidate["word"] for candidate in levenshtein_candidates]
    
    if any(char for char in word if char.isdigit() or char == '-'):
            
        clean_word = re.sub(r'[0-9-]', '', word)

        new_candidates = ex_words.find({"$text": {"$search": clean_word}})
        new_candidates = [candidate["word"] for candidate in new_candidates]

        all_candidates = list(set(levenshtein_candidates + new_candidates))
        
    else:
        all_candidates = levenshtein_candidates

    min_distance = float('inf')
    
    for candidate in all_candidates:
        
        current_distance = distance(word, candidate)
        
        if current_distance <= levenshtein_threshold and current_distance < min_distance:
            
            min_distance = current_distance
            levenshtein_results[word] = (candidate)

endt = time.time()

In [27]:
endt-startt

1.022698163986206

In [28]:
levenshtein_results

{'17th': 'th',
 'multi-faceted': 'multifaceted',
 'well-being': 'wellbeing',
 '19th': 'th',
 'non-governmental': 'nongovernmental',
 'eco-friendly': 'ecofriend',
 '14th': 'th'}

### MULTI WORD SEARCH

In [29]:
multiwords

['the environment is a precious',
 'a precious and fragile entity',
 'fragile entity that sustains life',
 'sustains life on earth as',
 'earth as the custodians of',
 'custodians of this planet it',
 'planet it is our responsibility',
 'our responsibility to protect and',
 'protect and preserve it for',
 'it for future generations the',
 'generations the alarming rate at',
 'rate at which human activities',
 'human activities are causing environmental',
 'causing environmental degradation has raised',
 'has raised concerns worldwide from',
 'worldwide from deforestation to pollution',
 'to pollution the signs of',
 'signs of environmental distress are',
 'distress are evident in this',
 'in this essay we will',
 'we will explore the importance',
 'the importance of protecting the',
 'protecting the environment the current',
 'the current threats it faces',
 'it faces and the actions',
 'the actions individuals and societies',
 'and societies can take to',
 'take to ensure a sustainabl

In [51]:
startt = time.time()

testing = 0
levenshtein_results_mw = {}
levenshtein_threshold = 2

min_distance = 9999

for word in multiwords: 

    candidates = []
    
    levenshtein_candidates = ex_words.find({"$text": {"$search": word}})

    listing = time.time()
    levenshtein_candidates = [candidate["word"] for candidate in levenshtein_candidates]
    testing += time.time()-listing
    
    if any(char for char in word if char.isdigit() or char == '-'):
            
        clean_word = re.sub(r'[0-9-]', '', word)

        new_candidates = ex_words.find({"$text": {"$search": clean_word}})
        new_candidates = [candidate["word"] for candidate in new_candidates]

        all_candidates = list(set(levenshtein_candidates + new_candidates))
        
    else:
        all_candidates = list(set(levenshtein_candidates))
    
    for candidate in all_candidates:
        
        if(len(candidate.split()) < 2):
            continue

        current_distance = distance(word, candidate)
        
        check_distance = len(candidate)+current_distance-len(word)
        
        if check_distance <= levenshtein_threshold:
            
            candidates.append(candidate)
        
    if candidates:
        levenshtein_results_mw[word] = candidates

endt = time.time()

In [52]:
levenshtein_results_mw

{'of environmental protection lies the': ['environmental policy'],
 'burning of fossil fuels releases': ['fossil fuels'],
 'fuels releases greenhouse gases such': ['greenhouse gas'],
 'environmental protection it entails meeting': ['environmental policy'],
 'environmental protection adopting sustainable lifestyle': ['environmental policy'],
 'to environmental protection overexploitation of': ['environmental policy'],
 'environmental degradation adopting circular economy': ['environmental policy'],
 'enforcing environmental policies and fostering': ['environmental policy'],
 'the sports popularity to drive': ['hard drive'],
 'football on the economic landscape': ['economic landscape'],
 'economic landscape is also noteworthy': ['economic landscape']}

In [53]:
endt-startt

31.8282368183136

In [54]:
testing

30.666704893112183

## Find the words inside the text

In [63]:
levenshtein_results_mw_list = list(levenshtein_results_mw.values())
levenshtein_results_mw_final = [sublist for sublist in levenshtein_results_mw]
levenshtein_results_list = list(levenshtein_results.values())
levenshtein_results_final = [sublist for sublist in levenshtein_results]
all_words_found = results + results_mw + levenshtein_results_final + levenshtein_results_mw_final
final_result = list(set(all_words_found))

In [70]:
print(final_result)

['instigating', 'observation', 'land', 'brings', 'well', 'about', 'significant', 'intricate', 'where', 'press', 'lead', 'peoples', 'enforcing', 'industrys', 'this', 'globally', 'forestry', 'centuries', 'thinking', 'focusing', 'preserve', 'passionate', 'range', 'fuels releases greenhouse gases such', 'additionally', 'responsibility', 'role', 'web', 'diversity', 'felt', 'should', 'enacting', 'history', 'adopting', 'beginning', 'south', 'into', 'but', 'knowledge', 'overexploitation', 'exploring', 'emissions', 'dissemination', 'job', 'new', 'wider', 'havens', 'science', 'informed', 'infrastructure', 'environmental protection it entails meeting', 'contracts', 'modern', 'responsible', 'collective', 'fragile', 'environmental degradation adopting circular economy', 'period', 'can', 'possible', 'protected', 'street', 'poverty', 'endangering', 'discovery', 'embracing', 'status', 'beautiful', 'it', 'contests', 'fervor', 'too', 'disposal', 'equitable', 'across', 'enduring', 'artists', 'innovation'

In [74]:
for word in final_result:
    word_nums = []
    for i in range(len(words)):
        if(word == words[i].lower()):
            word_nums.append(i)

    print(f'The word positions of the word "{word}" are {word_nums} ')

The word positions of the word "instigating" are [345] 
The word positions of the word "observation" are [1150] 
The word positions of the word "land" are [311] 
The word positions of the word "brings" are [1405] 
The word positions of the word "well" are [811] 
The word positions of the word "about" are [1025] 
The word positions of the word "significant" are [195, 243, 609, 1040, 1636] 
The word positions of the word "intricate" are [153] 
The word positions of the word "where" are [1422, 1582, 1735] 
The word positions of the word "press" are [1060, 1180, 1183] 
The word positions of the word "lead" are [626] 
The word positions of the word "peoples" are [1281] 
The word positions of the word "enforcing" are [732] 
The word positions of the word "industrys" are [1839] 
The word positions of the word "this" are [17, 58, 858, 962, 1045, 1158, 1236, 1272, 1378] 
The word positions of the word "globally" are [1348] 
The word positions of the word "forestry" are [493] 
The word positions