### Hiva's functions

In [2]:
########################################################
#### Synonym and derived forms
########################################################
########################################################
# Synonym (copied 240619)
# required libraries:
import nltk 
from nltk.corpus import wordnet 
import requests

def get_synonyms(word):
    url = f"https://api.datamuse.com/words?rel_syn={word}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        synonyms = [entry['word'] for entry in data]
        return synonyms
    else:
        print("Error:", response.status_code)
        return []

########################################################
# related words (copied 240619)
def get_related_words(word, limit=60, topics=None):
    url = f"https://api.datamuse.com/words?ml={word}&max={limit}"
    
    if topics:
        url += f"&topics={topics}"
    
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        related_words = [entry['word'] for entry in data]
        return related_words
    else:
        print("Error:", response.status_code)
        return []


########################################################
# A function to check if a word is in the list or not (copied 240619)    
def check_word_in_list(word, word_list):
    if word in word_list:
        return f"'{word}' is in the list."
    else:
        return f"'{word}' is not in the list."


In [3]:
import nltk
from nltk.stem import WordNetLemmatizer
from lemminflect import getInflection, getAllInflections    
getAllInflections('hide', upos='VERB')

{'VBD': ('hid',),
 'VBN': ('hidden',),
 'VBG': ('hiding',),
 'VBZ': ('hides',),
 'VB': ('hide',),
 'VBP': ('hide',)}

In [4]:
import lemminflect

word = 'hide'

# Get inflections for 'hide' as a verb
inflections = lemminflect.getAllInflections(word, upos='VERB')

# Add the base form 'hide' to the list of inflections
#inflections.append(word)

print("All inflections of 'hide' including base form:")
print(inflections)
type(inflections)


All inflections of 'hide' including base form:
{'VBD': ('hid',), 'VBN': ('hidden',), 'VBG': ('hiding',), 'VBZ': ('hides',), 'VB': ('hide',), 'VBP': ('hide',)}


dict

In [5]:
forms = list(inflections.values())

for item in forms:
    print(item[0], type(item[0]))

hid <class 'str'>
hidden <class 'str'>
hiding <class 'str'>
hides <class 'str'>
hide <class 'str'>
hide <class 'str'>


In [6]:
#### lemminflect does not return anything when given a non-verb word
inflections = lemminflect.getAllInflections('teacher', upos='VERB')
print(inflections)

{}


#### Given a word, "form_list" returns a list of all forms of the verb. If not a verb, it returns a list only contains the word itself.

In [72]:
import lemminflect

def verb_forms(word):
    # Get all inflections for the word as a verb
    inflections = lemminflect.getAllInflections(word, upos='VERB')
    
    # Check if the word or any of its inflections are in the list of verb inflections
    forms = list(inflections.values())
    form_set = set()
    for item in forms:
        form_set.add(item[0])
    if form_set == {}:
        return [word]
    else:
        return list(form_set)

verb_forms('apple')

verb_forms('underreport')


[]

### From here, working on the text cleaning functions

In [79]:
### Let's make a new collection of cleaning functions:

misreport = ['misreport', 'misreports', 'misreporting', 'misreported']
underreport = ['underreport', 'underreports', 'underreporting', 'underreported']
unreport = ['unreport', 'unreports', 'unreporting', 'unreported']
pre_selected_word_list = [misreport, underreport, unreport]

import re
def collect_verb(text):
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    verbs = [word for word, tag in pos_tags if tag.startswith('V')]
    return verbs

def rep_word_text(text):
    global pre_selected_word_list
    
    new_text = text
    for word_form_list in pre_selected_word_list:
        if len(word_form_list) != 1:
            #print(word_form_list)
            new_text = clean_word(new_text,word_form_list)
        else:
            new_text = text
            
    verb_list = collect_verb(new_text)
    if verb_list == []:
        return new_text
    else:
        for verb in verb_list:
            verb_form_list = verb_forms(lemmatizer.lemmatize(verb.lower(), pos='v'))
            print(lemmatizer.lemmatize(verb, pos='v'), verb_form_list)
            if len(verb_form_list) != 1:
                new_text = clean_word(new_text, verb_form_list)
            else:
                new_text = text
        return new_text
    
text = "He walked quickly to the store and bought some groceries. Vessels caught misreporting catches"
print(rep_word_text(text))

walk ['walk', 'walked', 'walks', 'walking']
buy ['buys', 'buy', 'buying', 'bought']
catch ['catch', 'catches', 'catching', 'caught']
He walk quickly to the store and buys some groceries. Vessels catch misreport catch


### Testing with some of our contents

#### Importing the articles from the file

In [80]:
import pandas as pd

# read the excel file
excel_data = pd.read_excel('Classification_test/df_content.xlsx')

new_data = excel_data.copy()

texts = new_data.iloc[:3]['Content'].tolist()
print(texts[0])

 Wednesday, 05 Jun, 2024 Sea Shepherd Global stands at the forefront of the fight against Illegal, Unreported, and Unregulated (IUU) fishing, deploying innovative strategies and international collaborations to protect marine biodiversity.  Illegal, Unreported, and Unregulated (IUU) fishing refers to fishing activities that do not comply with national, regional, or international fisheries conservation and management laws and regulations. These activities are conducted by vessels in various ways, including: 
In 2015, the United Nations General Assembly highlighted the grave issue of IUU fishing, recognizing it as a major threat on multiple levels: Threat to Marine Wildlife and Ecosystems: IUU fishing severely impacts marine biodiversity. It decimates fish populations, disrupts food chains, and results in high bycatch rates, where non-target species like dolphins, sharks, and turtles are unintentionally caught and killed. Bycatch from IUU fishing can be extensive, with hundreds of thousan

In [81]:
rep_word_text(texts[0])

stand ['stands', 'stood', 'stand', 'standing']
deploy ['deployed', 'deploys', 'deploy', 'deploying']
protect ['protect', 'protected', 'protects', 'protecting']
fish ['fishes', 'fish', 'fishing', 'fished']
fish ['fishes', 'fish', 'fishing', 'fished']
do ['doing', 'does', 'do', 'done', 'did']
comply ['complies', 'complying', 'complied', 'comply']
be ['be', 'was', 'is', 'being', 'am', 'been']
conduct ['conduct', 'conducted', 'conducts', 'conducting']
include ['included', 'include', 'including', 'includes']
highlight ['highlights', 'highlighting', 'highlight', 'highlighted']
recognize ['recognize', 'recognized', 'recognizes', 'recognizing']
fish ['fishes', 'fish', 'fishing', 'fished']
impact ['impacting', 'impact', 'impacted', 'impacts']
decimate ['decimating', 'decimates', 'decimated', 'decimate']
be ['be', 'was', 'is', 'being', 'am', 'been']
catch ['catch', 'catches', 'catching', 'caught']
kill ['kill', 'killed', 'killing', 'kills']
Bycatch []
be ['be', 'was', 'is', 'being', 'am', 'been'

detect ['detecting', 'detects', 'detected', 'detect']
have ['have', 'having', 'had', 'has']
be ['be', 'was', 'is', 'being', 'am', 'been']
arrest ['arresting', 'arrested', 'arrest', 'arrests']
be ['be', 'was', 'is', 'being', 'am', 'been']
apprehend ['apprehended', 'apprehends', 'apprehending', 'apprehend']
assist ['assists', 'assisted', 'assist', 'assisting']
be ['be', 'was', 'is', 'being', 'am', 'been']
fish ['fishes', 'fish', 'fishing', 'fished']
reserve ['reserving', 'reserved', 'reserves', 'reserve']
be ['be', 'was', 'is', 'being', 'am', 'been']
outlaw ['outlaw', 'outlawing', 'outlaws', 'outlawed']
be ['be', 'was', 'is', 'being', 'am', 'been']
use ['used', 'uses', 'use', 'using']
fish ['fishes', 'fish', 'fishing', 'fished']
have ['have', 'having', 'had', 'has']
stop ['stopping', 'stops', 'stop', 'stopped']
transmit ['transmitted', 'transmitting', 'transmit', 'transmits']
be ['be', 'was', 'is', 'being', 'am', 'been']
find ['finds', 'find', 'finding', 'found']
fin ['finned', 'fin', 'f

' Wednesday, 05 Jun, 2024 Sea Shepherd Global stands at the forefront of the fight against Illegal, Unreported, and Unregulated (IUU) fishes, deployed innovative strategies and international collaborations to protect marine biodiversity.\xa0 Illegal, Unreported, and Unregulated (IUU) fishes refers to fishes activities that doing not complies with national, regional, or international fisheries conservation and management laws and regulations. These activities are conduct by vessels in various ways, included: \nIn 2015, the United Nations General Assembly highlights the grave issue of IUU fishes, recognize it as a major threat on multiple levels: Threat to Marine Wildlife and Ecosystems: IUU fishes severely impacting marine biodiversity. It decimating fishes populations, disrupted food chains, and results in high bycatch rates, where non-targeting species like dolphins, sharks, and turtles are unintentionally catch and kill. Bycatch from IUU fishes can be extensive, with hundreds of thou

In [66]:
lemmatizer.lemmatize('engaging', pos='v')

'engage'