In [2]:
import  requests
import json

In [6]:
def json_load(res):
    """
    Return the decoded JSON body of an HTTP response.

    Parameters
    ----------
    res:
    A response object exposing a ``json()`` method

    Returns
    -------
    The Python object produced by ``res.json()``
    """
    # res.json() already yields plain Python containers; serialising it with
    # json.dumps() only to parse it back with json.loads() was redundant work.
    return res.json()

import os

# The Oxford Dictionaries API credentials are read from the environment so
# that secrets never live in the notebook. Export OXFORD_APP_ID and
# OXFORD_APP_KEY before starting the kernel (a missing one raises KeyError).
headers = {
    "Accept": "application/json",
    "app_id": os.environ["OXFORD_APP_ID"],
    "app_key": os.environ["OXFORD_APP_KEY"],
}
word_id = 'abusive'
url_entries = 'https://od-api.oxforddictionaries.com/api/v2/entries/en/' + word_id + '?strictMatch=true'
# Only the two credential headers are sent; the timeout keeps a stalled
# connection from hanging the kernel indefinitely.
res_entries = requests.get(
    url_entries,
    headers={'app_id': headers['app_id'], 'app_key': headers['app_key']},
    timeout=30,
)

senses_examples = json_load(res_entries)

In [14]:
# Text of the first example of the first sense, taken from the first entry of
# the first lexical entry of the first result in the response.
senses_examples['results'][0]['lexicalEntries'][0]['entries'][0]['senses'][0]['examples'][0]['text']

'the goalkeeper was sent off for using abusive language'

In [60]:

import os
import re
import nltk
import pandas as pd
from typing import Union

class Tokenizer():
    """
    This class tokenizes text given as a file, a single string or a pandas Series.

    Parameters
    ----------
    root_dir: str,
    The root directory of the folder containing the file with the text

    file_name: str,
    The name of the file containing the text data

    series: pd.Series or str,
    In-memory input. A string is treated as one single text; any other object
    must provide ``tolist()`` and is treated as a collection of documents.
    When given, it takes precedence over the file

    index: int,
    Stored as ``doc_index``; not used by the tokenizer itself

    stopwords: bool,
    Include stopword removal in the tokenization

    normalized: bool,
    Apply normalization on the tokens -> Lemmatization

    Raises
    ------
    ValueError
    If neither a usable ``series`` nor an existing file is provided
    """

    CURRENT_PATH = './'
    EMPTY_FILE = 'file.txt'

    def __init__(
        self,
        root_dir:str=CURRENT_PATH,
        file_name:str=EMPTY_FILE,
        series:Union[pd.Series, str, None]=None,
        index:Union[int, None]=None,
        stopwords:bool=False,
        normalized:bool=False
    ):

        self._sentence_pattern = re.compile(r"[.:,!;\n]\s", re.U)
        self.root_dir = root_dir
        self.filename = file_name
        self.stopwords = stopwords
        self.normalized = normalized
        self.tokens_filled = False
        self._tokens = []
        self._df = series
        self.doc_index = index
        self.full_text = None
        self.list_docs = []

        # Resolve the input first so that an invalid call fails before any
        # NLTK resource is looked up or downloaded.
        file_path = os.path.join(root_dir, file_name)
        if series is not None:
            if isinstance(series, str):
                self.list_docs = [series]
                self.full_text = series
            else:
                try:
                    self.list_docs = series.tolist()
                except Exception as exc:
                    raise ValueError(
                        'The given Series is not valid'
                    ) from exc
        elif os.path.exists(file_path):
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(file_path, encoding='utf-8') as f:
                self.full_text = f.read()
        else:
            raise ValueError(
                'Tokenizer must be initialized with either a pandas Series or a text file'
            )

        if self.stopwords:
            ## Stop words loading
            from nltk.corpus import stopwords
            try:
                nltk.data.find('corpora/stopwords')
            except LookupError:
                nltk.download('stopwords')

            self.stop_words = set(stopwords.words('english'))

        if self.normalized:
            ## Lemmatizer loading
            from nltk.stem import WordNetLemmatizer
            try:
                nltk.data.find('corpora/wordnet.zip')
            except LookupError:
                nltk.download('wordnet')

            self.lematizer = WordNetLemmatizer()

    @staticmethod
    def _preprocess_text(text:str) -> str:
        """
        Clean a raw text by applying a fixed sequence of regex substitutions.
        The order matters: each substitution runs on the output of the previous one.
        """
        substitutions = (
            (r'\s+', ' '),             # collapse every whitespace run into one space
            (r'<.*?>', ' '),           # html tags
            (r'\n+', ' '),             # new lines
            (r'@[\w\-]+', ''),         # @mentions
            (r'[^\w]', ' '),           # any non-word character (punctuation included)
            (r"\bو(.*?)\b", r'\1'),    # drop a word-initial 'و', keep the rest of the word
            (r'([0-9]+)', ''),         # digits
            (r'_[\w]+', ' '),          # an underscore and the word characters after it
        )

        parsed_text = text
        for pattern, replacement in substitutions:
            parsed_text = re.sub(pattern, replacement, parsed_text)

        return parsed_text

    def _tokenize_lines(self, text:str) -> list:
        """
        Preprocess a text and return its non-empty lines as lists of lowercased tokens.
        Does not touch the tokens stored on the instance.
        """
        processed_text = self._preprocess_text(text)
        return [
            line.lower().split()
            for line in re.split(self._sentence_pattern, processed_text)
            if line.strip() != ''
        ]

    def _split_into_tokens(self, text:str, flatten:bool):
        """
        Initiate the splitting process and updates the tokens list

        The tokens of `text` are added to the ones already stored. With
        `flatten`, nested token lists are merged into one flat list.
        """
        self._tokens.extend(self._tokenize_lines(text))

        # Flatten the tokens
        if flatten:
            flat_tokens = []
            for item in self._tokens:
                # Tokens that are already flat must be kept whole: iterating
                # over a string would split it into single characters.
                if isinstance(item, str):
                    flat_tokens.append(item)
                else:
                    flat_tokens.extend(item)
            self._tokens = flat_tokens

        return self._tokens

    def _stopwords(self, doc:list):
        # Remove stopwords:
        return [token for token in doc if token not in self.stop_words]

    def _normalize(self, doc:list):
        # Normalize the tokens with lemmatization: verb, then noun, then adjective
        lemmatize = self.lematizer.lemmatize
        return [
            lemmatize(lemmatize(lemmatize(token, pos='v'), pos='n'), pos='a')
            for token in doc
        ]

    def _apply_filters(self, tokens:list) -> list:
        # Apply each enabled cleaning step independently of the other one
        if self.stopwords:
            tokens = self._stopwords(tokens)
        if self.normalized:
            tokens = self._normalize(tokens)
        return tokens

    def get_tokens(self):
        """
        Returns the list of all tokens after applying stopword removal and normalization

        For a single text (string or file) the result is one flat list of
        tokens. For a collection of documents the result holds one token list
        per document, in the same order; a document without tokens yields an
        empty list so that positions stay aligned with the input.
        """
        # Start from scratch so that calling this method twice gives the same result
        self._tokens = []

        if self.full_text is None:
            # Single-character tokens are only dropped when both cleaning steps are enabled
            drop_short_tokens = self.stopwords and self.normalized
            for doc in self.list_docs:
                doc_tokens = [token for line in self._tokenize_lines(doc) for token in line]
                doc_tokens = self._apply_filters(doc_tokens)
                if drop_short_tokens:
                    doc_tokens = [token for token in doc_tokens if len(token) > 1]
                self._tokens.append(doc_tokens)
        else:
            self._split_into_tokens(self.full_text, flatten=True)
            self._tokens = self._apply_filters(self._tokens)

        self.tokens_filled = True
        return self._tokens

    def _check_tokens(self):
        if not self.tokens_filled:
            raise ValueError(
                'There is no tokens to process'
            )

    def most_common_tokens(self, num:int) -> list:
        """
        Returns the `num` most frequent tokens as (token, count) pairs.
        Raises ValueError when get_tokens has not been called yet.
        """
        self._check_tokens()

        from collections import Counter
        counter = Counter()
        for item in self._tokens:
            # Per-document token lists are not hashable, count their tokens instead
            if isinstance(item, str):
                counter[item] += 1
            else:
                counter.update(item)
        return counter.most_common(num)


In [63]:
# Tokenizer over one in-memory sentence, with stopword removal switched on.
tokenizer = Tokenizer(
    series='an intense feeling of deep affection',
    stopwords=True,
)

In [111]:
def json_load(res):
    """
    Return the decoded JSON body of an HTTP response.

    Parameters
    ----------
    res:
    A response object exposing a ``json()`` method

    Returns
    -------
    The Python object produced by ``res.json()``
    """
    # res.json() already yields plain Python containers; serialising it with
    # json.dumps() only to parse it back with json.loads() was redundant work.
    return res.json()

word_id = 'Washington'
url_entries = 'https://od-api.oxforddictionaries.com/api/v2/entries/en/' + word_id + '?strictMatch=true'
# Credentials come from the module-level `headers` dict. The timeout keeps a
# stalled connection from hanging the kernel indefinitely.
res_entries = requests.get(
    url_entries,
    headers={'app_id': headers['app_id'], 'app_key': headers['app_key']},
    timeout=30,
)

json_load(res_entries)

{'id': 'washington',
 'metadata': {'operation': 'retrieve',
  'provider': 'Oxford University Press',
  'schema': 'RetrieveEntry'},
 'results': [{'id': 'Washington',
   'language': 'en-gb',
   'lexicalEntries': [{'entries': [{'grammaticalFeatures': [{'id': 'proper',
         'text': 'Proper',
         'type': 'Properness'}],
       'pronunciations': [{'audioFile': 'https://audio.oxforddictionaries.com/en/mp3/washington_1_gb_1.mp3',
         'dialects': ['British English'],
         'phoneticNotation': 'IPA',
         'phoneticSpelling': 'ˈwɒʃɪŋt(ə)n'}],
       'senses': [{'definitions': ['a state of the north-western US, on the coast of the Pacific Ocean; population 6,549,224 (est. 2008); capital, Olympia. It became the 42nd state in 1889.'],
         'id': 'm_en_gbus1142220.006',
         'semanticClasses': [{'id': 'us_state', 'text': 'Us_State'}],
         'shortDefinitions': ['state of north-western US, on Pacific coast']},
        {'definitions': ['the capital of the US; population 

In [131]:
def json_load(res):
    """
    Return the decoded JSON body of an HTTP response.

    Parameters
    ----------
    res:
    A response object exposing a ``json()`` method

    Returns
    -------
    The Python object produced by ``res.json()``
    """
    # res.json() already yields plain Python containers; serialising it with
    # json.dumps() only to parse it back with json.loads() was redundant work.
    return res.json()

def prepare_api(word:str):
    """
    Query the Oxford Dictionaries API for the entries and the example sentences of a word.

    The credentials are read from the OXFORD_APP_ID and OXFORD_APP_KEY
    environment variables so that no secret is stored in the notebook.

    Parameters
    ----------
    word: str,
    The word to look up

    Returns
    -------
    tuple,
    ``(senses_examples, sentences_examples)``: the decoded JSON bodies of the
    ``entries`` and of the ``sentences`` endpoints

    Raises
    ------
    RuntimeError
    If one of the credential environment variables is missing or empty
    """
    import os
    from urllib.parse import quote

    app_id = os.environ.get('OXFORD_APP_ID')
    app_key = os.environ.get('OXFORD_APP_KEY')
    if not app_id or not app_key:
        # Deliberately not a ValueError: a configuration problem must not be
        # mistaken for "no data for this word" by callers that catch ValueError.
        raise RuntimeError(
            'Set the OXFORD_APP_ID and OXFORD_APP_KEY environment variables'
        )

    headers = {
        "Accept": "application/json",
        "app_id": app_id,
        "app_key": app_key
    }
    # Escape the word so that spaces or reserved characters cannot alter the URL
    word_id = quote(word, safe='')
    url_entries = 'https://od-api.oxforddictionaries.com/api/v2/entries/en/' + word_id + '?strictMatch=true'
    url_sentences = 'https://od-api.oxforddictionaries.com/api/v2/sentences/en/' + word_id + '?strictMatch=true'

    # The timeout keeps a stalled connection from blocking forever
    res_entries = requests.get(url_entries, headers=headers, timeout=30)
    res_sentences = requests.get(url_sentences, headers=headers, timeout=30)

    senses_examples = json_load(res_entries)
    sentences_examples = json_load(res_sentences)

    return senses_examples, sentences_examples

def run_words(word:str):
    """
    Collect every sense of a word together with its example sentences.

    The senses are read from the first entry of the first lexical entry
    returned by the ``entries`` endpoint. The examples of a sense are the
    sentences of the ``sentences`` endpoint whose first sense id matches,
    followed by the examples listed on the sense itself.

    Parameters
    ----------
    word: str,
    The word to look up

    Returns
    -------
    list,
    One dict per sense with the keys 'word', 'sense', 'definition' and 'examples'

    Raises
    ------
    ValueError
    If a response has no results, has an unexpected structure, or a sense
    lacks its id, definition or examples
    """
    senses_examples, sentences_examples = prepare_api(word)

    if 'results' not in senses_examples:
        raise ValueError(
            'No resutls for senses'
        )
    if 'results' not in sentences_examples:
        raise ValueError(
            'No resutls for senteces'
        )

    try:
        api_call_senses = senses_examples['results'][0]['lexicalEntries'][0]['entries'][0]['senses']
        sentences = sentences_examples['results'][0]['lexicalEntries'][0]['sentences']
    except (KeyError, IndexError) as exc:
        # Reported as ValueError so callers handle it like any other unusable word
        raise ValueError(
            'Unexpected response structure for the word: {}'.format(word)
        ) from exc

    # Group the sentence texts by their first sense id once, instead of
    # scanning the whole sentence list again for every sense.
    sentences_by_sense = {}
    for sentence in sentences:
        sense_ids = sentence.get('senseIds')
        if sense_ids and 'text' in sentence:
            sentences_by_sense.setdefault(sense_ids[0], []).append(sentence['text'])

    senses = []
    for sens in api_call_senses:
        try:
            examples_for_senses = [ex['text'] for ex in sens['examples']]
            # A fresh dict per sense: nothing can leak from the previous sense
            sense_with_examples = {
                'word': word,
                'sense': sens['id'],
                'definition': sens['definitions'][0],
                'examples': sentences_by_sense.get(sens['id'], []) + examples_for_senses,
            }
        except (KeyError, IndexError) as exc:
            raise ValueError(
                'No examples for the word: {}'.format(word)
            ) from exc

        senses.append(sense_with_examples)

    return senses

In [118]:
# Collect the senses returned by run_words for the word 'people'.
senses = run_words('people')

In [132]:
# Display the collected senses as the cell output.
senses

[{'sense': 'm_en_gbus0763900.006',
  'definition': 'human beings in general or considered collectively',
  'examples': ["He's a very strong personality, but he talks to people as human beings and he's very honest.",
   'The Home Office had to treat these people as decent human beings and provide extra resources.',
   'We may well decide that it was the most evil act ever perpetrated by human beings on fellow people.',
   "We don't have nearly the amount of litter because people in general take pride in their city.",
   'It is high among the reasons why people consult general practitioners and neurologists.',
   'You can count the number of people at most general openings on your fingers and toes.',
   'If so, was his stringent demand only for disciples, or was it intended for people in general?',
   'Neither do I have a problem in general with people who wish to follow religious beliefs.',
   'As I grew older, my imaginary friends took on the personas of real living people.',
   'In ge

In [133]:
## Read the target words (one per line) and collect the senses of each one:
with open('../News-data-project/target words/target.txt', encoding='utf-8') as f:
    full_text = f.read()

all_words = []
skipped_words = []  # words for which run_words raised ValueError
for word in full_text.splitlines():
    word = word.strip()
    if not word:
        # Blank lines (e.g. after a trailing newline) would otherwise cost an API call
        continue
    try:
        out_dict = run_words(word)
    except ValueError:
        # Keep track of what was dropped instead of discarding it silently
        skipped_words.append(word)
        continue
    all_words.append(out_dict)

all_words

[[{'word': 'state',
   'sense': 'm_en_gbus0989430.006',
   'definition': 'the particular condition that someone or something is in at a specific time',
   'examples': ['And final confirmation of my poor state of mind from lack of sleep came when Mark returned from going out.',
    'At times she is combative, at times submissive, according to the situation and her state of mind.',
    'A positive state of mind is also thought to be of great help in protecting against such problems.',
    "The cowboy is the archetypal American hero, and the western fits America's current state of mind.",
    'Hopefully, by moving to the north for a little while, my work will improve and so will my state of mind.',
    "His state of mind becomes even more troubled when a copy of Rebecca's childhood diary arrives anonymously in the post.",
    'He will under go a psychiatric examination to determine his state of mind at the time of the killings, he said.',
    'After that initial catharsis had passed she a

In [163]:
def look_for_poly(words=None, min_senses:int=5):
    """
    Yield the words that have at least `min_senses` senses.

    Parameters
    ----------
    words: iterable,
    One sized collection of senses per word. Defaults to the module-level `all_words`

    min_senses: int,
    Minimum number of senses a word needs in order to be yielded.
    The default of 5 keeps words with more than 4 senses
    """
    if words is None:
        words = all_words
    for word_senses in words:
        if len(word_senses) >= min_senses:
            yield word_senses

# Serialise the words kept by look_for_poly as indented JSON and write them out.
with open('../News-data-project/target words/senses_oxford_api.txt', 'w') as out_file:
    out_file.write(json.dumps(list(look_for_poly()), indent=4))

In [162]:
# Materialise the generator so that the selected words show up as the cell output.
list(look_for_poly())

[[{'word': 'state',
   'sense': 'm_en_gbus0989430.006',
   'definition': 'the particular condition that someone or something is in at a specific time',
   'examples': ['And final confirmation of my poor state of mind from lack of sleep came when Mark returned from going out.',
    'At times she is combative, at times submissive, according to the situation and her state of mind.',
    'A positive state of mind is also thought to be of great help in protecting against such problems.',
    "The cowboy is the archetypal American hero, and the western fits America's current state of mind.",
    'Hopefully, by moving to the north for a little while, my work will improve and so will my state of mind.',
    "His state of mind becomes even more troubled when a copy of Rebecca's childhood diary arrives anonymously in the post.",
    'He will under go a psychiatric examination to determine his state of mind at the time of the killings, he said.',
    'After that initial catharsis had passed she a

In [105]:
from nltk import pos_tag, word_tokenize

def search_word_tag(tagged_words:list, word_s:str):
    """
    Return the first tagged word whose token equals `word_s`.

    Parameters
    ----------
    tagged_words: list,
    Sequences whose first item is the token to compare, e.g. (token, tag) pairs

    word_s: str,
    The token to look for

    Returns
    -------
    The first matching element of `tagged_words`, or None when there is no match
    """
    # The default avoids leaking StopIteration to the caller when nothing matches
    return next((tagged for tagged in tagged_words if tagged[0] == word_s), None)

# Download the punkt models only when they are not available locally,
# the same way the Tokenizer class guards its NLTK resources.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Tokenize and POS-tag every example sentence of every sense, then print the tags.
for sens in senses:
    # A sense may come without an 'examples' key; treat that as no examples
    for ex in sens.get('examples', []):
        tokens = word_tokenize(ex)
        tags = pos_tag(tokens)
        print(tags)
        print('\n')


[('That', 'DT'), ('year', 'NN'), ('he', 'PRP'), ('moved', 'VBD'), ('to', 'TO'), ('London', 'NNP'), ('but', 'CC'), ('his', 'PRP$'), ('love', 'NN'), ('for', 'IN'), ('Wales', 'NNP'), ('was', 'VBD'), ('strong', 'JJ'), ('and', 'CC'), ('he', 'PRP'), ('eventually', 'RB'), ('settled', 'VBD'), ('permanently', 'RB'), ('there', 'RB'), ('.', '.')]


[('His', 'PRP$'), ('love', 'NN'), ('for', 'IN'), ('children', 'NNS'), ('and', 'CC'), ('affection', 'NN'), ('for', 'IN'), ('the', 'DT'), ('sick', 'NN'), ('have', 'VBP'), ('endeared', 'VBN'), ('him', 'PRP'), ('to', 'TO'), ('all', 'DT'), ('.', '.')]


[('My', 'PRP$'), ('brother', 'NN'), (',', ','), ('and', 'CC'), ('his', 'PRP$'), ('real', 'JJ'), (',', ','), ('strong', 'JJ'), ('love', 'NN'), ('for', 'IN'), ('me', 'PRP'), ('that', 'DT'), ('was', 'VBD'), ('able', 'JJ'), ('to', 'TO'), ('pull', 'VB'), ('me', 'PRP'), ('back', 'RB'), ('into', 'IN'), ('the', 'DT'), ('world', 'NN'), ('I', 'PRP'), ('know', 'VBP'), ('.', '.')]


[('Each', 'DT'), ('one', 'CD'), ('is'

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aymanehachcham/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
