# Getting the English vocabulary of the EP full-text data for text analytics

## 1. Libraries

In [1]:
# librairies
import os
import re
import glob
import pandas as pd
from collections import Counter
# library to parse the xml content of the EP full text database
# library doc: https://docs.python.org/3/library/xml.etree.elementtree.html
import xml.etree.ElementTree as ET 

# disable warnings
import warnings
warnings.filterwarnings("ignore")

# language processing
import nltk
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/antoine/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 2. Config

In [2]:
# location the the files - EP full text data 2020 edition
fpattern = r'../data/ep_full_text_database/2020_edition/EP{}.txt'
files = glob.glob(fpattern.format('*','*'))

In [3]:
# config
sep = '\t'
text_type = 'CLAIM'
lang = 'en'

new_col_names = ['publication_authority', # will always have the value "EP"
                 'publication_number', # a seven-digit number
                 'publication_kind', # see https://www.epo.org/searching-for-patents/helpful-resources/first-time-here/definitions.html for help.
                 'publication_date', # in format YYYY-MM-DD
                 'language_text_component', # de, en, fr; xx means unknown
                 'text_type', # TITLE, ABSTR, DESCR, CLAIM, AMEND, ACSTM, SREPT, PDFEP
                 'text' # it contains, where appropriate, XML tags for better structure. You will find the DTD applicable to all parts of the publication at: http://docs.epoline.org/ebd/doc/ep-patent-document-v1-5.dtd
                   ]

## 3. Fonction to process text data

In [4]:
def get_claim_text(f):
    """Open the file f to get all the claims in English"""
    
    # reading the file 
    print('Reading the file: {}'.format(f))
    df = pd.read_csv(f, sep = sep)
    
    # changing the column names
    df.columns = new_col_names
    
    # filtering to keep only claims in English (and only once)
    condition1 = df['text_type'] == text_type
    condition2 = df['language_text_component'] == lang
    df.drop_duplicates(subset = ['text'], inplace = True)
    df = df[condition1 & condition2]['text'].to_frame()
    
    return df

In [5]:
def parsing(text_xml):
    """Process the xml to get the raw text"""
            
    # removing the tags for bold text
    text_xml_modified = text_xml.replace('<b>', '')
    text_xml_modified = text_xml_modified.replace('</b>', '')

    # modifying the claim to be processed as a real xml
    text_xml_modified = "<data>" + text_xml_modified + '</data>'
    # we parse it with the ElementTree XML APIÂ¶
    root = ET.fromstring(text_xml_modified)
    # and this is how we access the text of the claims
    claims = root.findall("./claim/claim-text")
    # we store the claims in a list
    claims_text = [claim.text for claim in claims]
    
    return claims_text

## 4. Load data

In [6]:
%%time
# test with a single file
df = get_claim_text(files[0])

Reading the file: ../data/ep_full_text_database/2020_edition/EP2400000.txt
CPU times: user 1min 13s, sys: 6.27 s, total: 1min 20s
Wall time: 1min 19s


In [7]:
def fetch_all_vocabulary(files):
    l = []
    for f in files:
        text = get_claim_text(f)
        text['text'].apply(parsing)
        l.append(df)
    return pd.concat(l)

In [8]:
%%time
df = fetch_all_vocabulary(files)

Reading the file: ../data/ep_full_text_database/2020_edition/EP2400000.txt
Reading the file: ../data/ep_full_text_database/2020_edition/EP2700000.txt
Reading the file: ../data/ep_full_text_database/2020_edition/EP3000000.txt
Reading the file: ../data/ep_full_text_database/2020_edition/EP1700000.txt
Reading the file: ../data/ep_full_text_database/2020_edition/EP0100000.txt
Reading the file: ../data/ep_full_text_database/2020_edition/EP2300000.txt
Reading the file: ../data/ep_full_text_database/2020_edition/EP1800000.txt
Reading the file: ../data/ep_full_text_database/2020_edition/EP1500000.txt
Reading the file: ../data/ep_full_text_database/2020_edition/EP2500000.txt
Reading the file: ../data/ep_full_text_database/2020_edition/EP3300000.txt
Reading the file: ../data/ep_full_text_database/2020_edition/EP0200000.txt
Reading the file: ../data/ep_full_text_database/2020_edition/EP1100000.txt
Reading the file: ../data/ep_full_text_database/2020_edition/EP2800000.txt
Reading the file: ../data

## 5. Text preprocessing

In [None]:
# reshape 
documents = df['text'].apply(parsing)
documents = documents.apply(lambda x:x[0])
documents.dropna(inplace=True)

# tokenize
tokenizer = RegexpTokenizer("(?u)\\b[\\w-]+\\b")
documents = documents.apply(tokenizer.tokenize)

In [None]:
def remove_capitalisation(x):
    liste = [y.lower() for y in x]
    return liste

def remove_stop_words(x):
    stopset = set(stopwords.words('english'))
    liste = [y for y in x if not y in stopset]
    return liste

def remove_numbers(x):
    liste = [y for y in x if not any(char.isdigit() for char in y)]
    return liste

In [None]:
documents = documents.apply(remove_capitalisation)
documents = documents.apply(remove_stop_words)
documents = documents.apply(remove_numbers)
documents = documents.apply(lambda x:' '.join(x))

In [None]:
# display the most frequent words in the dataset
Counter(" ".join(documents).split()).most_common(100)