In [1]:
#meta 4/5/2021 How to read a markup file
#NLP, LSHTC (large scale hierarchical text classification)
#src data http://lshtc.iit.demokritos.gr/ LSHTC3
#src code based on https://github.com/Arf4t/Lshtc3_fasttext/blob/master/LSHTC3.ipynb

#input: data/LSHTCv3_sample_wikipediaMediumOriginal-train.txt markup file
#output: output/LSHTCv3_sample.pkl - dataframe with documents and labels

#history
#4/5/2021 READ MARKUP FILE
#      OOP class read markup file

In [2]:
import time as time #to track performance time
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from joblib import load, dump

In [3]:
#global vars
#path of the markup file to parse; read by the reader cells further down
sample_file = 'data/LSHTCv3_sample_wikipediaMediumOriginal-train.txt'

## How to: Read a Markup File
Define a class that reads a markup file and parses out each document's number, labels, and text.

In [4]:
#define a class to read a markup file and extract its elements
class myTextReader:
    """Parse a markup file into parallel lists of document numbers, labels and texts.

    Attributes:
        file_path: path of the file that was parsed.
        docno: text of every <docno> element, in file order.
        labels: one list of int labels per <labels> element, in file order.
        data: for each <labels> element, the text node that directly follows it
            with leading/trailing newlines removed ('' when there is none).

    NOTE(review): docno is collected in a separate pass from labels/data, so the
    lists only line up if every document has exactly one <docno> and one
    <labels> element - confirm against the input format.
    """

    def __init__(self, file_path):
        """Read and parse file_path (UTF-8) immediately.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if a <labels> element contains a non-integer token.
        """
        self.file_path = file_path
        self.docno = []
        self.labels = []
        self.data = []

        # context manager so the file handle is closed as soon as parsing is done
        with open(self.file_path, 'r', encoding='utf8') as markup:
            soup = BeautifulSoup(markup, 'html.parser')

        for tag in soup.find_all('docno'):
            self.docno.append(tag.get_text())

        for tag in soup.find_all('labels'):
            self.labels.append([int(lab) for lab in tag.get_text().split()])
            following = tag.next_sibling
            # the document text is taken from the text node right after </labels>;
            # when that node is missing or is another tag, store an empty
            # placeholder so data keeps the same length as labels
            self.data.append(following.strip('\n') if isinstance(following, str) else '')

In [5]:
#track time (perf_counter is the clock meant for measuring short durations)
t0 = time.perf_counter()

#read file
sample = myTextReader(sample_file) #class __main__.myTextReader

print("Processing time (in minutes): ", (time.perf_counter() - t0)/60)

#display a couple of entities; kept as the cell's last expression so the notebook renders it
sample.docno[1:3], sample.labels[1:3], sample.data[1:3]

Processing time (in minutes):  4.996458689371745e-05


In [6]:
#explore text - last entry of each list (the last docno hints at the number of documents)
tuple(field[-1] for field in (sample.docno, sample.labels, sample.data))

('5',
 [14661, 71999, 292915, 188756, 131368],
 "Andre Kirk Agassi (born April 29, 1970) is an American former World No. 1 professional tennis player who won eight Grand Slam singles tournaments and an Olympic gold medal in singles. Generally considered by critics and fellow players to be one of the greatest tennis players of all time, he has been called the best service returner in the history of tennis. Known for his unorthodox apparel and attitude, Agassi is often cited as one of the most charismatic players in the history of the game, and is credited for helping revive the popularity of tennis during the 1990s. He is married to fellow retired professional tennis player and multiple Grand Slam champion Steffi Graf. Agassi is, with Rod Laver, Don Budge, Fred Perry, Roy Emerson, and Roger Federer, one of only six men to have achieved a Career Grand Slam, one of only three since the beginning of the Open Era, and the only male player to have achieved a Career Golden Slam. In addition t

In [7]:
#make a dataframe
#build it column-wise (instead of transposing a list of rows) so that lists of
#different lengths raise a ValueError rather than being silently padded with missing values
df = pd.DataFrame(
    {'labels': sample.labels, 'text': sample.data},
    index=pd.Index(sample.docno, name='docno'),
)
print(df.shape)
print(df.dtypes)
df.head()

(5, 2)
labels    object
text      object
dtype: object


Unnamed: 0_level_0,labels,text
docno,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[33692, 13402, 393382]",Alabama /\u02CC\u00E6l\u0259\u02C8b\u00E6m\u02...
2,[130762],Alain Connes (born 1 April 1947) is a French m...
3,"[352578, 395447, 27512, 157031]",Ayn Rand (born Alisa Zinov'yevna Rosenbaum; Fe...
4,"[390846, 395447, 276114]","Allan Dwan (April 3, 1885 – December 28, 1981)..."
5,"[14661, 71999, 292915, 188756, 131368]","Andre Kirk Agassi (born April 29, 1970) is an ..."


In [8]:
#persist the dataframe with joblib; the call's return value is shown as the cell output
dump(value=df, filename='output/LSHTCv3_sample.pkl')

['output/LSHTCv3_sample.pkl']

In [9]:
#NOTE(review): 'mystop' is not defined in the visible cells, so this line raises NameError;
#presumably a deliberate stop so that "Run All" does not continue into the cells below - confirm
mystop

NameError: name 'mystop' is not defined

## Xtra (extra scratch cells: earlier versions of the file reader)

In [None]:
#$acxtra manually, replicate Class - read an html file
tiny_docno=[]
tiny_labels=[]                      #parsing the required data about labels and documents
tiny_data=[]

#open inside a context manager so the file handle is closed once parsing is done
with open(sample_file,'r',encoding='utf8') as markup:
    soup=BeautifulSoup(markup,'html.parser')

for x in soup.find_all('docno'):
    tiny_docno.append(x.get_text())

for x in soup.find_all('labels'):
    tiny_labels.append([ int(lab) for lab in x.get_text().split()])
    following = x.next_sibling
    #the text is taken from the text node right after </labels>; when that node is
    #missing or is another tag, store '' so tiny_data stays as long as tiny_labels
    tiny_data.append(following.strip('\n') if isinstance(following, str) else '')

#results after reading file
tiny_docno[1:3], tiny_labels[1:3], tiny_data[1:3]

In [None]:
#$acxtra OOP class read markup file, take 1 - no param to specify file
class myText:
    """Parse the file named by the global sample_file into parallel lists.

    Attributes:
        docno: text of every <docno> element, in file order.
        labels: one list of int labels per <labels> element, in file order.
        data: for each <labels> element, the text node that directly follows it
            with leading/trailing newlines removed ('' when there is none).
    """

    def __init__(self):
        """Read and parse sample_file (UTF-8) immediately.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if a <labels> element contains a non-integer token.
        """
        self.docno = []
        self.labels = []
        self.data = []

        # context manager so the file handle is closed as soon as parsing is done
        with open(sample_file, 'r', encoding='utf8') as markup:
            soup = BeautifulSoup(markup, 'html.parser')

        for tag in soup.find_all('docno'):
            self.docno.append(tag.get_text())

        for tag in soup.find_all('labels'):
            self.labels.append([int(lab) for lab in tag.get_text().split()])
            following = tag.next_sibling
            # when the node after </labels> is missing or is another tag, store an
            # empty placeholder so data keeps the same length as labels
            self.data.append(following.strip('\n') if isinstance(following, str) else '')
            
#build the reader; the file is parsed inside the constructor
sample = myText() #class __main__.myText

#preview the second and third entries of each list
tuple(field[1:3] for field in (sample.docno, sample.labels, sample.data))