# Stemming

- Stemming chops off word endings to reduce a word to its root form (the stem).

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, RegexpStemmer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Sample sentence reused by the stemming and lemmatization cells below.
text = "The runner was running quickly, and it made him realize that he could run even faster."

1. PorterStemmer:
  - A very basic, old, and fast stemmer.
2. SnowballStemmer (English stemmer):
  - Offers a slight improvement on the Porter stemmer's logic and speed.
3. LancasterStemmer:
  - Simple, but tends to over-stem, producing meaningless roots.
4. RegexpStemmer:
  - Uses regular expressions for stemming.

In [None]:
# One instance per stemmer, created in the same order as the list above.
ps = PorterStemmer()
ss = SnowballStemmer("english")
ls = LancasterStemmer()
# Removes a trailing suffix that matches the pattern; short words are trimmed
# too (the run below turns "he" into "h").
res = RegexpStemmer('ing$|s$|e$|able$|er$|ly$')

In [None]:
# Split the sample sentence into word and punctuation tokens with NLTK,
# then display the resulting list.
tokens = word_tokenize(text)
tokens

['The',
 'runner',
 'was',
 'running',
 'quickly',
 ',',
 'and',
 'it',
 'made',
 'him',
 'realize',
 'that',
 'he',
 'could',
 'run',
 'even',
 'faster',
 '.']

In [None]:
# Run every token through all four stemmers so their stems can be compared
# side by side. Labels are printed exactly as before.
stemmers_by_label = (
  ("porter-", ps),
  ("lancaster-", ls),
  ("snowball-", ss),
  ("regexp-", res),
)

for token in tokens:
  print(f'token: "{token}"')
  for label, stemmer in stemmers_by_label:
    print(label, stemmer.stem(token))
  print()

token: "The"
porter- the
lancaster- the
snowball- the
regexp- Th

token: "runner"
porter- runner
lancaster- run
snowball- runner
regexp- runn

token: "was"
porter- wa
lancaster- was
snowball- was
regexp- wa

token: "running"
porter- run
lancaster- run
snowball- run
regexp- runn

token: "quickly"
porter- quickli
lancaster- quick
snowball- quick
regexp- quick

token: ","
porter- ,
lancaster- ,
snowball- ,
regexp- ,

token: "and"
porter- and
lancaster- and
snowball- and
regexp- and

token: "it"
porter- it
lancaster- it
snowball- it
regexp- it

token: "made"
porter- made
lancaster- mad
snowball- made
regexp- mad

token: "him"
porter- him
lancaster- him
snowball- him
regexp- him

token: "realize"
porter- realiz
lancaster- real
snowball- realiz
regexp- realiz

token: "that"
porter- that
lancaster- that
snowball- that
regexp- that

token: "he"
porter- he
lancaster- he
snowball- he
regexp- h

token: "could"
porter- could
lancaster- could
snowball- could
regexp- could

token: "run"
porter- run


# Lemmatization

- Lemmatization is a more intelligent process that ensures the resulting form (the lemma) is a valid word.

In [None]:
import spacy

In [None]:
# Load the "en_core_web_sm" spaCy pipeline; `nlp` is applied to `text` in the next cell.
nlp = spacy.load("en_core_web_sm") # english model

In [None]:
# Run the spaCy pipeline over the sample sentence and list each token next to
# its lemma, one "token | lemma" pair per line.
tokens = nlp(text)

for token in tokens:
  print(token, token.lemma_, sep=' | ')

The | the
runner | runner
was | be
running | run
quickly | quickly
, | ,
and | and
it | it
made | make
him | he
realize | realize
that | that
he | he
could | could
run | run
even | even
faster | fast
. | .


# Doc to text conversion

In [None]:
!pip install docx2txt

Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py) ... [?25l[?25hdone
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3960 sha256=19b6719bade2cd7a702170818e9a0ff8c95b2a0c86ac35dc1a0de909bc33abb1
  Stored in directory: /root/.cache/pip/wheels/22/58/cf/093d0a6c3ecfdfc5f6ddd5524043b88e59a9a199cb02352966
Successfully built docx2txt
Installing collected packages: docx2txt
Successfully installed docx2txt-0.8


In [None]:
import docx2txt as d2t

In [None]:
# Mount Google Drive at /content/drive so the input files under MyDrive can be
# read (this cell only works inside Google Colab).
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Source .docx on the mounted Drive and the plain-text file it is saved to in
# the next cell.
doc_path = "/content/drive/MyDrive/SPIDER/docs/Transformer.docx"
text_path = "/content/Transformer.txt"

# Extract the document's text as a single string and display it.
doc = d2t.process(doc_path)
doc

'Hourglass Tokenizer for Efficient Transformer-Based 3D Human Pose Estimation\n\n\n\n\t\t\t\t\t\tWenhao Li1\tMengyuan Liu1,*\tHong Liu1,*\tPichao Wang2\tJialun Cai1\tNicu Sebe3 1National Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen Graduate School 2Amazon Prime Video\t3University of Trento\n\n\t\t\t{wenhaoli,hongliu}@pku.edu.cn\tnkliuyifang@gmail.com pichaowang@gmail.com\tcjl@stu.pku.edu.cn\tniculae.sebe@unitn.it\n\n\n\n\n\nAbstract\n\nTransformers have been successfully applied in the field of video-based 3D human pose estimation. However, the high computational costs of these video pose transformers (VPTs) make them impractical on resource-constrained de- vices. In this paper, we present a plug-and-play pruning-and- recovering framework, called Hourglass Tokenizer (HoT), for efficient transformer-based 3D human pose estimation from videos. Our HoT begins with pruning pose tokens of re- dundant frames and ends with recovering full-length tokens, resul

In [None]:
# Save the extracted text. The encoding is explicit so non-ASCII characters
# are written the same way regardless of the platform's locale default.
with open(text_path, "w", encoding="utf-8") as file:
  file.write(doc)

# Latex to text conversion

In [None]:
!pip install pypandoc

Collecting pypandoc
  Downloading pypandoc-1.12-py3-none-any.whl (20 kB)
Installing collected packages: pypandoc
Successfully installed pypandoc-1.12


In [None]:
import pypandoc

In [None]:
# LaTeX source on the mounted Drive and the plain-text output file.
# Note: this rebinds `text_path`, which previously pointed at Transformer.txt.
latex_path = "/content/drive/MyDrive/SPIDER/docs/acl.tex"
text_path = "/content/acl.txt"

In [None]:
def convert_latex_to_text(latex_code):
    """Convert a LaTeX source string to plain text with pypandoc.

    Args:
        latex_code: The LaTeX source to convert.

    Returns:
        The plain-text rendering, or None if the conversion raised an
        exception (the error message is printed in that case).
    """
    try:
        plain_text = pypandoc.convert_text(latex_code, 'plain', format='latex')
    except Exception as err:
        # Best-effort: report the failure and signal it to the caller with None.
        print(f"Error converting LaTeX to text: {err}")
        return None
    else:
        return plain_text

# Read the LaTeX source from disk (explicit UTF-8 instead of the locale default)
with open(latex_path, encoding="utf-8") as file:
  latex_code = file.read()

# Convert LaTeX to text
text_content = convert_latex_to_text(latex_code)

# convert_latex_to_text returns None on failure. Stop here, before the output
# file is opened, so an existing file is not truncated and write() never
# receives None.
if text_content is None:
  raise RuntimeError(f"Could not convert {latex_path} to plain text")

# Write the converted text to disk
with open(text_path, "w", encoding="utf-8") as file:
  file.write(text_content) # there are some warnings





In [None]:
# Display the plain text produced by the LaTeX conversion.
text_content

'Introduction\n\nTransformer models have received increased attention over the recent\nyears. Much progress was achieved by improvements to model\narchitectures, components, and algorithms such as from RNN to LSTM or\nGRU\xa0, and from seq2seq\xa0 to attention\xa0, and GLM\xa02.0\xa0 to name a few.\nProgress also resulted from vastly increasing parameters, such as GPT-2\xa0\nwith 1.5 billion, GPT-3\xa0 with 175 billion, and Google Switch\xa0 with 1.6\ntrillion parameters among others.\n\nHowever, the training of a transformer model from scratch requires\namounts of training data and computing power by far exceeding the scope\nof individual application development. Furthermore, while pre-trained\nmodels perform well when applying basic NLP tasks to common and broadly\ndefined domains, they tend not to meet the requirements of more complex\ntasks applied to less common and more narrowly defined domains.\n\nA key element supporting a wide variety of applications is the\nsimplicity with wh

# Word Frequency Estimation

- Here we explore ways to compute word frequencies in a text/document.
- The input is the `text_content` string produced by the LaTeX-to-text conversion above.

- The **Counter** class is part of the **collections** module in Python, and it provides a convenient way to count the occurrences of elements in a collection (e.g., a list or a string).
- The **string** module provides additional functions for string manipulation.

In [None]:
from collections import Counter
import string

In [None]:
# Show the converted LaTeX text again; it is the input to the word-frequency count below.
text_content

'Introduction\n\nTransformer models have received increased attention over the recent\nyears. Much progress was achieved by improvements to model\narchitectures, components, and algorithms such as from RNN to LSTM or\nGRU\xa0, and from seq2seq\xa0 to attention\xa0, and GLM\xa02.0\xa0 to name a few.\nProgress also resulted from vastly increasing parameters, such as GPT-2\xa0\nwith 1.5 billion, GPT-3\xa0 with 175 billion, and Google Switch\xa0 with 1.6\ntrillion parameters among others.\n\nHowever, the training of a transformer model from scratch requires\namounts of training data and computing power by far exceeding the scope\nof individual application development. Furthermore, while pre-trained\nmodels perform well when applying basic NLP tasks to common and broadly\ndefined domains, they tend not to meet the requirements of more complex\ntasks applied to less common and more narrowly defined domains.\n\nA key element supporting a wide variety of applications is the\nsimplicity with wh

In [None]:
def calc_word_freq(text):
    """Count how often each word occurs in ``text``.

    Every character in ``string.punctuation`` is deleted and the text is
    lower-cased *before* tokenizing, so punctuation inside a token disappears
    as well (e.g. "GPT-2" is counted as "gpt2" and "2.0" as "20").

    Args:
        text: The input string.

    Returns:
        A ``collections.Counter`` mapping each token to its number of
        occurrences.
    """
    # Translation table that maps every punctuation character to None (delete).
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    normalized = text.translate(drop_punctuation).lower()

    # Tokenize the cleaned text and tally the tokens in one step.
    return Counter(word_tokenize(normalized))

In [None]:
# Count word frequencies in the converted LaTeX text and display the resulting Counter.
calc_word_freq(text_content)

Counter({'introduction': 2,
         'transformer': 13,
         'models': 9,
         'have': 4,
         'received': 1,
         'increased': 1,
         'attention': 2,
         'over': 2,
         'the': 164,
         'recent': 1,
         'years': 1,
         'much': 1,
         'progress': 2,
         'was': 7,
         'achieved': 1,
         'by': 17,
         'improvements': 1,
         'to': 75,
         'model': 26,
         'architectures': 3,
         'components': 1,
         'and': 61,
         'algorithms': 1,
         'such': 9,
         'as': 25,
         'from': 19,
         'rnn': 1,
         'lstm': 1,
         'or': 7,
         'gru': 1,
         'seq2seq': 1,
         'glm': 1,
         '20': 1,
         'name': 1,
         'a': 60,
         'few': 1,
         'also': 5,
         'resulted': 1,
         'vastly': 1,
         'increasing': 2,
         'parameters': 2,
         'gpt2': 1,
         'with': 14,
         '15': 1,
         'billion': 2,
         'gpt3'