# Text processing basics


## Sentence Segmentation

Sentence segmentation involves breaking down a text into individual sentences, typically separated by punctuation marks.


In [1]:
import nltk

# sent_tokenize relies on the pre-trained Punkt sentence models, which are not
# bundled with NLTK. Fetch them here so this cell also works on a fresh kernel.
# NOTE(review): recent NLTK releases may look up "punkt_tab" instead of "punkt"
# — confirm against the installed version.
nltk.download("punkt", quiet=True)

text = "This is a sample text. It contains multiple sentences. Can we segment it?"
sentences = nltk.sent_tokenize(text)

print(sentences)

['This is a sample text.', 'It contains multiple sentences.', 'Can we segment it?']


## Lowercasing

Lowercasing converts all text to lowercase, ensuring uniformity and simplifying text processing.


In [19]:
# Mixed-case sample; lowercasing returns a new string and leaves `text` intact.
text = "ThIs Is AN ExaMple Text."
lowercased_text = str.lower(text)

print(lowercased_text)

this is an example text.


## Stop Words Removal

Stop words are common words (e.g., "the," "and") that are often removed during text processing to focus on meaningful words.


In [3]:
from nltk.corpus import stopwords

# Make sure the stop-word corpus is available locally before reading it.
nltk.download("stopwords", quiet=True)

text = "This is an example sentence with some stop words."

# A set gives constant-time membership checks for the filter below.
stop_words = set(stopwords.words("english"))

# Whitespace-separated tokens are compared case-insensitively against the stop
# list; punctuation stays attached to its token (e.g. "words.").
filtered_words = list(
    filter(lambda token: token.lower() not in stop_words, text.split())
)

print(filtered_words)

['example', 'sentence', 'stop', 'words.']


## Lemmatization

Lemmatization reduces words to their base or dictionary form, considering the context and applying morphological analysis.


In [4]:
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer looks words up in the WordNet corpus, which has to be
# downloaded separately; without it the first lemmatize() call raises
# LookupError on a clean environment.
nltk.download("wordnet", quiet=True)

lemmatizer = WordNetLemmatizer()

words = ["rocks", "corpora", "cries"]
# No part-of-speech tag is passed, so the lemmatizer's default (noun) is used.
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

print(lemmatized_words)

['rock', 'corpus', 'cry']


## Stemming

Stemming reduces words to their stems or root form, often by removing suffixes, in a more heuristic approach.


In [5]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Apply the Porter stemmer to every sample word.
words = ["running", "rocks", "beautifully"]
stemmed_words = list(map(stemmer.stem, words))

print(stemmed_words)

['run', 'rock', 'beauti']


## Byte-Pair Encoding (BPE)

BPE is a data compression technique used in NLP for tokenization. It breaks down words into subword units.


In [6]:
!pip install tokenizers

In [7]:
from tokenizers.processors import TemplateProcessing

# The position of each entry in this list is used as its token id below.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

# Post-processor that wraps encoded sequences in [CLS] ... [SEP] markers;
# in a pair, the second sequence and its closing [SEP] get type id 1.
temp_proc = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        (marker, special_tokens.index(marker)) for marker in ("[CLS]", "[SEP]")
    ],
)

In [8]:
from tokenizers import Tokenizer
from tokenizers.normalizers import Sequence, Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.decoders import BPEDecoder

# Without unk_token the BPE model silently drops symbols it has never seen.
# "[UNK]" is already declared in `special_tokens` and registered by the trainer,
# so wire it up here to make unknown input visible in the encoded output.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Normalisation: Unicode NFD decomposition, lowercasing, then accent stripping.
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
# Split the input into word and punctuation chunks before BPE merges apply.
tokenizer.pre_tokenizer = Whitespace()
tokenizer.decoder = BPEDecoder()
# Adds the [CLS] / [SEP] markers defined in the template-processing cell.
tokenizer.post_processor = temp_proc

In [9]:
from tokenizers.trainers import BpeTrainer

In [10]:
import nltk
from nltk.corpus import gutenberg

# Quietly download the Gutenberg corpus and the Punkt sentence models.
nltk.download("gutenberg", quiet=True)
nltk.download("punkt", quiet=True)

# Train a 5000-entry BPE vocabulary on Macbeth, one training example per
# sentence (the corpus yields token lists, which are re-joined with spaces).
trainer = BpeTrainer(special_tokens=special_tokens, vocab_size=5000)
shakespeare = list(map(" ".join, gutenberg.sents("shakespeare-macbeth.txt")))
tokenizer.train_from_iterator(shakespeare, trainer=trainer)








In [11]:
# Encode a modern technical sentence and a Macbeth-style sentence, then print
# each token list on its own line.
print(
    tokenizer.encode(
        "BPE is a data compression technique used in NLP for tokenization."
    ).tokens,
    tokenizer.encode(
        "Is this a danger which I see before me, the handle toward my hand?"
    ).tokens,
    sep="\n",
)

['[CLS]', 'b', 'pe', 'is', 'a', 'd', 'at', 'a', 'com', 'pre', 'ss', 'ion', 'te', 'ch', 'ni', 'que', 'use', 'd', 'in', 'n', 'lp', 'for', 'to', 'ken', 'iz', 'ation', '.', '[SEP]']

['[CLS]', 'is', 'this', 'a', 'danger', 'which', 'i', 'see', 'before', 'me', ',', 'the', 'handle', 'toward', 'my', 'hand', '?', '[SEP]']


## Levenshtein edit distance

Edit distance measures how dissimilar two strings are by counting the minimum number of single-character operations (insertions, deletions, or substitutions) needed to transform one string into the other.

[Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance#Example)


In [12]:
!pip install python-Levenshtein

In [13]:
import Levenshtein

# Example pair of words to compare.
word1, word2 = "kitten", "sitting"
distance = Levenshtein.distance(word1, word2)
print(f"Edit distance between '{word1}' and '{word2}': {distance}")

Edit distance between 'kitten' and 'sitting': 3


# Task


The aim of this task is to count the 10 most frequent words in each of the plays listed in the `data.txt` file.


In [14]:
# Read the list of Gutenberg file ids, one per line.
with open("data.txt", encoding="utf-8") as f:
    data = f.read()
# splitlines() copes with any newline style and, unlike split("\n"), yields no
# trailing "" entry when the file ends with a newline. Blank lines are skipped
# so that every entry is a usable file id for gutenberg.raw().
plays = [line.strip() for line in data.splitlines() if line.strip()]
plays

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'shakespeare-macbeth.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-caesar.txt']

In [15]:
# Map each listed file id to its raw text and report the text length.
plays_dict = {}

for play in plays:
    raw_text = gutenberg.raw(play)
    plays_dict[play] = raw_text
    print(play, len(raw_text))

austen-emma.txt 887071

austen-persuasion.txt 466292

austen-sense.txt 673022

shakespeare-macbeth.txt 100351

shakespeare-hamlet.txt 162881

shakespeare-caesar.txt 112310


In [16]:
def top_frequent_words(text, topk=10):
    """Return the ``topk`` most frequent non-stop-word lemmas in ``text``.

    The text is lowercased, split into runs of word characters, lemmatized
    with ``WordNetLemmatizer`` and filtered against NLTK's English stop-word
    list before counting.

    Args:
        text: Raw text to analyse.
        topk: Number of ``(word, count)`` pairs to return.

    Returns:
        The result of ``nltk.FreqDist.most_common(topk)``: a list of
        ``(word, count)`` tuples, most frequent first.
    """
    text = text.lower()

    # Raw string: "\w" in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
    tokens = word_tokenizer.tokenize(text)

    # A set makes the per-token membership test constant-time instead of a
    # linear scan over the stop-word list.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # NOTE(review): lemmatizing before stop-word filtering may let a stop word
    # slip through when its lemma differs from the listed form (the default
    # noun lemmatizer is known to turn "was" into "wa") — confirm whether the
    # filter should run first. The order is kept as-is so the counts written to
    # answer.csv do not change.
    lemmas = (lemmatizer.lemmatize(token) for token in tokens)
    words = [lemma for lemma in lemmas if lemma not in stop_words]

    freqs = nltk.FreqDist(words)
    return freqs.most_common(topk)

In [17]:
# Top-10 word frequencies for every loaded text, keyed by file id.
top_words = {
    play: top_frequent_words(text) for play, text in plays_dict.items()
}

In [18]:
# Write one row per (play, rank): the id is "<play>_<rank>" and the value is
# the count of the word at that rank; the words themselves are not written.
with open("answer.csv", "w") as f:
    f.write("id,count\n")
    f.writelines(
        f"{play}_{i},{count[1]}\n"
        for play, counts in top_words.items()
        for i, count in enumerate(counts)
    )