Importing libraries

In [None]:
import io
import re
import unicodedata
from collections import defaultdict

import pandas as pd

Upload the CSV manually

In [None]:
# Upload the CSV through Colab's upload helper; whatever it returns is kept in
# `uploaded` for later cells.
from google.colab import files
uploaded = files.upload()

Saving los_miserables.csv to los_miserables (1).csv


In [None]:
# Load the text that was just uploaded. Colab saves a re-uploaded file under a
# new name (e.g. 'los_miserables (1).csv'), so reading a fixed path can silently
# pick up a stale copy. Reading the uploaded bytes avoids that.
# NOTE(review): assumes `uploaded` maps file name -> file bytes, as returned by
# google.colab.files.upload() — confirm against the Colab version in use.
if uploaded:
    # Only the first uploaded file is used.
    df = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))
else:
    # Nothing was uploaded in this session: fall back to the file on disk.
    df = pd.read_csv('los_miserables.csv')

In [None]:
# Preview the TEXTO column as loaded from the CSV
df['TEXTO']

Unnamed: 0,TEXTO
0,PRIMERA PARTE
1,Fantine
2,
3,
4,
...,...
10807,
10808,
10809,
10810,305


Standardize text:

*   Convert to lowercase
*   Remove accents, punctuation, numbers and extra spaces

In [None]:
def standardize_text(text):
    """Reduce a text value to lowercase ASCII letters separated by single spaces.

    Args:
        text: The value to clean. Anything that is not a ``str`` is treated as
            having no text.

    Returns:
        The cleaned string: lowercased, with accents, punctuation, digits and
        every other non-letter character removed, whitespace runs collapsed to
        one space and both ends trimmed. Returns ``''`` for non-string input.
    """
    # Guard clause: non-string values carry no text.
    if not isinstance(text, str):
        return ''

    lowered = text.lower()
    # NFD splits an accented letter into base letter + combining mark; the
    # ASCII round-trip then discards the marks (and any other non-ASCII char).
    ascii_bytes = unicodedata.normalize('NFD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8')
    # Punctuation & numbers: delete everything that is not a letter or whitespace.
    letters_only = re.sub(r'[^a-záéíóúüñ\s]', '', ascii_text)
    # Extra spaces: collapse whitespace runs and trim.
    return re.sub(r'\s+', ' ', letters_only).strip()

# Standardize every value of the TEXTO column, then preview the result
df['TEXTO'] = df['TEXTO'].apply(standardize_text)
df['TEXTO']

Unnamed: 0,TEXTO
0,primera parte
1,fantine
2,
3,
4,
...,...
10807,
10808,
10809,
10810,


Building a Vocabulary

In [None]:
# Vocabulary data structures
class WordEntry:
    """A vocabulary word together with the number of times it has been seen."""

    def __init__(self, s):
        self.s = s  # word
        self.f = 1  # frequency; a new entry already counts its first occurrence


class Vocabulary:
    """Word-frequency table mapping each word to its WordEntry."""

    def __init__(self):
        # A plain dict is used on purpose: WordEntry needs the word as a
        # constructor argument, so it cannot serve as a zero-argument
        # defaultdict factory (a missing-key lookup would raise TypeError
        # instead of the expected KeyError).
        self.vocabulary = {}

    def add_word(self, word):
        """Record one occurrence of ``word``, creating its entry on first sight."""
        entry = self.vocabulary.get(word)
        if entry is None:
            self.vocabulary[word] = WordEntry(word)
        else:
            entry.f += 1


# Create the Vocabulary object
vocab = Vocabulary()



In [None]:
# Start from an empty vocabulary so that re-running this cell does not add the
# same words a second time and inflate every frequency.
vocab = Vocabulary()

# Process the text and add words to the vocabulary
for text in df['TEXTO']:
    for word in text.split():
        vocab.add_word(word)

# Save the vocabulary in a dataframe; the explicit column list keeps the
# schema stable even when no words were found.
vocabulary_list = [{'Word': word_instance.s, 'Frequency': word_instance.f} for word_instance in vocab.vocabulary.values()]
vocab_df = pd.DataFrame(vocabulary_list, columns=['Word', 'Frequency'])


Store the vocabulary in parquet format

In [None]:
from google.colab import files

# Write the vocabulary table to a parquet file, leaving out the row index
parquet_path = 'vocabulario.parquet'
vocab_df.to_parquet(parquet_path, index=False)

# Hand the parquet file to Colab's download helper
files.download(parquet_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Compute statistics:

*   How many words in original text
*   How many different words in vocabulary
*   Print the 100 most frequent words in vocabulary
*   Print the 100 least frequent words in vocabulary

In [None]:
# Use thr parquet file
vocab_df = pd.read_parquet('vocabulario.parquet')
vocab_df

Unnamed: 0,Word,Frequency
0,primera,55
1,parte,60
2,fantine,194
3,libro,26
4,primero,22
...,...,...
13093,gratuito,1
13094,cementerio,1
13095,encontrados,1
13096,sufrio,1


In [None]:
# Summary statistics.
# The word count is taken from df['TEXTO'] split on whitespace.
total_words_in_text = df['TEXTO'].str.split().apply(len).sum()
unique_words_in_vocabulary = vocab_df.shape[0]
top_100_most_frequent_words = vocab_df.sort_values(by='Frequency', ascending=False).head(100)
top_100_least_frequent_words = vocab_df.sort_values(by='Frequency', ascending=True).head(100)

# Print the results. to_string() renders every row; printing the DataFrame
# directly is truncated by pandas' display limits and shows only the first and
# last few of the 100 rows.
print(f"Words in original text: {total_words_in_text}")
print(f"Unique words in vocabulary: {unique_words_in_vocabulary}")
print(f"100 most frequent words in vocabulary:\n {top_100_most_frequent_words.to_string()}")
print(f"100 least frequent words in vocabulary:\n {top_100_least_frequent_words.to_string()}")



Words in original text: 109224
Unique words in vocabulary: 13098
100 most frequent words in vocabulary:
          Word  Frequency
16         de       5322
25         la       3917
38        que       3818
8          el       3394
21          y       3122
..        ...        ...
632       voz        111
137      alli        107
419      ojos        107
157       aun        105
12   monsenor        105

[100 rows x 2 columns]
100 least frequent words in vocabulary:
             Word  Frequency
13061   turbaria          1
13062   alboroto          1
13063  murmullos          1
32      interesa          1
13089   reservar          1
...          ...        ...
12978   acabemos          1
12931   escondio          1
12932   salvadme          1
12933      estad          1
12934  inflexion          1

[100 rows x 2 columns]
