In [2]:
import re
import pandas as pd
import enchant
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt

In [None]:
def get_word_list(text):
    """Return the distinct non-stopword words of ``text`` in order of first appearance.

    The text is lower-cased and every character that is neither a word
    character nor whitespace is removed before it is split on whitespace.

    Args:
        text: Input string.

    Returns:
        list[str]: Unique lower-cased words that are not in NLTK's English
        stopword list, ordered by the position of their first occurrence.
    """
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    # Look the stopword list up once per call instead of once per word, and use
    # a set so each membership test is O(1).
    english_stopwords = set(stopwords.words('english'))
    # dict.fromkeys de-duplicates while keeping first-occurrence order. Sorting a
    # set by ``str.index`` would also match substrings of earlier words
    # ("cat" inside "scat") and leave ties to the set's arbitrary order.
    unique_words = dict.fromkeys(cleaned.split())
    return [word for word in unique_words if word not in english_stopwords]

In [None]:
def get_sentence_list(text):
    """Split ``text`` into sentences with a punctuation/capitalisation heuristic.

    A sentence boundary is a run of one or more spaces that follows ``.`` or
    ``?`` and precedes an upper-case ASCII letter, provided the character two
    positions before the punctuation mark is not an upper-case ASCII letter
    (presumably so that short abbreviations such as "Mr." do not end a sentence).

    Args:
        text: Input string.

    Returns:
        list[str]: The pieces of ``text`` between boundaries; the separating
        spaces are dropped.
    """
    boundary = re.compile(r'(?<=[^A-Z].[.?]) +(?=[A-Z])')
    return boundary.split(text)

In [None]:
def get_sentence_word_frequencies(data_series):
    """Count repeated sentences (by their word lists) and plot the 20 most frequent.

    Every text is split with ``get_sentence_list``; each sentence is reduced to
    its ``get_word_list`` result joined by single spaces, and that string is the
    key whose occurrences are counted. Sentences with no remaining words are
    ignored.

    Args:
        data_series: Object whose ``.items()`` yields ``(label, text)`` pairs,
            e.g. a pandas Series of strings.

    Returns:
        None. The function only draws a bar chart with matplotlib.
    """
    key_counts = {}
    for _, document in data_series.items():
        for sentence in get_sentence_list(document):
            tokens = get_word_list(sentence)
            if not tokens:
                continue
            key = ' '.join(tokens)
            key_counts[key] = key_counts.get(key, 0) + 1

    frequency_table = (
        pd.DataFrame.from_dict(key_counts, orient='index', columns=['frequency'])
        .rename_axis('sentence')
        .sort_values('frequency', ascending=False)
    )

    top_rows = frequency_table.head(20)
    top_rows.plot(kind='bar', legend=False)
    plt.ylabel('Frequency')
    plt.title('Top 20 sentence frequencies')
    plt.show()

In [None]:
def _spell_correct_words(text):
    """Replace each word unknown to the en_US dictionary with its first suggestion."""
    dictionary = enchant.Dict("en_US")

    def replace_word(match):
        word = match.group(0)
        if dictionary.check(word):
            return word
        suggestions = dictionary.suggest(word)
        return suggestions[0] if suggestions else word

    # Only alphabetic runs (optionally with inner apostrophes) are looked up, so
    # the surrounding punctuation and spacing stay exactly as they are.
    return re.sub(r"[A-Za-z]+(?:'[A-Za-z]+)*", replace_word, text)


def correct_punctuation(text, spell_check=False):
    """Normalise spacing around punctuation and quotes, and capitalise after periods.

    Steps, in order:
      1. Whitespace before ``? . ! , ; :`` is removed and one space is added after.
      2. Whitespace after an opening quote (``‘ “ „ «``) is removed.
      3. Whitespace before a closing quote (``’ ” »``) is removed.
      4. Every run of whitespace is collapsed to a single space.
      5. A lower-case ASCII letter that follows a period (optionally separated
         by whitespace) is upper-cased.
      6. Leading and trailing whitespace is stripped.

    Args:
        text: Input string.
        spell_check: When True, additionally replace words that the enchant
            ``en_US`` dictionary does not know with the first suggestion.
            Off by default, so the default result contains punctuation fixes
            only and does not need an installed dictionary.

    Returns:
        str: The normalised text.
    """
    text = re.sub(r'\s+([?.!,;:])', r'\1 ', text)
    text = re.sub(r'([‘“„«])\s+', r'\1', text)
    # “ is deliberately not treated as a closing quote: it is already handled as
    # an opening quote above, and stripping the whitespace on both of its sides
    # would glue it to the preceding word ('said “hi”' -> 'said“hi”').
    text = re.sub(r'\s+([’”»])', r'\1', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(
        r'\.(\s*)([a-z])',
        lambda match: f".{match.group(1)}{match.group(2).upper()}",
        text,
    )
    # Step 1 can leave a space after a final punctuation mark.
    text = text.strip()
    if spell_check:
        text = _spell_correct_words(text)
    return text

In [None]:
def plot_word_frequency(used_word_sets, json_path='test.json'):
    """Plot the 30 most frequent tokens of the texts that introduce unseen words.

    The texts are read from ``json_path`` as a pandas Series. A text is kept
    only if its ``get_word_list`` result contains at least one word that is not
    yet in ``used_word_sets``; the kept texts are tokenised with
    ``word_tokenize`` and their token counts are plotted.

    Args:
        used_word_sets: Set of words considered already seen. It is updated in
            place with the words of every kept text.
        json_path: Path of the JSON file to read. Defaults to ``'test.json'``.

    Returns:
        None. The function only draws a line chart with matplotlib.
    """
    texts = pd.read_json(json_path, typ='series')
    kept_texts = []
    for _, text in texts.items():
        word_set = set(get_word_list(text))
        if not word_set.issubset(used_word_sets):
            used_word_sets.update(word_set)
            kept_texts.append(text)

    tokens = [token for text in kept_texts for token in word_tokenize(text)]
    fdist = FreqDist(tokens)
    top_words = fdist.most_common(30)
    labels = [word for word, _ in top_words]
    counts = [count for _, count in top_words]

    # Draw on an explicit Axes instead of calling FreqDist.plot and labelling
    # afterwards: FreqDist.plot sets its own axis labels and, depending on the
    # NLTK version, may show the figure itself, in which case a title set
    # afterwards ends up on a new, empty figure.
    _, ax = plt.subplots(figsize=(15, 5))
    ax.plot(counts, linewidth=2)
    ax.grid(True, color='silver')
    ax.set_xticks(range(len(labels)))
    ax.set_xticklabels(labels, rotation=90)
    ax.set_title('Top 30 Most Common Words')
    ax.set_xlabel('Word')
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# De-duplicate test.json: a text is kept only when it contributes at least one
# word that no earlier kept text contained.
data_series = pd.read_json('test.json', typ='series')
used_word_sets = set()
removed_word_set = set()
new_data_series = {}
for i, text in data_series.items():
    # Store the punctuation-corrected text back, then work on the stored value.
    data_series[i] = correct_punctuation(data_series[i])
    word_set = set(get_word_list(data_series[i]))
    # Update the set of removed words.
    # NOTE(review): this adds every previously seen word that is absent from the
    # current text, whether or not the text is dropped below. That looks
    # unrelated to what was actually removed; confirm the intended definition.
    removed_word_set |= used_word_sets - word_set
    if word_set <= used_word_sets:
        # Nothing new in this text: skip it.
        continue
    used_word_sets |= word_set
    new_data_series[i] = data_series[i]

# From here on `data_series` holds only the kept texts.
data_series = pd.Series(new_data_series)
print(
    f"Số lượng bộ từ vựng bị xóa: {len(removed_word_set)}",
    "Bộ từ vựng bị xóa:",
    removed_word_set,
    sep="\n",
)

In [None]:
# Pass a fresh set rather than `used_word_sets`: that set was already filled from
# the same test.json by the de-duplication cell, so reusing it makes
# plot_word_frequency treat (nearly) every text as already seen and draw an
# (almost) empty chart. A fresh set also leaves `used_word_sets` unmodified.
plot_word_frequency(set())