In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
import pandas as pd


In [18]:
# Fetch the NLTK resources used below, in the same order as before.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# English stop words, held in a set so membership tests are cheap.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

def remove_non_latin_characters(token):
    """Return ``token`` with everything except ASCII letters and digits removed.

    The pattern is ASCII-only, so accented Latin letters (e.g. ``é``),
    punctuation, whitespace and underscores are stripped as well. The result
    may be an empty string.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9]')
    stripped = disallowed.sub('', token)
    return stripped

def preprocess_texts(texts, min_doc_freq=5, min_text_length=2):
    """Clean, tokenize and filter a collection of raw texts.

    Each text is lowercased, tokenized with ``word_tokenize``, stripped of
    characters outside ``[a-zA-Z0-9]`` and of English stop words. Words that
    appear in fewer than ``min_doc_freq`` texts are then dropped, and texts
    left with fewer than ``min_text_length`` tokens are discarded.

    Args:
        texts: Iterable of raw texts. Items that are not strings are
            converted with ``str``; ``None`` and float NaN are treated as
            empty texts.
        min_doc_freq: Minimum number of texts a word must occur in to be
            kept (default 5).
        min_text_length: Minimum number of tokens a text must retain to be
            kept (default 2).

    Returns:
        A list of token lists. Texts that end up too short are removed, so
        the result can be shorter than ``texts`` and positions do not line
        up with the input.
    """
    cleaned_texts = []
    for raw_text in texts:
        # Missing values (None / float NaN, e.g. empty cells in a pandas
        # column) would otherwise be stringified into the bogus tokens
        # "none" / "nan" and distort the frequency counts.
        if raw_text is None or (isinstance(raw_text, float) and raw_text != raw_text):
            cleaned_texts.append([])
            continue

        tokens = word_tokenize(str(raw_text).lower())

        # Drop stop words, strip disallowed characters, then check stop words
        # again: a token such as "'s" only becomes the stop word "s" once its
        # punctuation has been removed. Empty leftovers are discarded too.
        stripped_tokens = (
            remove_non_latin_characters(token)
            for token in tokens
            if token not in stop_words
        )
        cleaned_texts.append(
            [token for token in stripped_tokens if token and token not in stop_words]
        )

    # True document frequency: each word counts at most once per text, so a
    # word repeated many times inside a single text cannot pass the threshold
    # on its own.
    doc_freq = Counter(word for text in cleaned_texts for word in set(text))

    # Remove words that occur in too few texts.
    frequent_only = [
        [word for word in text if doc_freq[word] >= min_doc_freq]
        for text in cleaned_texts
    ]

    # Filter out texts that are too short after cleaning.
    final_texts = [text for text in frequent_only if len(text) >= min_text_length]

    return final_texts

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/avelynwong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/avelynwong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
def save_texts_to_file(processed_texts, filename, encoding='utf-8'):
    """Write tokenized texts to ``filename``, one text per line.

    Args:
        processed_texts: Iterable of token sequences (each an iterable of
            strings). An empty sequence produces an empty line.
        filename: Path of the file to create or overwrite.
        encoding: Text encoding for the output file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.
    """
    with open(filename, 'w', encoding=encoding) as file:
        # Join each text's tokens back into a single space-separated line.
        file.writelines(' '.join(text) + '\n' for text in processed_texts)

In [19]:
# Load the "text" column of the message table as a plain Python list.
messages = list(pd.read_csv("message_df.csv")["text"])

# Clean the corpus, then write one processed message per line.
processed = preprocess_texts(messages)
save_texts_to_file(processed, "messages.txt")