# In [2]:
import re
from nltk.tokenize import word_tokenize
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Only these columns are needed for the analysis; the rest of the CSV is skipped.
relevant_columns = [
    'airline',
    'airline_sentiment',
    'negativereason',
    'text',
]
# Location of the tweets dataset, relative to the working directory.
file_path = './Tweets.csv'
# Read the dataset, restricted to the columns listed above.
tweets_data = pd.read_csv(file_path, usecols=relevant_columns)
# Ordered (pattern, replacement) rewrite rules for the custom tokenizer,
# compiled once instead of on every call. Order matters: URLs and emails are
# collapsed to placeholders first so the later punctuation rule cannot break
# them apart.
_TOKENIZER_RULES = [
    (re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"), "URL"),  # URLs
    (re.compile(r"\b\w+@\w+\.\w+\b"), "EMAIL"),  # Emails
    (re.compile(r"(\d+)([a-zA-Z]+)"), r"\1 \2"),  # Split numbers and letters (2pm -> 2 pm)
    (re.compile(r"([a-zA-Z]+)(\d+)"), r"\1 \2"),  # Split letters and numbers (AA123 -> AA 123)
    (re.compile(r"([.,!?;:])"), r" \1 "),  # Split punctuation into separate tokens
    (re.compile(r"\b'\b"), " ' "),  # Isolate apostrophes inside words (don't -> don ' t)
    (re.compile(r"\s+"), " "),  # Collapse every whitespace run to a single space
    (re.compile(r"@(\w+)"), r"\1"),  # Remove '@' but keep the username
    (re.compile(r"#(\w+)"), r"\1"),  # Remove '#' but keep the hashtag word
]


# Redefine the custom tokenizer
def custom_tokenizer(text):
    """Tokenize *text* with a fixed sequence of regex rewrite rules.

    Each rule in ``_TOKENIZER_RULES`` is applied to the whole string in
    order; the rewritten string is then split on whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings. Empty or whitespace-only input yields an
        empty list.
    """
    # Apply each rule sequentially
    for pattern, repl in _TOKENIZER_RULES:
        text = pattern.sub(repl, text)

    # The whitespace rule has already reduced every gap to one space, so a
    # plain split() gives the same tokens as strip().split(" ") while
    # returning [] (not ['']) when nothing is left.
    return text.split()

# Function to compare custom tokenizer with NLTK's tokenizer
def compare_tokenizers(text, custom_tokens, nltk_tokens):
    """Report the tokens that appear in one tokenizer's output but not the other's.

    Args:
        text: The original text. Not used by the comparison; kept so existing
            callers do not need to change.
        custom_tokens: List of tokens from the custom tokenizer.
        nltk_tokens: List of tokens from NLTK's tokenizer.

    Returns:
        A dict with two keys:
        ``"custom"`` - tokens of ``custom_tokens`` absent from ``nltk_tokens``;
        ``"nltk"`` - tokens of ``nltk_tokens`` absent from ``custom_tokens``.
        Each list keeps the source order and any duplicates.
    """
    # Build the lookup sets once so each membership test is O(1) rather than
    # a scan of the other token list.
    custom_vocab = set(custom_tokens)
    nltk_vocab = set(nltk_tokens)
    return {
        "custom": [token for token in custom_tokens if token not in nltk_vocab],
        "nltk": [token for token in nltk_tokens if token not in custom_vocab],
    }

# Pick 5 sample texts from the dataset (fixed random_state so the same
# tweets are selected on every run)
sample_texts = tweets_data['text'].sample(5, random_state=42).tolist()

# Tokenize each sample with both tokenizers and collect a report block per sample
output_lines = []
for i, text in enumerate(sample_texts, start=1):
    custom_tokens = custom_tokenizer(text)
    nltk_tokens = word_tokenize(text)
    differences = compare_tokenizers(text, custom_tokens, nltk_tokens)

    output_lines.extend([
        f"Example {i}:",
        f"Original Text: {text}",
        f"Custom Tokenizer Output: {custom_tokens}",
        f"NLTK Tokenizer Output: {nltk_tokens}",
        f"Differences (Custom only): {differences['custom']}",
        f"Differences (NLTK only): {differences['nltk']}",
        "",  # blank line between examples
    ])


# Save the results to a text file
output_file_path = './tokenizer_comparison_results.txt'
# Write as UTF-8 explicitly: tweet text may contain emoji or other non-ASCII
# characters, and the platform default encoding (e.g. cp1252) can fail on them.
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write("\n".join(output_lines))

# Bare expression so the notebook cell displays the output path
output_file_path


# Out[2]: './tokenizer_comparison_results.txt'