# Imports

## Libraries

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', None)  # No truncation for cell content
pd.set_option('display.max_rows', None)     # Show all rows
pd.set_option('display.max_columns', None)

import os
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## Data

In [2]:
data_folder_path = '/text-mining/data/02_text_representation/Corpus-representacion'

In [3]:
def save_files_to_dict(folder_path):
    files_dict = {}
    for root, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    files_dict[file_path] = f.read()
            except Exception as e:
                print(f"Could not read file {file_path}: {e}")
    return files_dict

In [4]:
data_dict = save_files_to_dict(data_folder_path)
data = list(data_dict.values())
len(data)

805

In [5]:
example_index = 527
print(data[example_index])

Xref: cantaloupe.srv.cs.cmu.edu rec.autos:102656 rec.autos.tech:53911 rec.autos.driving:16432 sci.electronics:53562
Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!gatech!darwin.sura.net!mojo.eng.umd.edu!russotto
From: russotto@eng.umd.edu (Matthew T. Russotto)
Newsgroups: rec.autos,rec.autos.tech,rec.autos.driving,sci.electronics
Subject: Re: electronic odometers (was: Used BMW Question ..... ???)
Date: 16 Apr 1993 03:51:56 GMT
Organization: Project GLUE, University of Maryland, College Park
Lines: 33
Message-ID: <1qlagsINNka0@mojo.eng.umd.edu>
References: <1qflgu$mpb@hpscit.sc.hp.com> <1993Apr14.153740.18542@nimbus.com> <1993Apr14.174857.28314@porthos.cc.bellcore.com>
NNTP-Posting-Host: tea.eng.umd.edu

In article <1993Apr14.174857.28314@porthos.cc.bellcore.com> dje@bmw535.NoSubdomain.NoDomain (Don Eilenberger) writes:
}In article <1993Apr14.153740.18542@nimbus.com>, jimiii@nimbus.com (Jim Warford) writes:

}|>  There are two simple proc

# Preprocessing

In [6]:
def extract_message_body(email_text):
    """
    Extracts the main message body from an email by removing headers, unnecessary metadata, 
    and signatures, while preserving quoted lines that provide meaningful context.
    """
    # Split the email into lines
    lines = email_text.splitlines()

    # Remove header (everything before the first blank line)
    blank_line_index = next((i for i, line in enumerate(lines) if line.strip() == ""), None)
    if blank_line_index is not None:
        lines = lines[blank_line_index + 1:]

    # Remove lines containing email addresses
    email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
    lines = [line for line in lines if not email_pattern.search(line)]

    # Remove lines with only one to three capitalized words
    capitalized_words_pattern = re.compile(r"^([A-Z][a-z]+\s?){1,3}$")
    lines = [line for line in lines if not capitalized_words_pattern.match(line.strip())]

    # Remove lines after signature patterns
    signature_patterns = [
        r"^--\s*$",  # Standard signature delimiter
        r"^>--",     # Quoted signature delimiter
        r"^Kind regards",  # Common closing phrases
        r"^Best regards",
        r"^Sincerely",
        r"^Sent from my iPhone",
        r"^Sent from my BlackBerry",
        r"^Confidentiality Notice",  # Legal disclaimers
    ]

    filtered_lines = []
    skip_lines = False

    for line in lines:
        # Check for signature patterns
        if any(re.match(pattern, line.strip(), re.IGNORECASE) for pattern in signature_patterns):
            skip_lines = True
        # Stop skipping after a blank line
        if skip_lines and line.strip() == "":
            skip_lines = False
            continue
        # Skip lines if in the skip mode
        if skip_lines:
            continue

        filtered_lines.append(line)

    # Retain quoted lines unless they are irrelevant or part of a signature
    meaningful_quoted_lines = []
    for line in filtered_lines:
        if line.strip().startswith(">"):
            # Keep the line if it’s not part of a quoted signature or irrelevant
            if not re.match(r"^>--", line.strip()):
                meaningful_quoted_lines.append(line)
        else:
            meaningful_quoted_lines.append(line)

    # Join remaining lines to form the message body
    message_body = "\n".join(meaningful_quoted_lines).strip()
    return message_body


In [7]:
data = [extract_message_body(email) for email in data]
print(data[example_index])

}|>  There are two simple procedures for alterating any odometer.
}|> 
}|> 1. Mechanical driven odometer:
}|>     Remove the speedo cable from the transmission.
}|>     Attach a drill and run at max speed until the speedo turns over.
}|>     Continue until the desired mileage is reached.
}|> 
}|> 2. Electronically driven odometer:
}|>     Remove the sensor wire from the sensor.
}|>     Attach the Calibration out signal from an Oscope to the wire.
}|>     Run until the speedo turns over and attains the desired mileage.
}
}Dear Faster.. I kinda wonder.. have you ever tried version 2? On what?
}Since the sensor wire on a BMW feeds also into the computer.. and we
}don't know what signal voltage is expected from it.. bad things
}*could* happen... also since we don't know the pulse rate, we
}may damage the analog part of the speedo (yes.. BMW uses a combined
}instrument.. speed in analog, trip and total milage is digital) with
}the needle pegged up against the 160MPH stop..
}
}Just a thought

In [8]:
def preprocess_text(text, remove_numbers=True, use_lemmatization=True):
    """
    Preprocesses text by removing punctuation, numbers, stop-words,
    and applying lemmatization or stemming.

    Args:
        text (str): The input text.
        remove_numbers (bool): Whether to remove numbers from the text.
        use_lemmatization (bool): If True, lemmatize; if False, apply stemming.

    Returns:
        str: The preprocessed text.
    """
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # Remove numbers if required
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize text into words
    words = text.split()
    
    # Remove stop-words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Apply lemmatization or stemming
    if use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
    else:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]
    
    # Join words back into a single string
    return ' '.join(words)


In [9]:
data = [preprocess_text(email) for email in data]
print(data[example_index])

two simple procedure alterating odometer mechanical driven odometer remove speedo cable transmission attach drill run max speed speedo turn continue desired mileage reached electronically driven odometer remove sensor wire sensor attach calibration signal oscope wire run speedo turn attains desired mileage dear faster kinda wonder ever tried version since sensor wire bmw feed also computer dont know signal voltage expected bad thing could happen also since dont know pulse rate may damage analog part speedo yes bmw us combined instrument speed analog trip total milage digital needle pegged mph stop thought youve got oscilliscope connect sensor wire measure stuff way know expects
