# Imports

## Libraries

In [15]:
import os
import re
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

import gensim.downloader as api
word2vec_model = api.load("word2vec-google-news-300")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data

In [16]:
data_folder_path = '/text-mining/data/02_text_representation/Corpus-representacion'

In [17]:
def save_files_to_dict(folder_path):
    files_dict = {}
    for root, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    files_dict[file_path] = f.read()
            except Exception as e:
                print(f"Could not read file {file_path}: {e}")
    return files_dict

In [18]:
data_dict = save_files_to_dict(data_folder_path)
data = list(data_dict.values())
len(data)

866

In [19]:
example_index = 527
print(data[example_index])

Xref: cantaloupe.srv.cs.cmu.edu sci.physics:50976 sci.electronics:53128
Newsgroups: sci.physics,sci.electronics
Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!howland.reston.ans.net!usenet.ins.cwru.edu!magnus.acs.ohio-state.edu!csn!teal.csn.org!et
From: et@teal.csn.org (Eric H. Taylor)
Subject: Holes: practical questions, was - Philosophical Question
Message-ID: <C584xJ.3nq@csn.org>
Followup-To: sci.physics
Summary: How do we preferentially amplify holes instead of electrons?
Keywords: holes electrons semi-conductors mobility
Sender: Eric H. Taylor
Nntp-Posting-Host: teal.csn.org
Organization: 4-L Laboratories
References: <31MAR199309335376@csa1.lbl.gov> <1993Mar31.194457.18742@watson.ibm.com> <12426@sun13.scri.fsu.edu>
Date: Fri, 9 Apr 1993 16:10:31 GMT
Expires: Sun, 9 May 1993 06:00:00 GMT
Lines: 48

In article <12426@sun13.scri.fsu.edu> jac@ds8.scri.fsu.edu (Jim Carr) writes:
>[...]
>I agree.  I come at this from nuclear physics, where

# Message Extraction

In [20]:
def extract_message_body(email_text):
    """
    Extracts the main message body from an email by removing headers, unnecessary metadata, 
    and signatures, while preserving quoted lines that provide meaningful context.
    """
    # Split the email into lines
    lines = email_text.splitlines()

    # Remove header (everything before the first blank line)
    blank_line_index = next((i for i, line in enumerate(lines) if line.strip() == ""), None)
    if blank_line_index is not None:
        lines = lines[blank_line_index + 1:]

    # Remove lines containing email addresses
    email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
    lines = [line for line in lines if not email_pattern.search(line)]

    # Remove lines with only one to three capitalized words
    capitalized_words_pattern = re.compile(r"^([A-Z][a-z]+\s?){1,3}$")
    lines = [line for line in lines if not capitalized_words_pattern.match(line.strip())]

    # Remove lines after signature patterns
    signature_patterns = [
        r"^--\s*$",  # Standard signature delimiter
        r"^>--",     # Quoted signature delimiter
        r"^Kind regards",  # Common closing phrases
        r"^Best regards",
        r"^Sincerely",
        r"^Sent from my iPhone",
        r"^Sent from my BlackBerry",
        r"^Confidentiality Notice",  # Legal disclaimers
    ]

    filtered_lines = []
    skip_lines = False

    for line in lines:
        # Check for signature patterns
        if any(re.match(pattern, line.strip(), re.IGNORECASE) for pattern in signature_patterns):
            skip_lines = True
        # Stop skipping after a blank line
        if skip_lines and line.strip() == "":
            skip_lines = False
            continue
        # Skip lines if in the skip mode
        if skip_lines:
            continue

        filtered_lines.append(line)

    # Retain quoted lines unless they are irrelevant or part of a signature
    meaningful_quoted_lines = []
    for line in filtered_lines:
        if line.strip().startswith(">"):
            # Keep the line if it’s not part of a quoted signature or irrelevant
            if not re.match(r"^>--", line.strip()):
                meaningful_quoted_lines.append(line)
        else:
            meaningful_quoted_lines.append(line)

    # Join remaining lines to form the message body
    message_body = "\n".join(meaningful_quoted_lines).strip()
    return message_body


In [21]:
data = [extract_message_body(email) for email in data]
print(data[example_index])

>[...]
>I agree.  I come at this from nuclear physics, where one often discusses 
>particle-hole excitations and certain reactions have the effect of 
>applying an annihilation operator and creating a hole, and it is a 
>subtle question.  The longer one works with them, the more real they 
>become.  There are also quasi-particles, which raise the same sort 
>of question about how "real" the entity is.  The phenomenon is most 
>certainly a real one. 

OK, I've asked this before, and with a new thread on these lines, I
ask this again:

1: If a large hole current is run thru a resistor, will there be
   I^2 * R cooling instead of heating?

2: Can anyone design an amplifier that preferentially amplifies
   hole currents over normal electron currents?

3: what semiconductor materials have the highest ratio of
   hole mobility to electron mobility? (please quote actual
   test samples rather than estimates based on theory. Also,
   don't be limited to semiconductors: consider also insulators

# Preprocessing

In [22]:
def preprocess_text(text, remove_numbers=True, word_reduction="none"):
    """
    Preprocesses text by removing punctuation, numbers, stop-words,
    and optionally applying lemmatization or stemming.

    Args:
        text (str): The input text.
        remove_numbers (bool): Whether to remove numbers from the text.
        word_reduction (str): Type of word reduction to apply. Options are:
            "lemmatization", "stemming", or "none".

    Returns:
        str: The preprocessed text.
    """
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # Remove numbers if required
    if remove_numbers:
        text = re.sub(r'\d+', '', text)
    
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize text into words
    words = text.split()
    
    # Remove stop-words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Apply word reduction if specified
    if word_reduction == "lemmatization":
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
    elif word_reduction == "stemming":
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]
    # If word_reduction is "none" or invalid, no reduction is applied
    
    # Join words back into a single string
    return ' '.join(words)


In [23]:
processed_data = [preprocess_text(email) for email in data]
stemmed_data = [preprocess_text(email, word_reduction="stemming") for email in data]
print(stemmed_data[example_index])

agre come nuclear physic one often discuss particlehol excit certain reaction effect appli annihil oper creat hole subtl question longer one work real becom also quasiparticl rais sort question real entiti phenomenon certainli real one ok ive ask new thread line ask larg hole current run thru resistor r cool instead heat anyon design amplifi preferenti amplifi hole current normal electron current semiconductor materi highest ratio hole mobil electron mobil pleas quot actual test sampl rather estim base theori also dont limit semiconductor consid also insul resistor dielectr piezoelectr conductor magnet metal ceram magnetostrict etc note summar thread far state area hole detect vacuum hole particl exist presenc matter previou thread state hole exist certain semiconductor question natur aris hole current insid semiconductor vanish point semiconductor join conductor say copper dont want theoret discuss whether hole could exist insid metal conductor rather ask experiment discuss amplifi de

# Vectorizers

In [24]:
def document_to_vectors(doc, model):
    """
    Convert a document to vectors (average and sum) based on its word embeddings.
    
    Args:
        doc (str): Preprocessed document.
        model (gensim model): Pre-trained word embeddings model.
    
    Returns:
        tuple: (average_vector, sum_vector)
    """
    words = doc.split()
    word_vectors = [model[word] for word in words if word in model]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size), np.zeros(model.vector_size)
    word_vectors = np.array(word_vectors)
    return np.mean(word_vectors, axis=0), np.sum(word_vectors, axis=0)

def compute_text_representations(documents, method="tfidf"):
    """
    Compute text representations using the specified method.
    
    Args:
        documents (list of str): List of raw text documents.
        method (str): Representation method to use. Options are:
            "tf" - Term Frequency
            "tfidf" - Term Frequency-Inverse Document Frequency
            "word2vec_avg" - Word2Vec average embeddings
            "word2vec_sum" - Word2Vec sum embeddings
    
    Returns:
        array-like: The computed text representation matrix
    """
    # Preprocess documents
    preprocessed_docs = [preprocess_text(doc) for doc in documents]
    
    if method == "tf":
        vectorizer = TfidfVectorizer(use_idf=False, norm=None)
        return vectorizer.fit_transform(preprocessed_docs)
        
    elif method == "tfidf":
        vectorizer = TfidfVectorizer()
        return vectorizer.fit_transform(preprocessed_docs)
        
    elif method == "word2vec_avg":
        vectors = []
        for doc in preprocessed_docs:
            avg_vector, _ = document_to_vectors(doc, word2vec_model)
            vectors.append(avg_vector)
        return np.array(vectors)
        
    elif method == "word2vec_sum":
        vectors = []
        for doc in preprocessed_docs:
            _, sum_vector = document_to_vectors(doc, word2vec_model)
            vectors.append(sum_vector)
        return np.array(vectors)
        
    else:
        raise ValueError(f"Unknown method: {method}. Valid options are: 'tf', 'tfidf', 'word2vec_avg', 'word2vec_sum'")

In [25]:
tf_representations = compute_text_representations(stemmed_data, method="tf")
tfidf_representations = compute_text_representations(stemmed_data, method="tfidf")
word2vec_avg_representations = compute_text_representations(processed_data, method="word2vec_avg")
word2vec_sum_representations = compute_text_representations(processed_data, method="word2vec_sum")

# Saving

In [26]:
np.save("02_text_representations_tf.npy", tf_representations.toarray())
np.save("02_text_representations_tfidf.npy", tfidf_representations.toarray())
np.save("02_text_representations_word2vec_avg.npy", word2vec_avg_representations)
np.save("02_text_representations_word2vec_sum.npy", word2vec_sum_representations)