In [10]:
import re

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK resources this module relies on: the punkt tokenizer
# models, the stop-word lists and the WordNet data used for lemmatizing.
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Shared lemmatizer instance used by lemmatize_tokens
lemmatizer = WordNetLemmatizer()
# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

def load_data_from_file(file_path):
    """
    Load the dataset from a semicolon-delimited CSV file and add a cleaned text column.

    The file is read without a header; its first row (the 'v1;v2' header line)
    is discarded, only the first two columns are kept and they are renamed to
    'label' and 'text'. Rows whose text is missing are dropped, and a
    'cleaned_text' column is added holding the output of clean_text with
    leading/trailing whitespace removed.

    :param file_path: Path to the CSV file.
    :return: DataFrame with the columns 'label', 'text' and 'cleaned_text'.
    """
    # Read every line as data (header=None); the header line is removed below
    data = pd.read_csv(file_path, sep=';', header=None)

    # Skip the first row (it contains 'v1;v2') and keep only the first two
    # columns, discarding any extra columns produced by additional semicolons
    data = data.iloc[1:, :2].reset_index(drop=True)
    data.columns = ['label', 'text']

    # Drop rows without text BEFORE cleaning: an empty field is read as NaN
    # (a float), and clean_text would raise a TypeError on it. The copy keeps
    # later column assignments from triggering chained-assignment warnings.
    data = data.dropna(subset=['text']).copy()

    # Clean the text (cast to str so clean_text always gets a string), then
    # strip any leading/trailing spaces
    data['cleaned_text'] = data['text'].astype(str).apply(clean_text).str.strip()

    return data


def clean_text(text):
    """
    Normalize raw text: delete every character that is not a word character
    (letters, digits, underscore), whitespace or an apostrophe, then lowercase
    the result.
    :param text: Raw text.
    :return: Cleaned, lowercased text.
    """
    # Strip the punctuation first (apostrophes survive), lowercase afterwards
    without_punctuation = re.sub(r"[^\w\s']", '', text)
    return without_punctuation.lower()


def custom_tokenize(text):
    """
    Split cleaned text into word tokens using NLTK's word_tokenize.
    :param text: Cleaned text
    :return: List of tokens
    """
    return word_tokenize(text)

# Filter stop words out of a token list
def remove_stop_words(tokens):
    """
    Return the tokens that are not in the module-level stop_words set,
    in their original order.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept


def lemmatize_tokens(tokens):
    """ Lemmatize each token with the shared WordNet lemmatizer to reduce it to its base form """
    return list(map(lemmatizer.lemmatize, tokens))


def train_word2Vec_model(tokens_list_lemmatized):
    """ Train and return a Word2Vec model on the lemmatized token lists (min_count=1, other settings left at gensim defaults) """
    return Word2Vec(sentences=tokens_list_lemmatized, min_count=1)

# Generate Document Vectors
def get_document_vector(tokens, model):
    """
    Generate a document vector by averaging the Word2Vec embeddings of the tokens.

    Tokens that are not in the model's vocabulary are ignored. When no token
    is in the vocabulary (including an empty token list), a zero vector with
    the model's dimensionality is returned, so every document gets a numpy
    array of the same length.

    :param tokens: List of tokens (words) from the document.
    :param model: Word2Vec model.
    :return: Document vector (average of word vectors) as a numpy array
    """
    word_vectors = [model.wv[word] for word in tokens if word in model.wv]

    if not word_vectors:
        # Size the fallback from the model instead of hard-coding 100, and
        # return an array (not a list) to match the averaged case below
        return np.zeros(model.vector_size, dtype=np.float32)

    # Compute the average of word vectors for the document
    return sum(word_vectors) / len(word_vectors)

# Load, process and vectorize the data from csv
def preprocess_and_vectorize(file_path):
    """
    Load and clean the data, tokenize and lemmatize the text, train a Word2Vec
    model on it and generate one document vector per row.

    :param file_path: Path to the CSV file.
    :return: DataFrame as returned by load_data_from_file, with the added
             columns 'tokens', 'lemmatized_tokens' and 'document_vector'.
    """
    # Step 1: Load the data. load_data_from_file already adds the cleaned and
    # stripped 'cleaned_text' column, so the text is not cleaned a second time.
    data = load_data_from_file(file_path)

    # Step 2: Tokenize and lemmatize the text
    # NOTE(review): remove_stop_words is not applied in this pipeline - confirm
    # whether that is intentional.
    data['tokens'] = data['cleaned_text'].apply(custom_tokenize)
    data['lemmatized_tokens'] = data['tokens'].apply(lemmatize_tokens)

    # Step 3: Train Word2Vec model on the lemmatized tokens (passed as a plain
    # list of token lists)
    model = train_word2Vec_model(data['lemmatized_tokens'].tolist())

    # Step 4: Generate document vectors using the lemmatized tokens
    data['document_vector'] = data['lemmatized_tokens'].apply(
        lambda tokens: get_document_vector(tokens, model)
    )

    return data



[nltk_data] Downloading package punkt to /Users/zat_km/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zat_km/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/zat_km/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
