In [10]:
import numpy as np
from typing import Dict, Any, List, Tuple
import pandas as pd
import numpy as np
import os
import csv
config = {
    # Data configuration
    "data": {
        "train_test_split_ratio": 0.8,
        "random_state": 42
    },
    
    # Text preprocessing configuration
    "preprocessing": {
        "remove_stopwords": True,
        "lemmatize": True,
        "min_word_length": 3
    },
    
    # Word2Vec configuration
    "word2vec": {
        "vector_size": 100,
        "window": 5,
        "min_count": 1,
        "workers": 4
    },
    
    # Model training configuration
    "model": {
        "classifier": "RandomForest",
        "params": {
            "n_estimators": 100,
            "max_depth": 10,
            "random_state": 42
        }
    }
}

In [12]:
def clean_data( file_path = '../data/multimodal/raw/COMP5329S1A2Dataset/train.csv') -> pd.DataFrame:
    """Clean raw CSV data and handle malformed lines"""
    correct_lines = []
    problematic_lines = []

    # Read and process raw file
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, quotechar='"', delimiter=',', 
                          doublequote=True, skipinitialspace=True)
        
        # Process header
        header = next(reader)
        correct_lines.append(header)
        
        # Process rows
        for line_number, fields in enumerate(reader, start=2):
            if len(fields) == 3:
                correct_lines.append(fields)
            else:
                print(f"Problematic line {line_number}: {fields}")
                problematic_lines.append(fields)

    # Fix problematic lines
    for fields in problematic_lines:
        if len(fields) > 3:
            # Merge extra columns into Caption
            fields = [fields[0], fields[1], ','.join(fields[2:])]
        elif len(fields) < 3:
            # Pad missing columns
            fields += [''] * (3 - len(fields))
        correct_lines.append(fields)

    # Create DataFrame with proper types
    df = pd.DataFrame(correct_lines[1:], columns=header)
    
    return df, df["Labels"].tolist(), df["Caption"].tolist()

In [13]:
df, y, X = clean_data( file_path = '../data/multimodal/raw/COMP5329S1A2Dataset/train.csv')

Problematic line 9086: ['9084.jpg', '3 1 11', 'A street sign labeled Seltzer Way', ' with a red fire hydrant in the foreground and a street stretching into the background.']
Problematic line 9510: ['9508.jpg', '1', 'A cow in street with writing that reads oh no', 'not beef on the menu again!""']
Problematic line 18114: ['18112.jpg', '1', 'A small hand is forming thethumbs up', ' signal.']
Problematic line 27169: ['27167.jpg', '10', 'A street sign that says Sex St', ' along with a sign saying there is a $350 penalty is you honk.']


In [19]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
def preprocess_text(
    texts: List[str],
    config: Dict[str, Any]
) -> Tuple[List[List[str]], List[Any]]:
    """
    Preprocess text data by performing tokenization, stopwords removal, and lemmatization.
    
    Args:
        texts: List of raw text strings to be processed
        config: Configuration dictionary with preprocessing parameters
        
    Returns:
        processed_texts: List of lists where each inner list contains preprocessed tokens
    """
    # Initialize the lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # Get English stopwords
    stop_words = set(stopwords.words('english'))
    
    # Get configuration parameters
    remove_stopwords = config['preprocessing']['remove_stopwords']
    lemmatize = config['preprocessing']['lemmatize']
    min_word_length = config['preprocessing']['min_word_length']
    
    processed_texts = []
    
    for text in texts:
        # Tokenize the text
        tokens = word_tokenize(text.lower())
        
        # Filter tokens based on configuration
        filtered_tokens = []
        for token in tokens:
            # Skip short words
            if len(token) < min_word_length:
                continue
                
            # Skip stopwords if configured
            if remove_stopwords and token in stop_words:
                continue
                
            # Apply lemmatization if configured
            if lemmatize:
                token = lemmatizer.lemmatize(token)
                
            filtered_tokens.append(token)
            
        processed_texts.append(filtered_tokens)
    
    return processed_texts

In [20]:
processed_texts = preprocess_text(X,config)

In [29]:
from gensim.models import Word2Vec
from sklearn.preprocessing import MultiLabelBinarizer
def encode_with_word2vec(
    processed_texts: List[List[str]],
    labels: List[Any],
    config: Dict[str, Any]
) -> Tuple[np.ndarray, List[Any], Any]:
    """
    Encode preprocessed text tokens using Word2Vec model.
    
    Args:
        processed_texts: List of lists where each inner list contains preprocessed tokens
        labels: List of corresponding labels for the texts
        config: Configuration dictionary with Word2Vec parameters
        
    Returns:
        text_vectors: NumPy array of text vectors
        labels: The original labels passed through
        model: Trained Word2Vec model
    """
    # Get Word2Vec configuration parameters
    vector_size = config['word2vec']['vector_size']
    window = config['word2vec']['window']
    min_count = config['word2vec']['min_count']
    workers = config['word2vec']['workers']
    
    # Train Word2Vec model on the preprocessed texts
    model = Word2Vec(
        sentences=processed_texts,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers
    )
    
    # Create document vectors by averaging word vectors for each document
    text_vectors = []
    for tokens in processed_texts:
        # Filter tokens that are in the model's vocabulary
        valid_tokens = [token for token in tokens if token in model.wv]
        
        if valid_tokens:
            # Calculate the average vector for all valid tokens
            doc_vector = np.mean([model.wv[token] for token in valid_tokens], axis=0)
        else:
            # If no valid tokens, use a zero vector
            doc_vector = np.zeros(vector_size)
            
        text_vectors.append(doc_vector)
    
    # Convert to numpy array
    text_vectors = np.array(text_vectors)

    mlb = MultiLabelBinarizer()
    encoded_labels = mlb.fit_transform(labels)
    
    return text_vectors, encoded_labels, model


In [32]:
text_vectors, encoded_labels, model = encode_with_word2vec(
    processed_texts,y, config
)

[1;35mcollecting all words and their counts[0m
[1;35mPROGRESS: at sentence #0, processed 0 words, keeping 0 word types[0m
[1;35mPROGRESS: at sentence #10000, processed 58212 words, keeping 3655 word types[0m
[1;35mPROGRESS: at sentence #20000, processed 116173 words, keeping 4984 word types[0m
[1;35mcollected 6007 word types from a corpus of 174596 raw words and 30000 sentences[0m
[1;35mCreating a fresh vocabulary[0m
[1;35mWord2Vec lifecycle event {'msg': 'effective_min_count=1 retains 6007 unique words (100.00% of original 6007, drops 0)', 'datetime': '2025-04-02T11:39:41.001030', 'gensim': '4.3.3', 'python': '3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}[0m
[1;35mWord2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 174596 word corpus (100.00% of original 174596, drops 0)', 'datetime': '2025-04-02T11:39:41.002030', 'gensim': '4.3.3', 'python':

In [34]:
encoded_labels.shape

(30000, 11)

In [35]:
text_vectors

array([[ 0.00739787,  0.22850467, -0.15060474, ..., -0.34482223,
         0.1946074 , -0.11335909],
       [-0.31826395,  0.13761549,  0.23117161, ..., -0.47330028,
         0.07190013, -0.8259868 ],
       [-0.05473959,  0.15083836,  0.14118056, ..., -0.26110846,
         0.14016667, -0.47439378],
       ...,
       [-0.05223429,  0.2943649 ,  0.12121125, ..., -0.17811461,
         0.1443304 , -0.30702555],
       [ 0.00570911,  0.205766  , -0.06081741, ..., -0.2126519 ,
         0.1842104 , -0.04316916],
       [-0.09749375,  0.3797511 ,  0.10232753, ..., -0.08841111,
         0.16017438, -0.42716545]], dtype=float32)

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
def train_model(
    text_vectors: np.ndarray,
    labels: List[Any],
    config: Dict[str, Any]
) -> Tuple[Any, float, str]:
    """
    Train a classification model on the Word2Vec encoded text vectors.
    
    Args:
        text_vectors: NumPy array of text vectors
        labels: List of corresponding labels for the texts
        config: Configuration dictionary with model parameters
        
    Returns:
        model: Trained classification model
        accuracy: Model accuracy on test set
        report: Classification report as string
    """
    # Get model configuration parameters
    classifier_type = config['model']['classifier']
    model_params = config['model']['params']
    train_test_ratio = config['data']['train_test_split_ratio']
    random_state = config['data']['random_state']
    
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        text_vectors, 
        labels, 
        train_size=train_test_ratio,
        random_state=random_state
    )
    
    # Initialize the classifier based on configuration
    if classifier_type == 'RandomForest':
        model = RandomForestClassifier(**model_params)
    elif classifier_type == 'LogisticRegression':
        model = LogisticRegression(**model_params)
    elif classifier_type == 'SVM':
        model = SVC(**model_params)
    else:
        raise ValueError(f"Unsupported classifier type: {classifier_type}")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Evaluate the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    return model, accuracy, report

In [37]:
model, accuracy, report = train_model(
    text_vectors,encoded_labels,config
) 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [40]:
report

'              precision    recall  f1-score   support\n\n           0       0.72      0.54      0.62      2114\n           1       0.76      0.22      0.34       306\n           2       0.94      0.99      0.96      5491\n           3       0.00      0.00      0.00       210\n           4       0.69      0.35      0.46       930\n           5       0.85      0.30      0.45       277\n           6       0.99      0.38      0.55       648\n           7       0.96      0.36      0.52       492\n           8       0.96      0.70      0.81       517\n           9       0.93      0.14      0.24       744\n          10       0.95      0.36      0.53       429\n\n   micro avg       0.89      0.66      0.76     12158\n   macro avg       0.80      0.39      0.50     12158\nweighted avg       0.86      0.66      0.71     12158\n samples avg       0.93      0.79      0.81     12158\n'