In [22]:
import os
import glob

In [23]:
def read_imdb_data(data_dir='../data/'):
    data = {}
    labels = {}
    
    for data_type in ['train', 'test']:
        data[data_type] = {}
        labels[data_type] = {}
        
        for sentiment in ['pos', 'neg']:
            data[data_type][sentiment] = []
            labels[data_type][sentiment] = []
            
            path = os.path.join(data_dir, data_type, sentiment, '*.txt')
            files = glob.glob(path)
            
            for f in files:
                with open(f) as review:
                    data[data_type][sentiment].append(review.read())
                    # Here we represent a positive review by '1' and a negative review by '0'
                    labels[data_type][sentiment].append(1 if sentiment == 'pos' else 0)
                    
            assert len(data[data_type][sentiment]) == len(labels[data_type][sentiment]), \
                    "{}/{} data size does not match labels size".format(data_type, sentiment)
                
    return data, labels

In [24]:
data, labels = read_imdb_data()
print("IMDB reviews: train = {} pos / {} neg, test = {} pos / {} neg".format(
            len(data['train']['pos']), len(data['train']['neg']),
            len(data['test']['pos']), len(data['test']['neg'])))

IMDB reviews: train = 12500 pos / 12500 neg, test = 12500 pos / 12500 neg


In [25]:
from sklearn.utils import shuffle

def prepare_imdb_data(data, labels):
    """Prepare training and test sets from IMDb movie reviews."""
    
    #Combine positive and negative reviews and labels
    data_train = data['train']['pos'] + data['train']['neg']
    data_test = data['test']['pos'] + data['test']['neg']
    labels_train = labels['train']['pos'] + labels['train']['neg']
    labels_test = labels['test']['pos'] + labels['test']['neg']
    
    #Shuffle reviews and corresponding labels within training and test sets
    data_train, labels_train = shuffle(data_train, labels_train)
    data_test, labels_test = shuffle(data_test, labels_test)
    
    # Return a unified training data, test data, training labels, test labets
    return data_train, data_test, labels_train, labels_test

In [26]:
train_X, test_X, train_y, test_y = prepare_imdb_data(data, labels)
print("IMDb reviews (combined): train = {}, test = {}".format(len(train_X), len(test_X)))

IMDb reviews (combined): train = 25000, test = 25000


In [27]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/andrevargas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
import re
from bs4 import BeautifulSoup

def review_to_words(review):
    text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Convert to lower case
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words] # stem
    
    return words

In [29]:
import pickle

cache_dir = os.path.join("../cache", "sentiment_analysis")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Convert each review to words; read from cache if available."""

    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file:", cache_file)
        except:
            pass  # unable to read from cache, but that's okay
    
    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        # Preprocess training and test data to obtain words for each review
        #words_train = list(map(review_to_words, data_train))
        #words_test = list(map(review_to_words, data_test))
        words_train = [review_to_words(review) for review in data_train]
        words_test = [review_to_words(review) for review in data_test]
        
        # Write to cache file for future runs
        if cache_file is not None:
            cache_data = dict(words_train=words_train, words_test=words_test,
                              labels_train=labels_train, labels_test=labels_test)
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                pickle.dump(cache_data, f)
            print("Wrote preprocessed data to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        words_train, words_test, labels_train, labels_test = (cache_data['words_train'],
                cache_data['words_test'], cache_data['labels_train'], cache_data['labels_test'])
    
    return words_train, words_test, labels_train, labels_test

In [30]:
# Preprocess data
train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)

Read preprocessed data from cache file: preprocessed_data.pkl


In [31]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import joblib
# joblib is an enhanced version of pickle that is more efficient for storing NumPy arrays

def extract_BoW_features(words_train, words_test, vocabulary_size=5000,
                         cache_dir=cache_dir, cache_file="bow_features.pkl"):
    """Extract Bag-of-Words for a given set of documents, already preprocessed into words."""
    
    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = joblib.load(f)
            print("Read features from cache file:", cache_file)
        except:
            pass  # unable to read from cache, but that's okay
    
    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        # Fit a vectorizer to training documents and use it to transform them
        # NOTE: Training documents have already been preprocessed and tokenized into words;
        #       pass in dummy functions to skip those steps, e.g. preprocessor=lambda x: x
        vectorizer = CountVectorizer(max_features=vocabulary_size,
                preprocessor=lambda x: x, tokenizer=lambda x: x)  # already preprocessed
        features_train = vectorizer.fit_transform(words_train).toarray()

        # Apply the same vectorizer to transform the test documents (ignore unknown words)
        features_test = vectorizer.transform(words_test).toarray()
        
        # NOTE: Remember to convert the features using .toarray() for a compact representation
        
        # Write to cache file for future runs (store vocabulary as well)
        if cache_file is not None:
            vocabulary = vectorizer.vocabulary_
            cache_data = dict(features_train=features_train, features_test=features_test,
                             vocabulary=vocabulary)
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                joblib.dump(cache_data, f)
            print("Wrote features to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        features_train, features_test, vocabulary = (cache_data['features_train'],
                cache_data['features_test'], cache_data['vocabulary'])
    
    # Return both the extracted features as well as the vocabulary
    return features_train, features_test, vocabulary

In [32]:
# Extract Bag of Words features for both training and test datasets
train_X, test_X, vocabulary = extract_BoW_features(train_X, test_X)

Read features from cache file: bow_features.pkl


In [33]:
train_X

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [34]:
import pandas as pd

# TODO: Split the train_X and train_y arrays into the DataFrames val_X, train_X and val_y, train_y. Make sure that
#       val_X and val_y contain 10 000 entires while train_X and train_y contain the remaining 15 000 entries.
#val_X = pd.DataFrame(None)
#train_X = pd.DataFrame(None)

#val_y = pd.DataFrame(None)
#train_y = pd.DataFrame(None)

# Solution:
# Earlier we shuffled the training dataset so to make things simple we can just assign
# the first 10 000 reviews to the validation set and use the remaining reviews for training.
val_X = pd.DataFrame(train_X[:10000])
train_X = pd.DataFrame(train_X[10000:])

val_y = pd.DataFrame(train_y[:10000])
train_y = pd.DataFrame(train_y[10000:])

In [35]:
# First we make sure that the local directory in which we'd like to store the training and validation csv files exists.
data_dir = '../data/xgboost'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

In [36]:
# First, save the test data to test.csv in the data_dir directory. Note that we do not save the associated ground truth
# labels, instead we will use them later to compare with our model output.

pd.DataFrame(test_X).to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

# TODO: Save the training and validation data to train.csv and validation.csv in the data_dir directory.
#       Make sure that the files you create are in the correct format.

# Solution:
pd.concat([val_y, val_X], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), header=False, index=False)
pd.concat([train_y, train_X], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [44]:
import xgboost as xgb

max_depth=5
eta=0.2
gamma=4
min_child_weight=6
subsample=0.8
silent=0
objective='binary:logistic'
early_stopping_rounds=10
num_round=500
    
xgb_model = xgb.XGBClassifier(
    max_depth=max_depth,
    eta=eta,
    gamma=gamma,
    min_child_weight=min_child_weight,
    subsample=subsample,
    silent=silent,
    objective=objective,
    early_stopping_rounds=early_stopping_rounds,
    num_round=num_round
)

xgb_model.fit(train_X, train_y, eval_set=[(val_X, val_y)])

Parameters: { "num_round", "silent" } are not used.



[0]	validation_0-logloss:0.65425
[1]	validation_0-logloss:0.62745
[2]	validation_0-logloss:0.60769
[3]	validation_0-logloss:0.59146
[4]	validation_0-logloss:0.57853
[5]	validation_0-logloss:0.56575
[6]	validation_0-logloss:0.55487
[7]	validation_0-logloss:0.54567
[8]	validation_0-logloss:0.53736
[9]	validation_0-logloss:0.52982
[10]	validation_0-logloss:0.52261
[11]	validation_0-logloss:0.51560
[12]	validation_0-logloss:0.50968
[13]	validation_0-logloss:0.50351
[14]	validation_0-logloss:0.49837
[15]	validation_0-logloss:0.49354
[16]	validation_0-logloss:0.48916
[17]	validation_0-logloss:0.48439
[18]	validation_0-logloss:0.48040
[19]	validation_0-logloss:0.47692
[20]	validation_0-logloss:0.47317
[21]	validation_0-logloss:0.46986
[22]	validation_0-logloss:0.46649
[23]	validation_0-logloss:0.46267
[24]	validation_0-logloss:0.45945
[25]	validation_0-logloss:0.45627
[26]	validation_0-logloss:0.45252
[27]	validation_0-logloss:0.44958
[28]	validation_0-logloss:0.44664
[29]	validation_0-loglos

In [45]:
# Access the evaluation results
eval_results = xgb_model.evals_result()

# Extract validation metrics
validation_metrics = eval_results['validation_0']

# Print the evaluation metrics
print("Validation Metrics:")
for metric_name, metric_values in validation_metrics.items():
    print(f"{metric_name}: {metric_values[-1]}")

Validation Metrics:
logloss: 0.36020143144534666


In [47]:
# Make predictions
predictions = xgb_model.predict(test_X)

In [49]:
import mlflow
from mlflow.models import infer_signature
import mlflow.xgboost
from datetime import datetime

TRACKING_URI = "https://mlflow-server-wno7iop4fa-uc.a.run.app/"

mlflow.set_tracking_uri(TRACKING_URI)

# Create an experiment if it doesn't exist
experiment_name = "Sentiment_Analysis_Website"
if not mlflow.get_experiment_by_name(name=experiment_name):
    mlflow.create_experiment(
        name=experiment_name
    )
experiment = mlflow.get_experiment_by_name(experiment_name)

# Define the run name and tags for the experiment
run_name = datetime.now().strftime("%Y-%m-%d_%H:%M")
tags = {
    "env": "test",
    "data_date": "2024-03-05",
    "model_type": "XGBoost",
    "experiment_description": "Sentiment Analysis Model"
    # ... other tags ...
}

# Start the MLflow run
with mlflow.start_run(
    experiment_id=experiment.experiment_id, 
    run_name=run_name, 
    tags=tags
):
    
    # Log the hyperparameters used in the model
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("eta", eta)
    mlflow.log_param("gamma", gamma)
    mlflow.log_param("min_child_weight", min_child_weight)
    mlflow.log_param("subsample", subsample)
    mlflow.log_param("silent", silent)
    mlflow.log_param("objective", objective)
    mlflow.log_param("early_stopping_rounds", early_stopping_rounds)
    mlflow.log_param("num_round", num_round)

    # Log the metrics
    mlflow.log_metric("loss", validation_metrics[list(validation_metrics.keys())[0]][-1])
    
    # Log model:
    signature = infer_signature(train_X, predictions)    
    mlflow.xgboost.log_model(xgb_model, "xgb_model")

