v1.0

- The objective of this **03.03** notebook is to perform cross-validation of a pipeline that combines TfidfVectorizer and MultinomialNB.

In [None]:
import time
# Record the wall-clock start time; the final cell subtracts this to report total runtime.
t_start = time.time()

In [None]:
#!pip install icecream
#!pip install watermark
#!pip install fastparquet

# Setup Environment

In [1]:
import os
import string
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

# Mount Google Drive and set the path to the root directory of the project
from google.colab import drive

drive.mount('/content/drive')

# Prefer the full project location; fall back to the shortcut location when the
# full path is not present on the mounted drive. (Assigning a string can never
# raise, so a try/except around the assignment would never reach the fallback;
# the directory has to be checked explicitly.)
PATH_ROOT = "/content/drive/MyDrive/MADS/SIADS696/Environment/"
if not os.path.isdir(PATH_ROOT):
    print("Using shortcut location to load data.")
    PATH_ROOT = "/content/drive/MyDrive/SIADS696/Environment/"

# Define the paths to different directories and files in the project
PATHS = {
    "data": os.path.join(PATH_ROOT, "data"),
    "data_raw": os.path.join(PATH_ROOT, "data", "raw"),
    "data_int": os.path.join(PATH_ROOT, "data", "interim"),
    "data_pro": os.path.join(PATH_ROOT, "data", "processed"),
    "models": os.path.join(PATH_ROOT, "models"),
    "reports": os.path.join(PATH_ROOT, "reports"),
    "figures": os.path.join(PATH_ROOT, "reports", "figures"),
}

# Load the raw training set into `df_training`. A plain assignment is used
# instead of writing into locals(), so the name is visible to readers and tools.
df_training = pd.read_csv(os.path.join(PATHS["data_raw"], "WikiLarge_Train.csv"))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Data and Perform Cross-Validation

In [None]:
# Pull the raw text column (model input) and the label column (target) out of the training frame
text_data, labels = df_training["original_text"], df_training["label"]

# Text-cleaning helper used as the vectorizer's preprocessor
def clean(doc):
    """Return `doc` with punctuation and digit characters removed.

    A character is dropped when it appears in `string.punctuation` or when
    `str.isdigit()` is true for it; every other character (including
    whitespace) is kept in its original order.

    Args:
        doc: The input text, iterated character by character.

    Returns:
        A new string containing only the retained characters.
    """
    kept = []
    for ch in doc:
        # Skip anything that is punctuation or a digit
        if ch in string.punctuation or ch.isdigit():
            continue
        kept.append(ch)
    return "".join(kept)

# Build the two pipeline stages separately so each configuration is easy to read.

# Stage 1: TF-IDF features over 1- to 15-grams, with `clean` as the preprocessor.
# NOTE(review): per the scikit-learn docs, supplying a custom `preprocessor`
# overrides the built-in strip_accents/lowercase stage, so the text is not
# lowercased here -- confirm that this is intended.
tfidf_step = TfidfVectorizer(
    preprocessor=clean,
    ngram_range=(1, 15),
    max_df=0.9,
    binary=True,
)

# Stage 2: Multinomial Naive Bayes classifier fitted on the TF-IDF features
nb_step = MultinomialNB(alpha=0.9, fit_prior=False)

# Chain vectorizer and classifier into a single estimator
pipeline = Pipeline(steps=[("tfidf", tfidf_step), ("nb", nb_step)])

# Evaluate the whole pipeline with 10-fold cross-validation
scores = cross_val_score(pipeline, text_data, labels, cv=10)

# Average the per-fold scores and report the result
mean_score = sum(scores) / len(scores)
print("Mean cross-validation score:", mean_score)

# Watermark

In [None]:
% watermark

In [None]:
% watermark --iversions

In [None]:
# Report the notebook's total wall-clock runtime, measured from `t_start`
t_end = time.time()
total_runtime = t_end - t_start
# Convert seconds to minutes, rounded to two decimals
total_runtime_min = round(total_runtime / 60, 2)
print(f"{total_runtime_min} minutes")