In [None]:
%%bash
pip install ktrain

In [None]:
# The line below is 80 characters long
# ------------------------------------------------------------------------------

In [None]:
# I/O capabilities
import os

# Object serialization
import pickle

# Arrays and DataFrames
import numpy as np
import pandas as pd

# Natural language pre-processing
import nltk
nltk.download('omw-1.4')

# Machine learning pre-processing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Machine learning models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Model evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix

# Plotting capabilities
import matplotlib.pyplot as plt
import altair as alt

# Neural Networks
import tensorflow as tf
from tensorflow import keras

# Abstraction over TensorFlow transformers
import ktrain

# Neural Networks
import tensorflow as tf
from tensorflow import keras

# Type hints
from typing import Optional, List

In [None]:
def altair_dark():
    """Build the dark Altair theme used by every chart in this notebook.

    Returns:
        dict: A mapping with a single "config" key holding the mark, title,
            axis, header, legend, colour-range and background settings.
    """
    # Typography
    font = "IBM Plex Mono"
    base_font_size = 18
    small_font_size = base_font_size * 0.8
    large_font_size = base_font_size * 1.25

    # Colours
    accent = "#537eff"
    canvas = "#161618"
    text_color = "#FFFFFF"
    grid_color = "#FFFFFF"

    # Adapted from http://tsitsul.in/blog/coloropt/
    qualitative_palette = [
        "#537eff", "#00cb85", "#eeeeee", "#00e3ff",
        "#e935a1", "#e1562c", "#efe645"
    ]
    sequential_palette = [canvas, "#FFFFFF"]
    diverging_palette = qualitative_palette

    def outlined_fill():
        """Return a fresh style dict for filled marks with a thin outline."""
        return {"fill": accent, "stroke": text_color, "strokeWidth": 0.5}

    def small_text():
        """Return the title/label text settings shared by axes and legends."""
        return {
            "titleFont": font,
            "titleColor": text_color,
            "titleFontSize": small_font_size,
            "labelFont": font,
            "labelColor": text_color,
            "labelFontSize": small_font_size,
        }

    # Axes use the shared text settings plus grid, domain and tick colours.
    axis = small_text()
    axis.update({
        "gridColor": grid_color,
        "gridOpacity": 0.5,
        "domainColor": text_color,
        "tickColor": grid_color,
    })

    theme = {
        # Marks
        "arc": {"fill": accent},
        "area": {"fill": accent},
        "circle": outlined_fill(),
        "bar": outlined_fill(),
        "line": {"stroke": accent},
        "path": {"stroke": accent},
        "point": {"stroke": accent},
        "rect": outlined_fill(),
        "shape": {"stroke": accent},
        "symbol": {"fill": accent},
        # Text elements
        "title": {
            "font": font,
            "color": text_color,
            "fontSize": large_font_size,
            "anchor": "start",
        },
        "axis": axis,
        "header": {
            "labelFont": font,
            "titleFont": font,
            "color": text_color,
            "labelColor": text_color,
            "titleColor": text_color,
            "labelFontSize": base_font_size,
            "titleFontSize": base_font_size,
        },
        "legend": small_text(),
        # Colour scales
        "range": {
            "category": qualitative_palette,
            "diverging": diverging_palette,
            "heatmap": sequential_palette,
            "ramp": sequential_palette,
            "ordinal": sequential_palette,
        },
        "background": canvas,
        "type": "fit",
    }
    return {"config": theme}

# Register the theme function with Altair under the name "altair_dark", then
# enable it by that name.
alt.themes.register("altair_dark", altair_dark)
alt.themes.enable("altair_dark")

# Data Ingestion

In [None]:
# Set global variables
DATA_DIR = "/kaggle/input/news-category-data"
DATA_FNAME = os.path.join(DATA_DIR, "news_category_training_data.json")

# Ingest dataset (DATA is left untouched so `data` can always be re-derived)
DATA = pd.read_json(DATA_FNAME)

# Build the working frame as a brand-new DataFrame instead of assigning columns
# on a slice of DATA: assigning to a slice triggers SettingWithCopyWarning and
# is not guaranteed to modify the frame being kept.
data = pd.DataFrame({
    # Category labels with normalised capitalisation
    "Category": DATA["category"].map(str.capitalize),
    # Concatenate the headline and description into a single variable
    "Text": DATA["headline"] + " " + DATA["short_description"],
})

In [None]:
# Preview the first five rows of the Category/Text frame
display(data.head(5))

In [None]:
# Number of rows per category, as a two-column frame (Category, Frequency)
data_category_counts = (
    data["Category"]
    .value_counts()
    .rename_axis("Category")
    .reset_index(name="Frequency")
)

In [None]:
def keep_column(frequency, cutoff:int=5_000) -> bool:
    """Decide whether a category is frequent enough to be kept.

    Args:
        frequency: Number of observations belonging to the category.
        cutoff (int): Frequencies strictly below this value are rejected.

    Returns:
        bool: False when ``frequency < cutoff``, True otherwise.
    """
    # `not (a < b)` rather than `a >= b` keeps the original result for values
    # that compare False both ways (e.g. NaN) and always yields a plain bool.
    return not (frequency < cutoff)


# Flag every category against the 5,000-observation cutoff
data_category_counts["Keep"] = data_category_counts["Frequency"].apply(
    keep_column, cutoff=5_000
)

In [None]:
# Horizontal bars: one per category, longest first, coloured by the Keep flag
chart_category_counts = alt.Chart(data_category_counts).mark_bar()
chart_category_counts = chart_category_counts.encode(
    x=alt.X("Frequency:Q"),
    y=alt.Y("Category:N", sort="-x"),
    color=alt.Color("Keep:N", sort=[True, False]),
    tooltip=[
        alt.Tooltip("Category:N"),
        alt.Tooltip("Frequency:Q"),
        alt.Tooltip("Keep:N"),
    ],
)
chart_category_counts = chart_category_counts.properties(width=400)

# Show the chart inline and keep a standalone HTML copy
display(chart_category_counts)
chart_category_counts.save("figure_category_counts.html")

In [None]:
# Keep only the rows whose category passed the frequency cutoff.
categories = data_category_counts.query("Keep == True")["Category"].to_list()

# .copy() makes `data` an independent frame: later cells add columns to it, and
# doing that on the filtered selection itself raises SettingWithCopyWarning.
data = data.query("Category in @categories").copy()

# Data Pre-Processing
- Text Tokenization
- Word Lemmatization
- [Optional] Text Vectorization
- [Optional] Text TF-IDF Transformation

In [None]:
def get_stop_words() -> set:
    """Retrieve a set of stop words built on top of the standard 'english' stop
    words.

    The NLTK English stop words are extended with single digits, punctuation
    and symbol tokens, contraction suffixes and URL scheme prefixes.

    Returns:
        set: Every token that should be discarded after tokenization.
    """
    extended_words = [
        # Single digits; "0" was missing from the original "123456789"
        *"0123456789",
        # Symbols
        *"!@#$%^&*()-_=+",
        # Punctuation and quote tokens
        "?", ".", ",", "!", "''", "'", "``", "’", ":",
        # Contraction suffixes
        "'s", "'d", "'re",
        # URL scheme prefixes
        "http://", "https://",
    ]

    # Build a new set instead of extending the list NLTK hands back.
    return set(nltk.corpus.stopwords.words("english")).union(extended_words)


def tokenize_text(text:str, stop_words:Optional[set]=None) -> List[str]:
    """Split a string into lower-cased word tokens.

    Args:
        text (str): Raw text to tokenize.
        stop_words (Optional[set]): Tokens to discard after lower-casing. When
            None, no filtering is applied.

    Returns:
        List[str]: Lower-cased tokens, in their original order.
    """
    lowered = [
        token.lower()
        for token in nltk.tokenize.word_tokenize(text, language="english")
    ]

    if stop_words is None:
        return lowered

    return [token for token in lowered if token not in stop_words]

In [None]:
stop_words = get_stop_words()

# Sanity check: show the first five texts next to their tokenized form
for text in data["Text"].head(5):
    text_tokenized = tokenize_text(text, stop_words)
    print(f"Original:  {text}\nTokenized: {text_tokenized}\n")

In [None]:
# Tokenize every text, dropping the stop words
stop_words = get_stop_words()
data["TextTokenized"] = data["Text"].apply(tokenize_text, stop_words=stop_words)

In [None]:
# Inspect the frame now that the TextTokenized column has been added
display(data)

In [None]:
# Lemmatize each token list with WordNet, one word at a time
lemmatizer = nltk.stem.WordNetLemmatizer()
data["TextLemmatized"] = data["TextTokenized"].map(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

In [None]:
# Inspect the frame now that the TextLemmatized column has been added
display(data)

In [None]:
# Persist the preprocessed frame, without its index, to the Kaggle working dir.
# NOTE(review): TextTokenized / TextLemmatized hold Python lists; CSV will
# presumably store them as their string representation — confirm that is
# acceptable for whatever reads this file back.
data.to_csv("/kaggle/working/data-preprocessed.csv", index=False)

In [None]:
# Reverse-tokenize the lemmas into plain strings for sklearn
# (TfidfVectorizer will re-tokenize)
x = data["TextLemmatized"].map(" ".join).to_list()
y = data["Category"].to_list()

# Split the data into training (90%) and testing (10%) sets
x_train_text, x_test_text, y_train, y_test = train_test_split(
    x,
    y,
    random_state=112358,
    train_size=0.9,
)

In [None]:
# TF-IDF features with document-frequency bounds min_df=0.01 and max_df=0.99.
# The vectorizer is fitted on the training texts only and then applied to the
# test texts.
vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.99)
x_train = vectorizer.fit_transform(x_train_text)
x_test = vectorizer.transform(x_test_text)

# Visualize Vectors using Truncated SVD

In [None]:
# Project the training TF-IDF matrix onto two SVD components for plotting;
# seeded with the same random_state as the train/test split.
svd = TruncatedSVD(n_components=2, n_iter=100, random_state=112358)
x_train_reduced = svd.fit_transform(x_train)

In [None]:
# Bucket the reduced training observations by their label; the per-category
# mean and spread are computed from these buckets in the next cell
categories = set(data["Category"])
categories_data = {category: [] for category in categories}

for x_obs, y_obs in zip(x_train_reduced, y_train):
    categories_data[y_obs].append(x_obs)

# Turn each bucket into a single array
for category in categories:
    categories_data[category] = np.array(categories_data[category])

In [None]:
def pooled_std(x:np.ndarray):
    """Compute the pooled standard deviation of a set of observations.

    The variance of every column is averaged and the square root of that
    average is returned. The divisor is the actual number of columns rather
    than a hard-coded 2, so the function is no longer limited to 2-D points.

    Args:
        x (np.ndarray): Observations, one row per observation and one column
            per dimension.

    Returns:
        A NumPy float: the square root of the mean per-column variance.
    """
    per_column_variance = np.var(x, axis=0)
    return np.sqrt(np.mean(per_column_variance))

# One summary row per category: its centroid in the reduced space and the
# pooled standard deviation of its observations
category_rows = []
for category, observations in categories_data.items():
    mean = np.mean(observations, axis=0)
    std_pooled = pooled_std(observations)
    category_rows.append((category, mean[0], mean[1], std_pooled))

categories_qualia = pd.DataFrame(
    category_rows,
    columns=["Category", "Mean X", "Mean Y", "Standard Deviation"],
)

In [None]:
# Preview the first five per-category summary rows
display(categories_qualia.head(5))

In [None]:
# One circle per category: position is the category centroid on the two SVD
# components, size is its pooled standard deviation
chart_categories_svd = alt.Chart(categories_qualia).mark_circle(opacity=0.75)
chart_categories_svd = chart_categories_svd.encode(
    x=alt.X("Mean X:Q", title="PC1", scale=alt.Scale(domain=(-0.5, 0.5))),
    y=alt.Y("Mean Y:Q", title="PC2", scale=alt.Scale(domain=(-0.5, 0.5))),
    size=alt.Size("Standard Deviation:Q"),
    tooltip=alt.Tooltip(["Category"]),
)
chart_categories_svd = (
    chart_categories_svd
    .properties(width=400, height=400)
    .interactive()
)

display(chart_categories_svd)

In [None]:
# Keep a standalone HTML copy of the SVD chart
chart_categories_svd.save("figure_svd.html")

# Model Building

In [None]:
def model_evaluation(y_true, y_pred, labels:Optional[set]=None) -> tuple:
    """Evaluate a model, returning the confusion matrix, a classification
    report and the label order both of them use.

    Args:
        y_true (iterable): Labels representing the ground truth
        y_pred (iterable): Labels predicted from a model
        labels (Optional[set]): Labels to report on. If None, the sorted union
            of the labels found in y_true and y_pred is used.

    Returns:
        confusion_matrix (np.ndarray): Confusion matrix normalized over the
            true labels, where a[i, j] is the share of observations of class
            labels[i] that were predicted as class labels[j].
        classification_report (pd.DataFrame): Precision, recall and F1 score
            (rows) for each class and for the averages (columns). The
            "accuracy" column and the "support" row are removed.
        labels (list): The labels, in the row/column order of the matrix.
    """
    if labels is None:
        # Derive the labels from this call's own arguments (the original read
        # the notebook-global y_test) and sort them so the row/column order is
        # the same on every run.
        labels = sorted(set(y_true) | set(y_pred))
    elif isinstance(labels, (set, frozenset)):
        # sklearn expects an ordered, array-like collection of labels.
        labels = sorted(labels)
    else:
        labels = list(labels)

    c_matrix = confusion_matrix(y_true, y_pred, labels=labels, normalize="true")
    c_report = pd.DataFrame(classification_report(
        y_true,
        y_pred,
        labels=labels,
        output_dict=True
    ))

    # sklearn reports "micro avg" instead of "accuracy" when `labels` does not
    # cover every observed class, so a missing column must not be an error.
    c_report = c_report.drop(columns="accuracy", errors="ignore")
    c_report = c_report.drop(labels="support", axis=0)

    c_report.columns = [cname.capitalize() for cname in c_report.columns]
    c_report.index = [iname.capitalize() for iname in c_report.index]

    return (c_matrix, c_report, labels)

In [None]:
# Classical baselines. The tree-based models are randomized, so they are
# seeded with the same value used for the train/test split and the SVD;
# without a seed their scores change on every run.
classifiers = {
    "Decision Tree": DecisionTreeClassifier(random_state=112358),
    "Random Forest": RandomForestClassifier(n_jobs=-1, random_state=112358),
    "Logistic Regression": LogisticRegression(
        multi_class="multinomial",
        max_iter=500,
        n_jobs=-1,
        warm_start=True
    )
}

# Fit each model on the TF-IDF training matrix and evaluate it on the test set
metrics = {}
for name, classifier in classifiers.items():
    print(f"{name}...")
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    c_matrix, c_report, labels = model_evaluation(y_test, y_pred)
    metrics[name] = {
        "c_matrix": c_matrix,
        "c_report": c_report,
        "labels": labels
    }

In [None]:
def confusion_matrix_chart(
        c_matrix:np.ndarray, labels:List[str], title:Optional[str]=None
    ) -> alt.Chart:
    """Render a confusion matrix as a heat map.

    Args:
        c_matrix (np.ndarray): Matrix whose rows follow ``labels`` as ground
            truth and whose columns follow ``labels`` as predictions.
        labels (List[str]): Class names, in the row/column order of c_matrix.
        title (Optional[str]): Chart title. None results in an empty title.

    Returns:
        alt.Chart: Rect chart with ground truth on x, prediction on y and the
            matrix value as colour on a fixed 0-1 scale.
    """
    chart_title = "" if title is None else title

    # Wide table: one column of values per predicted label, one row per
    # ground-truth label.
    source = pd.DataFrame(
        {label: c_matrix[:, position] for position, label in enumerate(labels)}
    )
    source["Ground Truth"] = labels

    color = alt.Color(
        "value:Q",
        title=["Relative", "Frequency"],
        scale=alt.Scale(domain=(0, 1))
    )
    tooltip = [
        alt.Tooltip("Ground Truth:N"),
        alt.Tooltip("key:N", title="Predicted"),
        alt.Tooltip("value:Q", title="Relative Frequency")
    ]

    # transform_fold turns the per-label columns into key/value pairs.
    folded = alt.Chart(source).transform_fold(labels).mark_rect()
    encoded = folded.encode(
        x=alt.X("Ground Truth:N"),
        y=alt.Y("key:N", title="Predicted"),
        color=color,
        tooltip=tooltip,
    )
    return encoded.properties(width=300, height=300, title=chart_title)


def classification_report_charts(
        c_reports:List[pd.DataFrame], model_names:List[str], width:int=400
    ) -> alt.Chart:
    """Render classification reports as heat maps, one column per model.

    Args:
        c_reports (List[pd.DataFrame]): Reports with metrics as the index and
            categories as the columns, one per model.
        model_names (List[str]): Model names, paired with c_reports by position.
        width (int): Total width, split evenly between the models.

    Returns:
        alt.Chart: Rect chart with the metric on x, the category on y and the
            score as colour on a fixed 0-1 scale, faceted by model.
    """
    # Reshape each report to long form and tag it with its model, then
    # concatenate once: calling pd.concat inside the loop re-copies the
    # accumulated frame on every pass and starts from an empty DataFrame.
    long_reports = [
        c_report.reset_index().melt("index").assign(Model=model_name)
        for c_report, model_name in zip(c_reports, model_names)
    ]
    source = pd.concat(long_reports) if long_reports else pd.DataFrame()

    chart = (
        alt.Chart(source)
        .mark_rect()
        .encode(
            x=alt.X("index:N", title=""),
            y=alt.Y("variable:N", title="Category"),
            color=alt.Color(
                "value:Q",
                title="Score",
                scale=alt.Scale(domain=(0, 1))
            ),
            tooltip=[
                alt.Tooltip("Model:N", title="Model"),
                alt.Tooltip("index:N", title="Metric"),
                alt.Tooltip("value:Q", title="Score")
            ],
            column=alt.Column(
                "Model:N",
                spacing=20,
                header=alt.Header(labelFontSize=12)
            )
        )
        .properties(width=width / len(model_names), height=400)
    )

    return chart

In [None]:
# Unpack the per-model results into parallel lists, in insertion order
model_names = list(metrics)
c_matrices = [result["c_matrix"] for result in metrics.values()]
c_reports = [result["c_report"] for result in metrics.values()]
labels = [result["labels"] for result in metrics.values()]

In [None]:
# One confusion-matrix heat map per model
c_matrix_charts = [
    confusion_matrix_chart(matrix, matrix_labels, model_name)
    for matrix, matrix_labels, model_name in zip(c_matrices, labels, model_names)
]

# Layout: first two models side by side, third one centred underneath
ml_c_matrix_chart = alt.vconcat(
    alt.hconcat(*c_matrix_charts[:2]),
    c_matrix_charts[2],
    center=True,
).configure_title(fontSize=16, anchor="middle")

display(ml_c_matrix_chart)
ml_c_matrix_chart.save("figure_ml_cmatrix.html")

In [None]:
# Heat maps of the classification reports, one facet per classical model
c_report_chart = classification_report_charts(c_reports, model_names)

# Show the chart inline and keep a standalone HTML copy
display(c_report_chart)
c_report_chart.save("figure_ml_creport.html")

In [None]:
# Build the DistilBERT datasets from the raw "Text" column; the tokenized and
# lemmatized columns are not passed to ktrain.
# NOTE(review): val_pct=0.1 presumably holds out 10% of the rows, and no seed
# is passed, so this hold-out is likely not the same rows as the sklearn test
# split above and may differ between runs — confirm against the ktrain docs.
data_train, data_test, preprocessor = ktrain.text.texts_from_df(
    train_df=data,
    text_column="Text",
    label_columns=["Category"],
    max_features=10_000,
    maxlen=256,
    val_pct=0.1,
    ngram_range=1,
    preprocess_mode="distilbert",
    verbose=1
)

In [None]:
# Get the classifier from the preprocessor and wrap it, together with both
# datasets, in a ktrain learner using a batch size of 16.
model = preprocessor.get_classifier()
learner = ktrain.get_learner(
    model,
    train_data=data_train,
    val_data=data_test,
    batch_size=16
)

In [None]:
# Run ktrain's learning-rate finder (max_epochs=5) and plot its result.
# NOTE(review): the plot is presumably loss against learning rate, used to
# choose the lr passed to autofit below — confirm.
learner.lr_find(max_epochs=5)
learner.lr_plot()

In [None]:
# Train with lr=1e-4 and epochs=10, keeping the returned history for plotting.
# NOTE(review): early_stopping and reduce_on_plateau are presumably patience
# values in epochs and reduce_factor the learning-rate divisor — confirm
# against the ktrain documentation.
history = learner.autofit(
    lr=1e-4,
    epochs=10,
    early_stopping=2,
    reduce_on_plateau=1,
    reduce_factor=2.5
)

In [None]:
# Training and validation loss per epoch, reshaped to long form for Altair
dl_history = pd.DataFrame({
    "Loss": history.history["loss"],
    "Validation Loss": history.history["val_loss"],
})
dl_history["Epoch"] = range(1, len(dl_history) + 1)
dl_history = dl_history.melt("Epoch")

In [None]:
# Loss curves for the DistilBERT run, one line per metric.
chart_dl_loss = (
    alt.Chart(dl_history)
    .mark_line(point={"filled": False, "fill": "#141416", "size": 40})
    .encode(
        x=alt.X(
            "Epoch:Q",
            # One tick per epoch. dl_history is in long form (one row per
            # epoch AND metric), so len(dl_history) is a multiple of the epoch
            # count and must not be used to build the tick values.
            axis=alt.Axis(values=dl_history["Epoch"].unique().tolist())
        ),
        y=alt.Y("value:Q", title="Score"),
        color=alt.Color("variable:N", title="Metric"),
        tooltip=[
            alt.Tooltip("variable:N", title="Metric"),
            alt.Tooltip("value:Q", title="Score")
        ]
    )
    .properties(width=350, height=200)
    .configure_axisX(grid=False)
)

# Show the chart inline and keep a standalone HTML copy
display(chart_dl_loss)
chart_dl_loss.save("figure_dl_loss.html")

In [None]:
# Class names known to the preprocessor, keyed by their position
labels = preprocessor.get_classes()
labels_mapper = dict(enumerate(labels))

# Predict on the held-out set and translate each row's arg-max into a label
y_pred = learner.predict(data_test)
y_pred_indices = np.argmax(y_pred, axis=1)
y_pred_labels = [labels_mapper[position] for position in y_pred_indices]

In [None]:
# Rebuild the ground-truth labels batch by batch: the arg-max of each target
# row is looked up in labels_mapper
y_test = []
for x_obs, y_obs in data_test:
    y_test_indices = np.argmax(y_obs, axis=1)
    y_test_labels = [labels_mapper[position] for position in y_test_indices]
    y_test += y_test_labels

In [None]:
# Score the DistilBERT predictions with the same helper used for the classical
# models
c_matrix, c_report, labels = model_evaluation(y_test, y_pred_labels)

In [None]:
# Confusion matrix and classification report for DistilBERT, side by side
chart_c_matrix = confusion_matrix_chart(c_matrix, labels=labels, title="DistilBERT")
chart_c_report = classification_report_charts(
    c_reports=[c_report],
    model_names=["DistilBERT"],
    width=100,
)

# Each panel keeps its own colour scale
chart_dl_model = alt.hconcat(chart_c_matrix, chart_c_report)
chart_dl_model = chart_dl_model.resolve_scale(color="independent")
chart_dl_model = chart_dl_model.configure_title(fontSize=16, anchor="middle")

display(chart_dl_model)

chart_dl_model.save("figure_dl_charts.html")

# Model Saving and Deployment

In [None]:
# Bundle the trained model with its preprocessor into a ktrain predictor and
# save it under "distilbert"
predictor = ktrain.get_predictor(learner.model, preprocessor)
predictor.save("distilbert")

In [None]:
# Load model and predictor in case we need to restart the kernel
# NOTE(review): pickle.load can execute arbitrary code from the file it reads;
# only load "distilbert/tf_model.preproc" when it was written by this
# notebook's own predictor.save("distilbert") call.
with open("distilbert/tf_model.preproc", "rb") as preproc:
    preprocessor = pickle.load(preproc)
    
predictor = ktrain.load_predictor("distilbert")

In [None]:
# Example inputs from an expected input stream, used for the predictions below
x_stream = [
    "It is important to always live a healthy life with exercise and a good diet!",
    "100 Best knock-knock jokes this year! Which one is your favorite?",
    "Trump trumps his past Trump by hiring Trump to play Trump.",
    "Italy? Greece? Bangladesh? What countries should and shouldn't you visit?",
    "Learn how to teach your kids to be better kids with this 1 simple trick",
]

In [None]:
# We can predict directly from the ktrain predictor, which takes the raw
# strings...
y_pred = predictor.predict(x_stream)
print(y_pred)

In [None]:
# Or, we can preprocess manually and feed the features into the model
labels = predictor.get_classes()
labels_mapper = dict(enumerate(labels))

x_features = preprocessor.preprocess(x_stream)

# The first element of the model output holds one score row per input text
y_pred_probabilities = predictor.model.predict(x_features)[0]
y_pred_indices = np.argmax(y_pred_probabilities, axis=1)
y_pred_labels = [labels_mapper[position] for position in y_pred_indices]

print(y_pred_labels)

In [None]:
%%bash
# Zip the saved distilbert predictor directory so it can be downloaded
zip -r distilbert /kaggle/working/distilbert

In [None]:
from IPython.display import FileLink

# Last expression of the cell: a link to the archive created above
FileLink("distilbert.zip")