In [None]:
LLM_exercise.

In [None]:
# Basic Vocab

# Observations/Instances/Samples
#   In machine learning, an observation refers to a single instance of data in a dataset.
#   Each observation consists of several features and, in supervised learning, a target label.
#   A sample is a subset of observations/instances in a data set.
# From: https://deepchecks.com/glossary/observation-ml/

# Target Label/Variable/Class
#   A target is a dataset variable to be predicted by an ML model.
#   This is the variable that describes the outcome of the process.
#   Broadly speaking, the terms label, class, and target may be used interchangeably.
# From: https://toloka.ai/blog/machine-learning-labels-and-features/

# Supervised vs Unsupervised Learning:
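#   Supervised learning trains a model on labeled examples; unsupervised learning
#   looks for structure in unlabeled data.
#   This notebook is a supervised-learning exercise: it leverages an LLM to
#   determine whether a sentence contains gender bias.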
# Further reading: https://www.ibm.com/think/topics/supervised-vs-unsupervised-learning

# Paper discussing types of gender bias in language: https://arxiv.org/pdf/2201.08675
# Datasets originate from:
# - locally deployed Llama 3 LLM (~554)
# - https://www.kaggle.com/datasets/dgrosz/sexist-workplace-statements (1138)

# Pre-processing ideas (see the paper linked below):
# https://arxiv.org/pdf/2111.03612
# - Replace hyphens and hashtags with whitespace
# - Replace all usernames with the string 'username'
# - Lowercase all text
# - Remove punctuation

In [3]:
# Here, we import required libraries and our raw training data.

# Numerical analysis
import numpy as np

# Data manipulation
import pandas as pd
import string

# Data visualization
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Hugging Face
from datasets import load_from_disk

# The raw data set lives in 'aggregate_data.tsv'; open it to inspect the contents.
# Every column should be self-explanatory apart from 'label':
# - 0 indicates a lack of societal gender bias.
# - 1 indicates a presence of societal gender bias.

# Read the tab-separated training data into a pandas DataFrame. This frame is
# used for data visualization. The name "df" is short for "data frame"
# (https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html).
df = pd.read_csv(filepath_or_buffer="aggregate_data.tsv", delimiter="\t")

# Load the Hugging Face dataset stored in the local "./dataset" directory.
# NOTE(review): presumably this holds the same data in `datasets` format — confirm.
dataset = load_from_disk(dataset_path="./dataset")

  from .autonotebook import tqdm as notebook_tqdm
