### __Exploration__

In [1]:
import re
import numpy as np
import pandas as pd
import tqdm
from datasets import load_dataset

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import SGDClassifier

  from .autonotebook import tqdm as notebook_tqdm


### __Preprocessing__

In [2]:
dataset = load_dataset("AiresPucrs/stanford-encyclopedia-philosophy", split = 'train')

In [3]:
df = dataset.to_pandas()

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182531 entries, 0 to 182530
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   metadata  182531 non-null  object
 1   text      182531 non-null  object
 2   category  182531 non-null  object
dtypes: object(3)
memory usage: 4.2+ MB


In [5]:
df.drop(columns=['metadata'], inplace=True)

In [6]:
# Download NLTK resources if you haven't already
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Create a set of English stopwords
stop_words = set(stopwords.words('english'))

# Initialize lemmatizer and stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Define a function to clean and tokenize the text
def preprocess_tokenize(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation using regex
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the cleaned text
    tokens = word_tokenize(text)
    return tokens

# Function to remove stopwords from a list of tokens
def remove_stopwords(tokens):
    return [token for token in tokens if token not in stop_words]

# Assume your DataFrame is already loaded as df and has a column named 'text'
# Create a new column for tokenized text
df['tokenized_text'] = df['text'].apply(preprocess_tokenize)
# Remove stopwords from the tokenized text
df['tokenized_text'] = df['tokenized_text'].apply(remove_stopwords)

# Create a new column for lemmatized text
df['lemmatized_text'] = df['tokenized_text'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

# Create a new column for stemmed text
df['stemmed_text'] = df['tokenized_text'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])

In [7]:
df

Unnamed: 0,text,category,tokenized_text,lemmatized_text,stemmed_text
0,"In the philosophical literature, the term “ab...",abduction,"[philosophical, literature, term, abduction, u...","[philosophical, literature, term, abduction, u...","[philosoph, literatur, term, abduct, use, two,..."
1,This entry is exclusively concerned with abdu...,abduction,"[entry, exclusively, concerned, abduction, mod...","[entry, exclusively, concerned, abduction, mod...","[entri, exclus, concern, abduct, modern, sens,..."
2,"See also the entry on scientific discovery, ...",abduction,"[see, also, entry, scientific, discovery, part...","[see, also, entry, scientific, discovery, part...","[see, also, entri, scientif, discoveri, partic..."
3,Most philosophers agree that abduction (in th...,abduction,"[philosophers, agree, abduction, sense, infere...","[philosopher, agree, abduction, sense, inferen...","[philosoph, agre, abduct, sens, infer, best, e..."
4,You happen to know that Tim and Harry have re...,abduction,"[happen, know, tim, harry, recently, terrible,...","[happen, know, tim, harry, recently, terrible,...","[happen, know, tim, harri, recent, terribl, ro..."
...,...,...,...,...,...
182526,Many thanks to David Chalmers and to Bill Fis...,zombies,"[many, thanks, david, chalmers, bill, fish, va...","[many, thanks, david, chalmers, bill, fish, va...","[mani, thank, david, chalmer, bill, fish, valu..."
182527,Copyright © 2019 by Robert Kirk &lt;Robert....,zombies,"[copyright, 2019, robert, kirk, ltrobertkirkno...","[copyright, 2019, robert, kirk, ltrobertkirkno...","[copyright, 2019, robert, kirk, ltrobertkirkno..."
182528,View this site from another server:,zombies,"[view, site, another, server]","[view, site, another, server]","[view, site, anoth, server]"
182529,The Stanford Encyclopedia of Philosophy is cop...,zombies,"[stanford, encyclopedia, philosophy, copyright...","[stanford, encyclopedia, philosophy, copyright...","[stanford, encyclopedia, philosophi, copyright..."


In [8]:
# Create a new column 'final_text' from the 'lemmatized_text' tokens.
df['final_text'] = df['lemmatized_text'].apply(lambda tokens: ' '.join(tokens))

In [9]:
# --- Feature Extraction ---
# Set up the TF-IDF vectorizer with unigrams and bigrams.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),   # Use unigrams and bigrams.
    max_features=15000,   # Limit to 10,000 terms.
    min_df=7,             # Ignore terms that appear in fewer than 5 documents.
    max_df=0.75,           # Ignore terms that appear in more than 70% of documents.
    use_idf=True,
    sublinear_tf=True
)

# Fit and transform the text.
X_tfidf = vectorizer.fit_transform(df['final_text'])
y = df['category']

In [10]:
# --- Split Data ---
# Use stratification to preserve class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)

In [11]:
# --- Compute Class Weights ---
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))
sample_weight = np.array([class_weight_dict[label] for label in y_train])

In [12]:
# --- Model Training ---
# Initialize the SGDClassifier with loss='log_loss' for logistic regression.
clf = SGDClassifier(
    loss='hinge',     # Logistic regression loss.
    max_iter=1,          # One iteration per call to partial_fit.
    tol=None,            # Disable internal early stopping.
    warm_start=True,     # Retain model state between partial_fit calls.
    n_jobs=-1,
    random_state=42
)

# Train over a fixed number of epochs with a progress bar.
n_epochs = 20
print("Training model with progress bar...")
for epoch in tqdm.tqdm(range(n_epochs), desc="Training epochs"):
    clf.partial_fit(X_train, y_train, sample_weight=sample_weight, classes=classes)

Training model with progress bar...


Training epochs: 100%|██████████| 20/20 [04:54<00:00, 14.73s/it]


In [13]:
# --- Evaluation ---
print("\nMaking predictions on test set...")
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# --- Explanation of a Specific Category ---
feature_names = np.array(vectorizer.get_feature_names_out())
category_to_explain = 'abelard'  # Change as needed.
if category_to_explain in clf.classes_:
    class_index = list(clf.classes_).index(category_to_explain)
    coefficients = clf.coef_[class_index]
    top_n = 10
    top_features_idx = np.argsort(coefficients)[-top_n:]
    top_features = feature_names[top_features_idx]
    
    print(f"\nTop features for category '{category_to_explain}':")
    for i, feature in enumerate(reversed(top_features)):
        coef = coefficients[top_features_idx[-(i+1)]]
        print(f"{feature}: {coef:.4f}")
else:
    print(f"Category '{category_to_explain}' not found in the model's classes.")


Making predictions on test set...
                                     precision    recall  f1-score   support

                 18thGerman-preKant       0.54      0.41      0.47        17
                          abduction       0.61      0.79      0.69        14
                            abelard       0.81      0.89      0.85        19
                         abhidharma       0.73      0.67      0.70        12
                          abilities       0.79      0.73      0.76        26
                       abner-burgos       0.55      0.75      0.63         8
                          abrabanel       0.82      0.64      0.72        14
                       abraham-daud       0.78      0.78      0.78         9
                   abstract-objects       0.95      0.70      0.81        30
                   abu-bakr-al-razi       0.73      1.00      0.84         8
                             action       0.71      0.59      0.65        17
                  action-perception     

In [14]:
# --- Prediction Function with Explanation ---
def predict_with_explanation(input_text):
    """
    Accepts an input string (expected to be raw text) and returns the predicted category
    along with a human-readable explanation of the decision.
    """
    # Transform the input text using the same vectorizer.
    X_input = vectorizer.transform([input_text])
    predicted_category = clf.predict(X_input)[0]
    
    # Find the index of the predicted category.
    category_index = list(clf.classes_).index(predicted_category)
    # Get the coefficients for that category.
    coefficients = clf.coef_[category_index]
    
    # Compute contributions for each feature: multiply TF-IDF value with the corresponding coefficient.
    # Since X_input is sparse, we use elementwise multiplication.
    contributions = X_input.multiply(coefficients).toarray()[0]
    
    # Get indices of features with the highest positive contributions.
    top_n_explain = 5
    top_indices = np.argsort(contributions)[-top_n_explain:]
    explanation_features = [(feature_names[i], contributions[i]) for i in top_indices if contributions[i] > 0]
    
    explanation_str = f"The input was classified as '{predicted_category}' because it contains indicative features such as: "
    explanation_str += ", ".join([f"{feat} (contribution: {coef:.4f})" for feat, coef in explanation_features])
    return predicted_category, explanation_str

# --- Example Usage ---
example_text = "In Earth's future, a global crop blight and second Dust Bowl are slowly rendering the planet uninhabitable."
pred_cat, explanation = predict_with_explanation(example_text)
print("\nPrediction and Explanation for example text:")
print("Predicted Category:", pred_cat)
print("Explanation:", explanation)


Prediction and Explanation for example text:
Predicted Category: world-government
Explanation: The input was classified as 'world-government' because it contains indicative features such as: future (contribution: 0.0018), planet (contribution: 0.0024), earth (contribution: 0.0119), global (contribution: 0.0261)


### Paragraph on why the non-deep learning method would not work __(without certain changes in criteria as shown above)__

In our attempt to address the problem statement, we found that it was impossible to create viable text generation functionality without the use of a neural network. Initially, we had believed that the use of an ensemble of Hidden Markov Models would be effective. Despite issues in addressing contextual information, it was likely that these models would be able to generate an output that had some level of overlap with the expected output of a prompt. Despite this, the nature of the data proved to be insufficient for this application as it was very large and highly variable. Ultimately, the non‑deep learning methods such as HMMs and n‑gram based models that we implemented lacked the capacity to capture contextual information from long ago, reminiscent of larger context windows. The nuances of language for philosophical texts required coherent text generation. The reliance on local context for these non-deep learning methods meant that they often produced disjointed or repetitive outputs, and they struggled to integrate more complex syntactic structures in the absence of stopwords and pronoun detection. Furthermore, the program was inherently limited by the high dimensionality of traditional bag‑of‑words methods, making it challenging and time consuming to scale to the breadth and depth of a real-world corpus. In contrast, deep neural networks are specifically designed to model sequential and hierarchical language patterns. As such, we implemented a text classification implementation that takes in a sentence and outputs the predicted category with the most important indicative features that led to that conclusion.