### __Exploration__

In [None]:
import re
import numpy as np
import pandas as pd
import tqdm
from datasets import load_dataset

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import SGDClassifier

### __Preprocessing__

In [None]:
# Load the 'train' split of the Stanford Encyclopedia of Philosophy dataset
# through the Hugging Face `datasets` loader.
dataset = load_dataset(
    "AiresPucrs/stanford-encyclopedia-philosophy",
    split='train',
)

In [None]:
# Convert the loaded dataset into a pandas DataFrame; every later cell works on `df`.
df = dataset.to_pandas()

In [None]:
# Print the column names, dtypes and non-null counts as a quick sanity check.
df.info()

In [None]:
# The 'metadata' column is not used by any later step, so remove it in place.
# NOTE: re-running this cell raises KeyError because the column is already gone.
df.drop(labels='metadata', axis='columns', inplace=True)

In [None]:
# One-time NLTK resource downloads needed by the tokenizer, the stopword list
# and the WordNet lemmatizer. The import cell at the top only pulls names out
# of nltk submodules, so run `import nltk` first before uncommenting these.
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# English stopwords held in a set so the per-token membership test is cheap.
stop_words = set(stopwords.words('english'))
# Shared normaliser instances reused for every row of the DataFrame.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_tokenize(text):
    """Lowercase *text*, delete punctuation and split it into word tokens.

    Args:
        text: The raw document string.

    Returns:
        The list of tokens produced by NLTK's ``word_tokenize`` on the
        cleaned string.
    """
    lowered = text.lower()
    # Remove every character that is neither a word character nor whitespace.
    # Nothing is inserted in its place, so "non-deep" becomes "nondeep".
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return word_tokenize(without_punctuation)

def remove_stopwords(tokens):
    """Return the tokens that are absent from the module-level ``stop_words`` set.

    Args:
        tokens: An iterable of token strings.

    Returns:
        A new list holding the surviving tokens in their original order.
    """
    return [word for word in tokens if word not in stop_words]

# Tokenise each document and drop its stopwords in one pass over the raw text.
df['tokenized_text'] = df['text'].apply(
    lambda raw_text: remove_stopwords(preprocess_tokenize(raw_text))
)

# Two normalised variants of the same tokens. Only the lemmatised one is read
# by the later cells visible in this notebook; the stemmed one is kept for
# inspection.
df['lemmatized_text'] = df['tokenized_text'].apply(
    lambda words: [lemmatizer.lemmatize(word) for word in words]
)
df['stemmed_text'] = df['tokenized_text'].apply(
    lambda words: [stemmer.stem(word) for word in words]
)

In [None]:
# Display the DataFrame to inspect the token columns added above.
df

In [None]:
# Re-join the lemmatised tokens into one space-separated string per document;
# this 'final_text' column is what the TF-IDF vectorizer consumes.
df['final_text'] = df['lemmatized_text'].apply(' '.join)

In [None]:
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),   # Range of Unigrams to Trigrams.
    max_features=15000,   # Limit to 15,000 terms.
    min_df=7,             # Ignore terms that appear in fewer than 7 documents.
    max_df=0.75,          # Ignore terms that appear in more than 75% of documents.
    use_idf=True,
    sublinear_tf=True
)

X_tfidf = vectorizer.fit_transform(df['final_text'])
y = df['category']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Balanced per-class weights computed from the training labels only.
classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train,
)
class_weight_dict = {label: weight for label, weight in zip(classes, class_weights)}

# Expand the per-class weights into one weight per training row, so they can
# be handed to partial_fit through its sample_weight argument.
sample_weight = y_train.map(class_weight_dict).to_numpy()

In [None]:
# Linear classifier with hinge loss, trained by stochastic gradient descent.
# Training is driven manually through partial_fit below, which per the
# scikit-learn documentation always runs exactly one epoch per call and
# continues from the current weights. `max_iter`, `tol` and `warm_start`
# therefore only matter if `fit` is ever called on this estimator instead.
clf = SGDClassifier(
    loss='hinge',
    max_iter=1,
    tol=None,
    warm_start=True,
    n_jobs=-1,
    random_state=42,
)

# One partial_fit call == one pass over the training set.
n_epochs = 20
epoch_progress = tqdm.tqdm(range(n_epochs), desc="Training epochs")
for _ in epoch_progress:
    clf.partial_fit(
        X_train,
        y_train,
        classes=classes,
        sample_weight=sample_weight,
    )

In [None]:
# Evaluate on the held-out split.
print("\nMaking predictions on test set...")
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Vocabulary terms aligned with the columns of clf.coef_; also read by
# predict_with_explanation.
feature_names = np.array(vectorizer.get_feature_names_out())
category_to_explain = 'abelard'  # Change as needed.
if category_to_explain not in clf.classes_:
    print(f"Category '{category_to_explain}' not found in the model's classes.")
else:
    class_index = list(clf.classes_).index(category_to_explain)
    coefficients = clf.coef_[class_index]
    top_n = 10
    # argsort is ascending, so the last `top_n` positions hold the largest weights.
    top_features_idx = np.argsort(coefficients)[-top_n:]
    top_features = feature_names[top_features_idx]

    print(f"\nTop features for category '{category_to_explain}':")
    # Walk the indices back to front to print the strongest feature first.
    for feature_idx in top_features_idx[::-1]:
        print(f"{feature_names[feature_idx]}: {coefficients[feature_idx]:.4f}")

In [None]:
def predict_with_explanation(input_text, top_n_explain=5):
    """Classify *input_text* and name the features that drove the decision.

    The text is pushed through the same preprocessing used to build the
    training column (lowercasing and punctuation removal, stopword removal,
    lemmatisation, re-joining with spaces) before it is vectorised. Without
    this, stopwords and inflected forms in the raw input would yield n-grams
    the vectorizer never saw during fitting.

    Args:
        input_text: The raw sentence or document to classify.
        top_n_explain: Maximum number of contributing features to report.

    Returns:
        A ``(predicted_category, explanation_str)`` tuple. The explanation
        lists up to ``top_n_explain`` features with a positive contribution
        (TF-IDF value times the predicted class's coefficient), strongest
        first.
    """
    # Mirror the training pipeline so inference sees the same token space.
    tokens = remove_stopwords(preprocess_tokenize(input_text))
    cleaned_text = ' '.join(lemmatizer.lemmatize(token) for token in tokens)

    X_input = vectorizer.transform([cleaned_text])
    predicted_category = clf.predict(X_input)[0]

    category_index = list(clf.classes_).index(predicted_category)
    coefficients = clf.coef_[category_index]
    # Per-feature contribution to the predicted class's decision score.
    contributions = X_input.multiply(coefficients).toarray()[0]

    # Rank from largest to smallest so the most indicative feature comes first.
    ranked_indices = np.argsort(contributions)[::-1][:top_n_explain]
    explanation_features = [
        (feature_names[i], contributions[i])
        for i in ranked_indices
        if contributions[i] > 0
    ]

    if explanation_features:
        explanation_str = f"The input was classified as '{predicted_category}' because it contains indicative features such as: "
        explanation_str += ", ".join(
            f"{feat} (contribution: {coef:.4f})" for feat, coef in explanation_features
        )
    else:
        # No known term pushed the score up, e.g. the input is out of vocabulary.
        explanation_str = (
            f"The input was classified as '{predicted_category}', but none of its "
            "terms contributed positively to that category."
        )
    return predicted_category, explanation_str

# Smoke-test the classifier on a single hand-written sentence and print the
# predicted category together with its feature-based explanation.
example_text = "In Earth's future, a global crop blight and second Dust Bowl are slowly rendering the planet uninhabitable."
pred_cat, explanation = predict_with_explanation(example_text)
print("\nPrediction and Explanation for example text:")
print("Predicted Category:", pred_cat)
print("Explanation:", explanation)

### Paragraph on why the non-deep learning method would not work __(without certain changes in criteria as shown above)__

In our attempt to address the problem statement, we found that it was not feasible to build viable text generation functionality without a neural network. Initially, we believed that an ensemble of Hidden Markov Models would be effective: despite their difficulty in handling contextual information, it seemed likely that these models could generate output with some level of overlap with the expected output for a prompt. However, the data proved ill-suited to this approach, as it was very large and highly variable. Ultimately, the non‑deep learning methods that we implemented, such as HMMs and n‑gram based models, lacked the capacity to capture long-range contextual information of the kind that larger context windows provide. The nuances of language in philosophical texts demand coherent text generation. Because these non-deep learning methods rely on local context, they often produced disjointed or repetitive outputs, and they struggled to integrate more complex syntactic structures in the absence of stopwords and pronoun detection. Furthermore, the program was inherently limited by the high dimensionality of traditional bag‑of‑words methods, making it challenging and time-consuming to scale to the breadth and depth of a real-world corpus. In contrast, deep neural networks are specifically designed to model sequential and hierarchical language patterns. Given these limitations, we instead implemented a text classifier that takes in a sentence and outputs the predicted category together with the most indicative features that led to that prediction.