<a href="https://colab.research.google.com/github/bradleymclellan/stc510/blob/main/Text_Analysis_Basics_NB_Optimize.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import the necessary libraries
import json
import re
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [10]:
# Load Jeopardy! data
# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# load does not depend on the platform's default locale encoding.
with open('jeopardy.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [11]:
# Build the working DataFrame from the parsed JSON held in `data`.
df = pd.DataFrame(data=data)

In [12]:
# Clean the data
def _parse_value(raw):
    """Convert a clue value such as '$1,200' to an int.

    Strings are stripped of every non-digit character before conversion.
    Missing or unparseable values (None, NaN, a string without digits) become 0.
    """
    if isinstance(raw, str):
        digits = re.sub(r'[^\d]+', '', raw)
        # A digit-free string would make int('') raise ValueError.
        return int(digits) if digits else 0
    # Keep values that are already numeric so that re-running this cell does
    # not reset the whole column to 0.
    if isinstance(raw, (int, float, np.integer, np.floating)) and np.isfinite(raw):
        return int(raw)
    return 0


# Lowercase, drop punctuation, then collapse runs of whitespace. The whitespace
# step is chained onto the cleaned text (not the raw column) so the earlier
# cleaning is not thrown away.
df['clean_question'] = (
    df['question']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.split()
    .str.join(' ')
)
df['clean_answer'] = df['answer'].str.lower().str.replace(r'[^\w\s]', '', regex=True).str.strip()

# Load the stop-word list once; a set gives constant-time membership checks
# instead of re-reading the corpus list for every single word.
stop_words = set(nltk.corpus.stopwords.words('english'))
df['question'] = df['question'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)

df['value'] = df['value'].apply(_parse_value)

In [46]:
# Define the value categories: 'unknown' for null values, 'low_value' below
# 800, 'high_value' otherwise.
# NOTE(review): the cleaning cell maps non-string values to 0, so the null
# check below appears unreachable and such rows land in 'low_value' — confirm
# whether that is intended.
value_is_missing = df['value'].isnull()
value_tier = np.where(df['value'] < 800, 'low_value', 'high_value')
df['value_category'] = np.where(value_is_missing, 'unknown', value_tier)

In [47]:
# Split the data into training, validation and testing sets.
# First hold out 20% of the rows, then cut that hold-out in half, giving an
# 80/10/10 train/validation/test partition. The fixed seed keeps it repeatable.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
)

In [57]:
# Extract features using TF-IDF vectorizer: unigrams and bigrams, capped at
# 20,000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=20000,
)
# The vocabulary is learned from every row of df['question'].
vectors = vectorizer.fit_transform(df['question'])

In [58]:
# Baseline multinomial Naive Bayes estimator; alpha and fit_prior here are
# only the starting configuration handed to the hyperparameter search.
model = MultinomialNB(
    alpha=1.0,
    fit_prior=True,
)

In [66]:
# Hyperparameter grid: eight smoothing strengths crossed with both
# fit_prior settings (16 candidate configurations in total).
param_grid = dict(
    alpha=[0.01, 0.1, 0.5, 1, 5, 10, 50, 100],
    fit_prior=[True, False],
)

In [None]:
# Perform GridSearchCV with 5-fold cross-validation on the training split only.
# The TF-IDF vocabulary/IDF weights and the classifier are both fitted on
# train_df, so the rows in val_df and test_df stay unseen until they are
# scored; fitting on the full frame would leak them into training and
# inflate the reported accuracies.
train_vectors = vectorizer.fit_transform(train_df['question'])
grid_search = GridSearchCV(model, param_grid=param_grid, cv=5, verbose=3, n_jobs=-1)
grid_search.fit(train_vectors, train_df['value_category'])

In [68]:
# Take the best estimator found by the grid search and use it to label the
# validation and test questions.
best_model = grid_search.best_estimator_

# Validation split: vectorize with the already-fitted vectorizer, then predict.
val_vectors = vectorizer.transform(val_df['question'])
val_predictions = best_model.predict(val_vectors)

# Test split: same two steps.
test_vectors = vectorizer.transform(test_df['question'])
test_predictions = best_model.predict(test_vectors)

In [71]:
# Evaluate the performance of the model: accuracy and confusion matrix for
# each held-out split, comparing true categories against predictions.
val_accuracy = accuracy_score(y_true=val_df['value_category'], y_pred=val_predictions)
val_confusion_matrix = confusion_matrix(y_true=val_df['value_category'], y_pred=val_predictions)

test_accuracy = accuracy_score(y_true=test_df['value_category'], y_pred=test_predictions)
test_confusion_matrix = confusion_matrix(y_true=test_df['value_category'], y_pred=test_predictions)

In [None]:
# Print the results: accuracies first, then the confusion matrices.
for caption, result in (
    ('Validation accuracy:', val_accuracy),
    ('Test accuracy:', test_accuracy),
    ('Validation confusion matrix:\n', val_confusion_matrix),
    ('Test confusion matrix:\n', test_confusion_matrix),
):
    print(caption, result)

In [74]:
# Write a one-row summary (model name plus both accuracies) to a CSV file,
# without the index column.
summary_df = pd.DataFrame([
    {
        'Model': 'Naive Bayes Classifier',
        'Validation Accuracy': val_accuracy,
        'Test Accuracy': test_accuracy,
    }
])
summary_df.to_csv('NB_Optimize_Summary.csv', index=False)