<a href="https://colab.research.google.com/github/bradleymclellan/stc510/blob/main/Text_Analysis_Essentials_Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import the necessary modules
import json
import re

import nltk
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Fetch the NLTK resources needed for tokenizing, POS tagging, and stopword removal
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

In [None]:
# Load Jeopardy! data
# Read the file as UTF-8 explicitly: without an encoding argument open() falls back to
# the platform's locale encoding, which can mis-decode or reject non-ASCII clue text.
with open('jeopardy.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Turn the parsed JSON records into a tabular DataFrame
df = pd.DataFrame(data=data)

In [None]:
# Clean the data: lowercase the questions, strip punctuation, drop stopwords,
# and turn the raw dollar value into numeric and categorical columns
stop_words = set(stopwords.words('english'))


def _parse_value(raw):
    """Convert a raw clue value such as '$1,000' to an int.

    Non-string inputs, and strings that contain no digits at all, map to 0
    instead of raising.
    """
    if not isinstance(raw, str):
        return 0
    digits = re.sub(r'[^\d]', '', raw)
    return int(digits) if digits else 0


df['clean_question'] = df['question'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))
# After this step each clean_question is a list of non-stopword tokens, not a string
df['clean_question'] = df['clean_question'].apply(lambda x: [word for word in x.split() if word not in stop_words])
df['value'] = df['value'].apply(_parse_value)
# include_lowest=True keeps rows whose value is 0 in the 'low' bin; without it the
# first interval is (0, 1000] and those rows silently become NaN.
# NOTE(review): value_category splits at 1000 while high_value splits at 800 — confirm
# that the two different thresholds are intentional.
df['value_category'] = pd.cut(df['value'], bins=[0, 1000, np.inf], labels=['low', 'high'], include_lowest=True)
df['high_value'] = np.where(df['value'] >= 800, 1, 0)
df = df.drop(['category', 'air_date', 'show_number', 'value'], axis=1)

In [None]:
# Tokenize and POS-tag the cleaned questions, then label each one as high- or low-value
df['tokens'] = df['clean_question'].apply(lambda x: nltk.word_tokenize(' '.join(x)))
df['tags'] = df['tokens'].apply(nltk.pos_tag)
# Keep only the POS tag from each (token, tag) pair
df['categories'] = df['tags'].apply(lambda tagged: [pos for _, pos in tagged])
# The target is taken from the clue's dollar value (the high_value flag), not from the
# POS tags: a label computed from the question text is a deterministic function of the
# features, so the classifiers would only learn to detect numbers/adjectives rather
# than anything about the clue's value.
df['question_type'] = np.where(df['high_value'] == 1, 'high_value', 'low_value')

In [None]:
# Text-vectorization pipeline: a single TF-IDF step over 1- to 3-word n-grams
text_preprocessor = Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 3)))])

In [None]:
# Hold out 20% of the questions for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_question'],
    df['question_type'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the label -> integer mapping from the training labels only,
# then encode both splits with that same mapping
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

In [None]:
# Vectorize the questions: each token list is re-joined into one string, the pipeline
# is fitted on the training split only and then applied unchanged to the test split
X_train_preprocessed = text_preprocessor.fit_transform(list(map(' '.join, X_train)))
X_test_preprocessed = text_preprocessor.transform(list(map(' '.join, X_test)))

In [None]:
# Base estimators that will later vote in the ensemble:
# a linear-kernel SVM, a 100-tree random forest, and a gradient-boosted tree model
svm_model = SVC(kernel='linear')
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
xgb_model = XGBClassifier(
    objective='binary:logistic',
    random_state=42,
)

In [None]:
# Train every base estimator on the vectorized training split
for _estimator in (svm_model, rf_model, xgb_model):
    _estimator.fit(X_train_preprocessed, y_train)

In [None]:
# Predict with each fitted model on both splits (train first, then test)
svm_train_preds, svm_test_preds = (
    svm_model.predict(split) for split in (X_train_preprocessed, X_test_preprocessed)
)

rf_train_preds, rf_test_preds = (
    rf_model.predict(split) for split in (X_train_preprocessed, X_test_preprocessed)
)

xgb_train_preds, xgb_test_preds = (
    xgb_model.predict(split) for split in (X_train_preprocessed, X_test_preprocessed)
)

In [None]:
# Pair each estimator with a short name; this (name, estimator) list is the format
# VotingClassifier expects and is also iterated when evaluating the individual models
models = list(zip(('svm', 'rf', 'xgb'), (svm_model, rf_model, xgb_model)))

In [None]:
# Evaluate each model on the training and test splits.
# scores[name] is a tuple: (train_accuracy, test_accuracy, train_report, test_report).
scores = {}
for name, clf in models:
    # The estimators were already fitted on this same training data in the earlier
    # fitting cell; calling fit() again here would only repeat that (slow) work and
    # reproduce the same models, so only prediction and scoring happen in this loop.
    y_pred_train = clf.predict(X_train_preprocessed)
    y_pred_test = clf.predict(X_test_preprocessed)
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    train_report = classification_report(y_train, y_pred_train)
    test_report = classification_report(y_test, y_pred_test)
    scores[name] = (train_accuracy, test_accuracy, train_report, test_report)

In [None]:
# Build a majority-vote ensemble over the three base estimators, train it,
# and score its predictions on the held-out test split
ensemble = VotingClassifier(voting='hard', estimators=models).fit(X_train_preprocessed, y_train)
ensemble_predictions = ensemble.predict(X_test_preprocessed)
ensemble_report = classification_report(y_test, ensemble_predictions)
ensemble_accuracy = accuracy_score(y_test, ensemble_predictions)

In [None]:
# Report per-model results first, then the ensemble's.
# scores[name] holds (train accuracy, test accuracy, train report, test report);
# only the test-split classification report is printed for each model.
for name, _ in models:
    print(name.upper())
    print(f"{name} training accuracy: {scores[name][0]}")
    print(f"{name} testing accuracy: {scores[name][1]}")
    print(f"{name} classification report:\n{scores[name][3]}")
    print('\n')

print("ENSEMBLE")
print(f"Ensemble accuracy: {ensemble_accuracy}")
print(f"Ensemble classification report:\n{ensemble_report}")
# precision_score is called with its defaults here (binary averaging)
print(f"Ensemble precision score: {precision_score(y_test, ensemble_predictions)}")


In [None]:
# Output to a CSV a summary of the overall findings.
# Predict once per model (in the same order as the 'Model' column) and reuse those
# predictions for every metric instead of calling predict() separately per metric.
test_predictions = [clf.predict(X_test_preprocessed) for _, clf in models] + [ensemble_predictions]

# Every row uses the same (default, binary) averaging so the models are directly
# comparable; previously the ensemble's precision/recall were 'weighted' while the
# individual models' were binary.
# NOTE(review): binary averaging scores the class encoded as 1. LabelEncoder orders
# classes alphabetically, so that is presumably 'low_value' — confirm that this is the
# class you want to report on.
summary = pd.DataFrame({
    'Model': ['SVM', 'RF', 'XGB', 'Ensemble'],
    'Accuracy': [scores[name][1] for name, _ in models] + [ensemble_accuracy],
    'Precision': [precision_score(y_test, preds) for preds in test_predictions],
    'Recall': [recall_score(y_test, preds) for preds in test_predictions],
    'F1 Score': [f1_score(y_test, preds) for preds in test_predictions],
})
summary.to_csv('Ensemble_Model_Summary.csv', index=False)