<a href="https://colab.research.google.com/github/asnoldy02-cell/sds510/blob/main/Module_5_Essentials.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
import pandas as pd
import json
import io
import numpy as np
import re

print("Upload your jeopardy.json file...")
uploaded = files.upload()

filename = next(iter(uploaded))
raw_text = uploaded[filename].decode("utf-8")


Upload your jeopardy.json file...


Saving jeopardy.json to jeopardy.json


In [2]:
try:
    data = json.loads(raw_text)
    df = pd.DataFrame(data)
    print("Loaded JSON as a list of objects.")
except Exception:

    df = pd.read_json(io.BytesIO(uploaded[filename]), lines=True)
    print("Loaded JSON as line-delimited NDJSON.")

print("\nPreview of data:")
print(df.head())
print("\nColumns:", df.columns.tolist())


Loaded JSON as a list of objects.

Preview of data:
                          category    air_date  \
0                          HISTORY  2004-12-31   
1  ESPN's TOP 10 ALL-TIME ATHLETES  2004-12-31   
2      EVERYBODY TALKS ABOUT IT...  2004-12-31   
3                 THE COMPANY LINE  2004-12-31   
4              EPITAPHS & TRIBUTES  2004-12-31   

                                            question value       answer  \
0  'For the last 8 years of his life, Galileo was...  $200   Copernicus   
1  'No. 2: 1912 Olympian; football star at Carlis...  $200   Jim Thorpe   
2  'The city of Yuma in this state has a record a...  $200      Arizona   
3  'In 1963, live on "The Art Linkletter Show", t...  $200  McDonald\'s   
4  'Signer of the Dec. of Indep., framer of the C...  $200   John Adams   

       round show_number  
0  Jeopardy!        4680  
1  Jeopardy!        4680  
2  Jeopardy!        4680  
3  Jeopardy!        4680  
4  Jeopardy!        4680  

Columns: ['category', 'air_date',

In [3]:
def clean_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

question_col = None
for col in df.columns:
    if col.lower() == "question":
        question_col = col

if question_col is None:
    raise ValueError("No 'question' or 'Question' column found in JSON.")

df["Question"] = df[question_col].apply(clean_text)


In [4]:
value_col = None
for col in df.columns:
    if col.lower() == "value":
        value_col = col

if value_col is None:
    raise ValueError("No 'value' or 'Value' column found in JSON.")

def parse_value(val):
    if isinstance(val, str):
        val = val.replace("$", "").replace(",", "")
        if val.lower() == "none" or val == "":
            return np.nan
        try:
            return int(val)
        except:
            return np.nan
    return np.nan

df["ValueNum"] = df[value_col].apply(parse_value)
df = df.dropna(subset=["ValueNum"])

median_value = df["ValueNum"].median()
df["HighValue"] = (df["ValueNum"] > median_value).astype(int)

print("\nMedian value:", median_value)
print("Class counts:\n", df["HighValue"].value_counts())



Median value: 600.0
Class counts:
 HighValue
0    119805
1     93491
Name: count, dtype: int64


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X = df["Question"]
y = df["HighValue"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,2), min_df=2)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

nb = MultinomialNB()
nb.fit(X_train_vec, y_train)

nb_preds = nb.predict(X_test_vec)
nb_acc = accuracy_score(y_test, nb_preds)

print("\n===================================")
print("Naive Bayes Classifier Performance")
print("===================================")
print(f"Accuracy: {nb_acc:.4f}\n")
print(classification_report(y_test, nb_preds))


Naive Bayes Classifier Performance
Accuracy: 0.5895

              precision    recall  f1-score   support

           0       0.59      0.84      0.70     23961
           1       0.57      0.26      0.36     18699

    accuracy                           0.59     42660
   macro avg       0.58      0.55      0.53     42660
weighted avg       0.58      0.59      0.55     42660



In [7]:
from sklearn.svm import LinearSVC

svm_clf = LinearSVC()
svm_clf.fit(X_train_vec, y_train)

svm_preds = svm_clf.predict(X_test_vec)
svm_acc = accuracy_score(y_test, svm_preds)

print("\n===================================")
print("Linear SVM Classifier Performance")
print("===================================")
print(f"Accuracy: {svm_acc:.4f}\n")
print(classification_report(y_test, svm_preds))



Linear SVM Classifier Performance
Accuracy: 0.5719

              precision    recall  f1-score   support

           0       0.61      0.65      0.63     23961
           1       0.51      0.47      0.49     18699

    accuracy                           0.57     42660
   macro avg       0.56      0.56      0.56     42660
weighted avg       0.57      0.57      0.57     42660



In [8]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


In [9]:
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_tok = tokenizer.texts_to_sequences(X_train)
X_test_tok = tokenizer.texts_to_sequences(X_test)

max_len = 40
X_train_pad = pad_sequences(X_train_tok, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_tok, maxlen=max_len, padding='post')


In [10]:
model = Sequential([
    Embedding(input_dim=20000, output_dim=64, input_length=max_len),
    LSTM(64, return_sequences=False),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [13]:
history = model.fit(
    X_train_pad,
    y_train,
    validation_split=0.2,
    epochs=4,
    batch_size=64
)

Epoch 1/4
[1m2133/2133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 57ms/step - accuracy: 0.7421 - loss: 0.5036 - val_accuracy: 0.5747 - val_loss: 0.7974
Epoch 2/4
[1m2133/2133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 52ms/step - accuracy: 0.7771 - loss: 0.4451 - val_accuracy: 0.5629 - val_loss: 0.8730
Epoch 3/4
[1m2133/2133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 53ms/step - accuracy: 0.8160 - loss: 0.3812 - val_accuracy: 0.5476 - val_loss: 0.9697
Epoch 4/4
[1m2133/2133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 51ms/step - accuracy: 0.8491 - loss: 0.3189 - val_accuracy: 0.5524 - val_loss: 1.1354


In [14]:
lstm_preds = (model.predict(X_test_pad) > 0.5).astype(int).reshape(-1)
lstm_acc = accuracy_score(y_test, lstm_preds)

print("\n===================================")
print("LSTM Deep Learning Classifier")
print("===================================")
print(f"Accuracy: {lstm_acc:.4f}\n")
print(classification_report(y_test, lstm_preds))


[1m1334/1334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step

LSTM Deep Learning Classifier
Accuracy: 0.5585

              precision    recall  f1-score   support

           0       0.60      0.62      0.61     23961
           1       0.50      0.48      0.49     18699

    accuracy                           0.56     42660
   macro avg       0.55      0.55      0.55     42660
weighted avg       0.56      0.56      0.56     42660

