<a href="https://colab.research.google.com/github/casselnoel/stc510/blob/main/Module5Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score 
from nltk.stem.porter import PorterStemmer
import pandas as pd
import numpy as np
import os
import re
from string import punctuation

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
jeopardy = pd.read_json('jeopardy.json') #https://www.dropbox.com/s/x7z8bk4vgowohp9/jeopardy.json?dl=0

In [3]:
jeopardy.head()

Unnamed: 0,category,air_date,question,value,answer,round,show_number
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",$200,Copernicus,Jeopardy!,4680
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jim Thorpe,Jeopardy!,4680
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,$200,Arizona,Jeopardy!,4680
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,McDonald\'s,Jeopardy!,4680
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",$200,John Adams,Jeopardy!,4680


In [4]:
jeopardy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   category     216930 non-null  object
 1   air_date     216930 non-null  object
 2   question     216930 non-null  object
 3   value        213296 non-null  object
 4   answer       216930 non-null  object
 5   round        216930 non-null  object
 6   show_number  216930 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 11.6+ MB


In [5]:
jeopardy.value.value_counts()

$400      42244
$800      31860
$200      30455
$600      20377
$1000     19539
          ...  
$20           1
$2,990        1
$5,700        1
$50           1
$9,800        1
Name: value, Length: 149, dtype: int64

In [6]:
len(jeopardy)

216930

In [7]:
questions = jeopardy['question']

In [8]:
questions.head()

0    'For the last 8 years of his life, Galileo was...
1    'No. 2: 1912 Olympian; football star at Carlis...
2    'The city of Yuma in this state has a record a...
3    'In 1963, live on "The Art Linkletter Show", t...
4    'Signer of the Dec. of Indep., framer of the C...
Name: question, dtype: object

In [12]:
highvalue = jeopardy[jeopardy.value=='$2000']

In [13]:
highvalue.head()

Unnamed: 0,category,air_date,question,value,answer,round,show_number
50,DR. SEUSS AT THE MULTIPLEX,2004-12-31,"'<a href=""http://www.j-archive.com/media/2004-...",$2000,Bartholomew Cubbins,Double Jeopardy!,4680
51,AIRLINE TRAVEL,2004-12-31,'In the seat pocket you'll find the catalog ca...,$2000,Mall,Double Jeopardy!,4680
53,MUSICAL TRAINS,2004-12-31,"'In 1961 James Brown announced ""all aboard"" fo...",$2000,"""Night Train""",Double Jeopardy!,4680
54,"""X""s & ""O""s",2004-12-31,'This 1797 imbroglio began when 3 French agent...,$2000,the XYZ Affair,Double Jeopardy!,4680
110,SCIENCE CLASS,2010-07-06,'Lava & igneous rock are formed from this hot ...,$2000,magma,Double Jeopardy!,5957


In [18]:
#Tokenizing
porter = PorterStemmer()
stop = stopwords.words('english')
def tokenizer_porter(questions):
	return [porter.stem(word) for word in text.split() if word not in stop ]

In [14]:
highvalueqs = highvalue['question']

In [15]:
highvalueqs.head()

50     '<a href="http://www.j-archive.com/media/2004-...
51     'In the seat pocket you'll find the catalog ca...
53     'In 1961 James Brown announced "all aboard" fo...
54     'This 1797 imbroglio began when 3 French agent...
110    'Lava & igneous rock are formed from this hot ...
Name: question, dtype: object

In [16]:
len(highvalueqs)

11243

In [21]:
# Create a new column 'value_weight' with encoded categories can use to classify high or low value with no cleaning of numbers...woot!
jeopardy['value_weight'] = jeopardy['value'].factorize()[0]
value_df = jeopardy[['value', 'value_weight']] #new df containing value amounts and the value wright classification, considering 0-4 low value and 5-8 high value (None was classified as -1)


# Dictionaries for future use
value_to_id = dict(value_df.values)
id_to_question = dict(value_df[['value_weight', 'value']].values)

# New dataframe
jeopardy.head()

Unnamed: 0,category,air_date,question,value,answer,round,show_number,value_weight
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",$200,Copernicus,Jeopardy!,4680,0
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jim Thorpe,Jeopardy!,4680,0
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,$200,Arizona,Jeopardy!,4680,0
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,McDonald\'s,Jeopardy!,4680,0
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",$200,John Adams,Jeopardy!,4680,0


In [23]:
value_df.head()

Unnamed: 0,value,value_weight
0,$200,0
1,$200,0
2,$200,0
3,$200,0
4,$200,0


In [24]:
value_df.tail()

Unnamed: 0,value,value_weight
216925,$2000,8
216926,$2000,8
216927,$2000,8
216928,$2000,8
216929,,-1


In [25]:
value_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   value         213296 non-null  object
 1   value_weight  216930 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.3+ MB


In [46]:
jeopardy['high_low'] = jeopardy['value_weight']
jeopardy['high_low'].loc[jeopardy['high_low'] >=5] = 'High'
jeopardy['high_low'].loc[jeopardy['high_low'] == 0] = 'Low'
jeopardy['high_low'].loc[jeopardy['high_low'] == 1] = 'Low'
jeopardy['high_low'].loc[jeopardy['high_low'] == 2] = 'Low'
jeopardy['high_low'].loc[jeopardy['high_low'] == 3] = 'Low'
jeopardy['high_low'].loc[jeopardy['high_low'] == 4] = 'Low'
jeopardy.tail()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,category,air_date,question,value,answer,round,show_number,value_weight,high_low
216924,OFF-BROADWAY,2006-05-11,'In 2006 the cast of this long-running hit emb...,$2000,Stomp,Double Jeopardy!,4999,8,High
216925,RIDDLE ME THIS,2006-05-11,'This Puccini opera turns on the solution to 3...,$2000,Turandot,Double Jeopardy!,4999,8,High
216926,"""T"" BIRDS",2006-05-11,'In North America this term is properly applie...,$2000,a titmouse,Double Jeopardy!,4999,8,High
216927,AUTHORS IN THEIR YOUTH,2006-05-11,"'In Penny Lane, where this ""Hellraiser"" grew u...",$2000,Clive Barker,Double Jeopardy!,4999,8,High
216928,QUOTATIONS,2006-05-11,"'From Ft. Sill, Okla. he made the plea, Arizon...",$2000,Geronimo,Double Jeopardy!,4999,8,High


In [48]:
jeopardy=jeopardy.dropna()

In [49]:
jeopardy.high_low.value_counts()

Low     126522
High     86774
Name: high_low, dtype: int64

In [55]:
X_train, X_test, y_train, y_test = train_test_split(jeopardy.question, jeopardy.high_low, 
						random_state=1)

tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tf = tfidf_vectorizer.fit_transform(X_train)
X_test_tf = tfidf_vectorizer.transform(X_test)

In [58]:
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tf, y_train) #training, finding the likelihood of these vectors predicting it being a hard question
predictions = naive_bayes.predict(X_test_tf) #how well doing using naive bayes

print('Accuracy: ', accuracy_score(y_test, predictions)) #how well you predict a question being hard


Accuracy:  0.5944790338309204
