In [1]:
import seaborn as sns
import sklearn as sk 
import pandas as pd

In [2]:
# Read the dialogue CSV and discard every row that has a missing value in any
# column, then preview the first five rows of the cleaned frame.
df = pd.read_csv('simpsons_dataset.csv').dropna(axis=0, how='any')
df.head(5)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [3]:
# Number of rows per speaker. value_counts() sorts in descending order, so the
# first five entries are the five speakers with the most lines.
(
    df['raw_character_text']
    .value_counts()
    .head(5)
)

Homer Simpson          27850
Marge Simpson          13172
Bart Simpson           12995
Lisa Simpson           10756
C. Montgomery Burns     3077
Name: raw_character_text, dtype: int64

Surprisingly, Bart has more lines than Lisa (12,995 vs. 10,756).

In [4]:
# Keep only the lines spoken by Bart or Lisa; all other speakers are dropped.
# (The same selection can also be written with df.loc[<boolean mask>].)
df_subset = df[df['raw_character_text'].isin(['Bart Simpson', 'Lisa Simpson'])]
df_subset.head()

Unnamed: 0,raw_character_text,spoken_words
1,Lisa Simpson,Where's Mr. Bergstrom?
3,Lisa Simpson,That life is worth living.
7,Bart Simpson,Victory party under the slide!
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
11,Lisa Simpson,Do you know where I could find him?


In [5]:
from sklearn.feature_extraction.text import CountVectorizer  # bag-of-words word counter

# Convert the spoken lines to a NumPy array of unicode strings; this array is
# the input handed to CountVectorizer below.
# NOTE(review): the original author reported that CountVectorizer failed
# without this conversion -- confirm whether it is still needed after dropna().
text = df_subset['spoken_words'].values.astype('U')

# Untrained vectorizer. stop_words='english' removes scikit-learn's built-in
# list of common English words ('the', 'and', ...), which say little about
# who is speaking.
vect = CountVectorizer(stop_words='english')
# Fitting learns the vocabulary: every distinct remaining token in the text.
vect = vect.fit(text)

# Vocabulary as a plain list of words, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(), which returns an ndarray. Prefer the new
# method when it exists and convert to a list so that slicing and printing
# behave exactly as before; fall back to the old method on older installs.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()

# Report the vocabulary size together with a small sample of the words.
print(f"There are {len(feature_names)} words in the vocabulary. A selection: {feature_names[500:520]}")

There are 14257 words in the vocabulary. A selection: ['anguished', 'angus', 'anima', 'animal', 'animals', 'animated', 'animation', 'animators', 'anka', 'ankle', 'ann', 'annapolis', 'anne', 'annie', 'anniversary', 'annnnd', 'announce', 'announcement', 'announcements', 'announcer']


In [6]:
# Encode every line as a row of word counts. The result is a sparse matrix
# with one row per line and one column per vocabulary word.
matrix = vect.transform(text)
# Printing a sparse matrix lists only its non-zero cells as "(row, column) count".
# Limit the view to the top-left 500 x 500 corner to keep the output short.
print(matrix[:500, :500])

  (23, 424)	1
  (38, 325)	1
  (43, 266)	1
  (61, 269)	1
  (72, 356)	1
  (78, 264)	1
  (80, 304)	1
  (96, 192)	1
  (98, 396)	1
  (149, 328)	1
  (154, 325)	1
  (155, 451)	1
  (161, 325)	1
  (162, 325)	1
  (184, 461)	1
  (205, 325)	1
  (208, 397)	1
  (229, 270)	1
  (235, 404)	1
  (256, 325)	1
  (284, 325)	1
  (291, 493)	1
  (292, 163)	1
  (315, 300)	1
  (318, 281)	1
  (353, 450)	1
  (355, 397)	1
  (359, 449)	1
  (363, 24)	1
  (363, 449)	1
  (381, 129)	1
  (382, 325)	1
  (383, 70)	1
  (389, 38)	1
  (389, 91)	1
  (391, 446)	1
  (393, 126)	1
  (405, 52)	1
  (405, 319)	1
  (405, 343)	1
  (408, 449)	1
  (414, 196)	1
  (422, 360)	1
  (457, 304)	1


In [7]:
# Expand the sparse counts into a dense DataFrame: each row is labelled with
# the speaker of that line and each column with its vocabulary word.
df_sub = pd.DataFrame(
    matrix.toarray(),
    index=df_subset['raw_character_text'],
    columns=feature_names,
)

In [8]:
# Peek at the first four lines for vocabulary columns 1000-1014.
df_sub.iloc[:4, 1000:1015]

Unnamed: 0_level_0,bartholemew,bartholomew,bartish,bartman,barto,bartrand,bartron,barts,barty,bas,base,baseball,based,basement,basements
raw_character_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Lisa Simpson,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Lisa Simpson,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Bart Simpson,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Lisa Simpson,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Start of exercise 2

In [51]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split

# Features: the word-count table (one row per line, one column per word).
X = df_sub.loc[:]
# Labels: the speaker of each line, in the same row order as X.
y = df_subset['raw_character_text']

In [52]:
# Hold out 30% of the lines for evaluation. The split is random, so the seed
# is fixed to make the partition -- and every score computed from it --
# reproducible across kernel restarts.
X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [53]:
# Multinomial Naive Bayes on the raw word counts, with the default settings
# written out explicitly. Fit it on the training portion only.
clf = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
clf.fit(X_train_k, y_train_k)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [54]:
# Predicted speaker for every held-out line.
clf.predict(X=X_test_k)

array(['Bart Simpson', 'Bart Simpson', 'Bart Simpson', ...,
       'Bart Simpson', 'Lisa Simpson', 'Bart Simpson'], dtype='<U12')

In [55]:
# Fraction of held-out lines whose speaker the classifier predicts correctly.
accuracy = clf.score(X=X_test_k, y=y_test_k)
print(f' The accuracy is {accuracy}.')

 The accuracy is 0.6451024417625596.
