In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import random
import re
from sklearn.metrics import pairwise_distances
from sacremoses import MosesDetokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
from autocorrect import Speller
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter

In [2]:
# Import the South Park script; "All-seasons.csv" is resolved relative to the working directory
southpark = pd.read_csv("All-seasons.csv")
# Preview the first rows
southpark.head()

Unnamed: 0,Season,Episode,Character,Line
0,1,1,Boys,"School day, school day, teacher's golden ru...\n"
1,1,1,Kyle,"Ah, damn it! My little brother's trying to fol..."
2,1,1,Ike,Zeeponanner.\n
3,1,1,Kyle,"Ike, you can't come to school with me. \n"
4,1,1,Cartman,"Yeah, go home you little dildo.\n"


# Cartman

In [3]:
# Keep only the rows whose Character value is exactly 'Cartman'
cart = southpark['Character'].eq('Cartman')
cartman = southpark.loc[cart]
cartman.head()

Unnamed: 0,Season,Episode,Character,Line
4,1,1,Cartman,"Yeah, go home you little dildo.\n"
8,1,1,Cartman,I know what it means!\n
10,1,1,Cartman,I'm not telling you.\n
13,1,1,Cartman,"He-yeah, that's what Kyle's little brother is ..."
19,1,1,Cartman,That's 'cause I was having these... bogus nigh...


In [4]:
# Reset index so the row labels run 0..n-1; cartman_chat_bow / cartman_chat_tfidf
# look a line up with .loc using the argmax position of a similarity column.
cartman = cartman.reset_index(drop=True)
cartman.head()

Unnamed: 0,Season,Episode,Character,Line
0,1,1,Cartman,"Yeah, go home you little dildo.\n"
1,1,1,Cartman,I know what it means!\n
2,1,1,Cartman,I'm not telling you.\n
3,1,1,Cartman,"He-yeah, that's what Kyle's little brother is ..."
4,1,1,Cartman,That's 'cause I was having these... bogus nigh...


In [5]:
# Expand Contractions
def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the new string.

    The two irregular forms ("won't", "can't") are handled first and matched
    case-insensitively: this function runs before the text is lower-cased, and
    a case-sensitive match would let "Won't" / "Can't" fall through to the
    generic "n't" rule and come out as "Wo not" / "Ca not".

    Known limitation: "'s" is always expanded to " is", so possessives are
    rewritten as well (e.g. "Kyle's little brother" -> "Kyle is little brother").
    """
    def _keep_case(replacement):
        # Build a re.sub callback that capitalises the replacement when the
        # matched contraction started with an upper-case letter.
        def _substitute(match):
            if match.group(0)[0].isupper():
                return replacement.capitalize()
            return replacement
        return _substitute

    # specific
    phrase = re.sub(r"won\'t", _keep_case("will not"), phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"can\'t", _keep_case("can not"), phrase, flags=re.IGNORECASE)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase
# Write the contraction-expanded text to a working column; the raw 'Line' column is left as is
cartman['Line_process'] = cartman['Line'].map(decontracted)
cartman['Line_process'].head(25)

0                     Yeah, go home you little dildo.\n
1                               I know what it means!\n
2                               I am not telling you.\n
3     He-yeah, that is what Kyle is little brother i...
4     That is 'cause I was having these... bogus nig...
5     Well, I dreamt that I was lying in my bed...  ...
6                                               What?\n
7            No, it was just a dream, my mom said so.\n
8     Oh, shut up guys! You are just trying to make ...
9                                           Kick ass.\n
10                                               Huh?\n
11    Eh, no, that, that was just a dream. And I am ...
12                                                Oh!\n
13                                                Oh!\n
14         No! Uh-I mean, eh, why would they do that?\n
15                                                No!\n
16                                    Shut up, dildo!\n
17                                              

In [6]:
# Lowercase every line and collapse each run of whitespace to a single space
cartman['Line_process'] = (
    cartman['Line_process']
    .str.lower()
    .str.split()
    .str.join(" ")
)
cartman['Line_process'].head(25)

0                       yeah, go home you little dildo.
1                                 i know what it means!
2                                 i am not telling you.
3     he-yeah, that is what kyle is little brother i...
4     that is 'cause i was having these... bogus nig...
5     well, i dreamt that i was lying in my bed... i...
6                                                 what?
7              no, it was just a dream, my mom said so.
8     oh, shut up guys! you are just trying to make ...
9                                             kick ass.
10                                                 huh?
11    eh, no, that, that was just a dream. and i am ...
12                                                  oh!
13                                                  oh!
14           no! uh-i mean, eh, why would they do that?
15                                                  no!
16                                      shut up, dildo!
17                                              

In [7]:
#Remove numbers
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave every digit in place.
cartman['Line_process'] = cartman['Line_process'].str.replace(r'\d+', '', regex=True)
cartman['Line_process'].head(25)

0                       yeah, go home you little dildo.
1                                 i know what it means!
2                                 i am not telling you.
3     he-yeah, that is what kyle is little brother i...
4     that is 'cause i was having these... bogus nig...
5     well, i dreamt that i was lying in my bed... i...
6                                                 what?
7              no, it was just a dream, my mom said so.
8     oh, shut up guys! you are just trying to make ...
9                                             kick ass.
10                                                 huh?
11    eh, no, that, that was just a dream. and i am ...
12                                                  oh!
13                                                  oh!
14           no! uh-i mean, eh, why would they do that?
15                                                  no!
16                                      shut up, dildo!
17                                              

In [None]:
# Spell-correct each whitespace-separated word and re-join the line with single spaces.
# `spell` is reused by the Kyle/Stan cells and by text_normalization.
spell = Speller(lang='en')
cartman["Line_process"] = cartman['Line_process'].map(
    lambda line: ' '.join(spell(word) for word in line.split())
)
cartman['Line_process'].head(25)

In [None]:
#Remove Stop Words
# `stopwords` is already imported in the first cell, so it is not re-imported here.
# `stop` stays a list because later cells and text_normalization use it as is.
stop = stopwords.words('english')
# Set copy used only for constant-time membership tests in this cell
stop_set = set(stop)
cartman['Line_process'] = cartman['Line_process'].apply(
    lambda line: " ".join(word for word in line.split() if word not in stop_set)
)
cartman['Line_process'].head(25)

In [None]:
#Remove Punctuation
# Drop every character that is neither a word character nor whitespace.
# regex=True is required: pandas >= 2.0 would otherwise search for the literal text '[^\w\s]'.
cartman['Line_process'] = cartman['Line_process'].str.replace(r'[^\w\s]', '', regex=True)
cartman['Line_process'].head(25)

In [None]:
# Split each cleaned line into a list of word tokens with NLTK
cartman['word_tokens'] = cartman['Line_process'].map(word_tokenize)

In [None]:
# Preview the first 25 token lists
cartman['word_tokens'].head(25)

In [None]:
#Lemmatize
def lemmatize_text(text):
    """Return a list holding the WordNet lemma of each token in ``text``, in order.

    ``text`` is iterated token by token, so it is expected to be a list of
    words rather than a single string.
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in text:
        lemmas.append(wordnet.lemmatize(token))
    return lemmas
# Store the lemmatized tokens. Previously the result was only displayed, so every
# later step (token_string, n-grams, BoW, TF-IDF) ran on unlemmatized tokens while
# text_normalization lemmatizes the incoming question.
cartman['word_tokens'] = cartman['word_tokens'].apply(lemmatize_text)
cartman['word_tokens'].head(25)

In [None]:
# Join each token list back into one string; the vectorizers below are fitted on this column.
# `md` is reused by the Kyle/Stan cells and by text_normalization.
md = MosesDetokenizer(lang = 'en')
cartman['token_string'] = cartman['word_tokens'].map(
    lambda tokens: md.detokenize(tokens, return_str=True)
)
cartman['token_string'].head(25)

In [None]:
#Wordcloud
wordcloud_cartman = WordCloud().generate(cartman['word_tokens'].to_string())
plt.figure(figsize=(16, 10))
plt.imshow(wordcloud_cartman)
plt.axis("off")
# Set the title before saving; with savefig first, the PNG on disk was written without it
plt.title('Cartman Wordcloud')
plt.savefig('Cartman_Wordcloud.png')
plt.show()

In [None]:
# Flatten all token lists into one stream; n-grams are therefore also formed
# across the boundary between consecutive lines.
cartman_lists = cartman['word_tokens']
cartman_words = [token for line_tokens in cartman_lists for token in line_tokens]
# Count adjacent token pairs and keep the ten most frequent
cartman_bigram = ngrams(cartman_words, 2)
cartman_bigram_top = Counter(cartman_bigram).most_common(10)
for word, count in cartman_bigram_top:
    print(word, ":", count)

In [None]:
# Bar chart of the ten most common bigrams, saved next to the notebook
cartman_bi = pd.DataFrame(cartman_bigram_top, columns=['word', 'frequency'])
cartman_bi.plot.bar(x='word', figsize=(16, 10), title='Cartman Bigram Top Words')
plt.savefig('Cartman_Bigram.png')

In [None]:
# Count token triples over the flattened token stream and keep the ten most frequent
cartman_trigram = ngrams(cartman_words,3)
cartman_trigram_top = Counter(cartman_trigram).most_common(10)
# Print the trigram result (the loop previously iterated over cartman_bigram_top)
for word, count in cartman_trigram_top:
    print(word, ":", count)

In [None]:
# Bar chart of the ten most common trigrams, saved next to the notebook
cartman_tri = pd.DataFrame(cartman_trigram_top, columns=['word', 'frequency'])
cartman_tri.plot.bar(x='word', figsize=(16, 10), title='Cartman Trigram Top Words')
plt.savefig('Cartman_Trigram.png')

In [None]:
#Bag of Words Model

In [None]:
# Fit a bag-of-words model on the detokenized lines and total each term over all lines
cartman_vec = CountVectorizer()
cartman_bag_of_words = cartman_vec.fit_transform(cartman['token_string'])
cartman_sum_words = cartman_bag_of_words.sum(axis=0)
# Pair every vocabulary term with its total, ordered from most to least frequent.
# np.array turns the (term, total) pairs into a 2-column array for slicing in the plot cell.
cartman_words_freq = np.array(
    sorted(
        ((term, cartman_sum_words[0, col]) for term, col in cartman_vec.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
)

In [None]:
# Plot at most the 50 most frequent terms; min() keeps the cell working on a smaller vocabulary
n = min(50, len(cartman_words_freq))
plt.figure(figsize=(16, 10))
# Negated positions put the most frequent term at the top of the chart
plt.barh(-np.arange(n), cartman_words_freq[:n, 1].astype(float), height=.8)
plt.yticks(ticks=-np.arange(n), labels=cartman_words_freq[:n, 0])
# Set the title before saving; with savefig first, the PNG on disk was written without it
plt.title('Cartman BoW Top Words')
plt.savefig('Cartman_BOW.png')
plt.show()

In [None]:
#Create New Dataframe of Bag of Words Array
# One row per Cartman line, one column per vocabulary term; cartman_chat_bow compares against it.
cartman_bag = cartman_vec.fit_transform(cartman['token_string']).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
cartman_bag_features = cartman_vec.get_feature_names_out()
cartman_bow = pd.DataFrame(cartman_bag, columns = cartman_bag_features)
cartman_bow.head()

In [None]:
# Fit a TF-IDF model on the detokenized lines and total each term's weight over all lines
cartman_vect = TfidfVectorizer()
cartman_tf = cartman_vect.fit_transform(cartman['token_string'])
cartman_sum_tfidf = cartman_tf.sum(axis=0)
# Pair every vocabulary term with its summed weight, ordered from highest to lowest
cartman_tfidf_freq = np.array(
    sorted(
        ((term, cartman_sum_tfidf[0, col]) for term, col in cartman_vect.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
)

In [None]:
# Plot at most the 50 highest-weighted terms; min() keeps the cell working on a smaller vocabulary
n = min(50, len(cartman_tfidf_freq))
plt.figure(figsize=(16, 10))
# Negated positions put the highest-weighted term at the top of the chart
plt.barh(-np.arange(n), cartman_tfidf_freq[:n, 1].astype(float), height=.8)
plt.yticks(ticks=-np.arange(n), labels=cartman_tfidf_freq[:n, 0])
# Set the title before saving; with savefig first, the PNG on disk was written without it
plt.title('Cartman TFIDF Top Words')
plt.savefig('Cartman_TFIDF.png')
plt.show()

In [None]:
#Create New Dataframe of TFIDF Array
# One row per Cartman line, one column per vocabulary term, holding TF-IDF weights
cartman_vectorize = cartman_vect.fit_transform(cartman['token_string']).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
cartman_tfidf_features = cartman_vect.get_feature_names_out()
cartman_tfidf = pd.DataFrame(cartman_vectorize, columns = cartman_tfidf_features)
cartman_tfidf.head()

In [None]:
#Preprocessing Function
def text_normalization(question):
    """Clean a free-text question and return it as a single normalized string.

    Steps, in order: expand contractions, strip punctuation, lowercase, drop
    digits, spell-correct, tokenize, remove stop words, lemmatize, detokenize.

    Relies on objects created in earlier cells: ``decontracted``, ``spell``,
    ``stop``, ``lemmatize_text`` and ``md``.

    Note: punctuation is stripped before stop-word removal here, whereas the
    per-character corpus cells remove stop words first.
    """
    expanded = decontracted(question)
    no_punct = re.sub(r'[^\w\s]', '', expanded)
    lowered = str(no_punct).lower()
    no_digits = ''.join(filter(lambda ch: not ch.isdigit(), lowered))
    corrected = spell(no_digits)
    tokens = word_tokenize(corrected)
    content_tokens = [tok for tok in tokens if tok not in stop]
    lemmas = lemmatize_text(content_tokens)
    return md.detokenize(lemmas)

# Kyle

In [None]:
# Keep only the rows whose Character value is exactly 'Kyle'
ky = southpark['Character'].eq('Kyle')
kyle = southpark.loc[ky]
kyle.head()

In [None]:
# Reset index so the row labels run 0..n-1; kyle_chat_bow / kyle_chat_tfidf
# look a line up with .loc using the argmax position of a similarity column.
kyle = kyle.reset_index(drop=True)
kyle.head()

In [None]:
# Expand contractions into a working column; the raw 'Line' column is left as is
kyle['Line_process'] = kyle['Line'].map(decontracted)
kyle['Line_process'].head(25)

In [None]:
# Lowercase every line and collapse each run of whitespace to a single space
kyle['Line_process'] = (
    kyle['Line_process']
    .str.lower()
    .str.split()
    .str.join(" ")
)
kyle['Line_process'].head(25)

In [None]:
#Remove numbers
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave every digit in place.
kyle['Line_process'] = kyle['Line_process'].str.replace(r'\d+', '', regex=True)
kyle['Line_process'].head(25)

In [None]:
# Spell-correct each whitespace-separated word and re-join the line with single spaces
kyle["Line_process"] = kyle['Line_process'].map(
    lambda line: ' '.join(spell(word) for word in line.split())
)
kyle['Line_process'].head(25)

In [None]:
# Drop every word found in the NLTK English stop-word list `stop`
kyle['Line_process'] = kyle['Line_process'].map(
    lambda line: " ".join(word for word in line.split() if word not in stop)
)
kyle['Line_process'].head(25)

In [None]:
#Remove Punctuation
# Drop every character that is neither a word character nor whitespace.
# regex=True is required: pandas >= 2.0 would otherwise search for the literal text '[^\w\s]'.
kyle['Line_process'] = kyle['Line_process'].str.replace(r'[^\w\s]', '', regex=True)
kyle['Line_process'].head(25)

In [None]:
# Split each cleaned line into a list of word tokens with NLTK
kyle['word_tokens'] = kyle['Line_process'].map(word_tokenize)

In [None]:
# Preview the first 25 token lists
kyle['word_tokens'].head(25)

In [None]:
#Lemmatize
# Store the lemmatized tokens. Previously the result was only displayed, so every
# later step ran on unlemmatized tokens while text_normalization lemmatizes the question.
kyle['word_tokens'] = kyle['word_tokens'].apply(lemmatize_text)
kyle['word_tokens'].head(25)

In [None]:
# Join each token list back into one string; the vectorizers below are fitted on this column
kyle['token_string'] = kyle['word_tokens'].map(
    lambda tokens: md.detokenize(tokens, return_str=True)
)
kyle['token_string'].head(25)

In [None]:
#Wordcloud
wordcloud_kyle = WordCloud().generate(kyle['word_tokens'].to_string())
plt.figure(figsize=(16, 10))
plt.imshow(wordcloud_kyle)
plt.axis("off")
# Set the title before saving; with savefig first, the PNG on disk was written without it
plt.title('Kyle Wordcloud')
plt.savefig('Kyle_Wordcloud.png')
plt.show()

In [None]:
# Flatten all token lists into one stream; n-grams are therefore also formed
# across the boundary between consecutive lines.
kyle_lists = kyle['word_tokens']
kyle_words = [token for line_tokens in kyle_lists for token in line_tokens]
# Count adjacent token pairs and keep the ten most frequent
kyle_bigram = ngrams(kyle_words, 2)
kyle_bigram_top = Counter(kyle_bigram).most_common(10)
for word, count in kyle_bigram_top:
    print(word, ":", count)

In [None]:
# Bar chart of the ten most common bigrams, saved next to the notebook
kyle_bi = pd.DataFrame(kyle_bigram_top, columns=['word', 'frequency'])
kyle_bi.plot.bar(x='word', figsize=(16, 10), title='Kyle Bigram Top Words')
plt.savefig('Kyle_Bigram.png')

In [None]:
# Count token triples over the flattened token stream and keep the ten most frequent
kyle_trigram = ngrams(kyle_words,3)
kyle_trigram_top = Counter(kyle_trigram).most_common(10)
# Print Kyle's trigrams. The loop previously read stan_trigram_top, which is not
# defined until the Stan section and raises NameError on a fresh kernel.
for word, count in kyle_trigram_top:
    print(word, ":", count)

In [None]:
# Bar chart of the ten most common trigrams, saved next to the notebook
kyle_tri = pd.DataFrame(kyle_trigram_top, columns=['word', 'frequency'])
kyle_tri.plot.bar(x='word', figsize=(16, 10), title='Kyle Trigram Top Words')
plt.savefig('Kyle_Trigram.png')

In [None]:
#Bag of Words Model

In [None]:
# Fit a bag-of-words model on the detokenized lines and total each term over all lines
kyle_vec = CountVectorizer()
kyle_bag = kyle_vec.fit_transform(kyle['token_string'])
kyle_sum_words = kyle_bag.sum(axis=0)
# Pair every vocabulary term with its total, ordered from most to least frequent
kyle_words_freq = np.array(
    sorted(
        ((term, kyle_sum_words[0, col]) for term, col in kyle_vec.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
)

In [None]:
# Plot at most the 50 most frequent terms; min() keeps the cell working on a smaller vocabulary
n = min(50, len(kyle_words_freq))
plt.figure(figsize=(16, 10))
# Negated positions put the most frequent term at the top of the chart
plt.barh(-np.arange(n), kyle_words_freq[:n, 1].astype(float), height=.8)
plt.yticks(ticks=-np.arange(n), labels=kyle_words_freq[:n, 0])
# Set the title before saving; with savefig first, the PNG on disk was written without it
plt.title('Kyle BoW Top Words')
plt.savefig('Kyle_BOW.png')
plt.show()

In [None]:
#Create New Dataframe of Bag of Words Array
# One row per Kyle line, one column per vocabulary term; kyle_chat_bow compares against it.
kyle_bag_of_words = kyle_vec.fit_transform(kyle['token_string']).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
kyle_features_bag = kyle_vec.get_feature_names_out()
kyle_bow = pd.DataFrame(kyle_bag_of_words, columns = kyle_features_bag)
kyle_bow.head()

In [None]:
# Fit a TF-IDF model on the detokenized lines and total each term's weight over all lines.
# kyle_tfidf holds the fitted matrix here; a later cell rebinds the name to a DataFrame.
kyle_vect = TfidfVectorizer()
kyle_tfidf = kyle_vect.fit_transform(kyle['token_string'])
kyle_sum_tfidf = kyle_tfidf.sum(axis=0)
# Pair every vocabulary term with its summed weight, ordered from highest to lowest
kyle_tfidf_freq = np.array(
    sorted(
        ((term, kyle_sum_tfidf[0, col]) for term, col in kyle_vect.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
)

In [None]:
# Plot at most the 50 highest-weighted terms; min() keeps the cell working on a smaller vocabulary
n = min(50, len(kyle_tfidf_freq))
plt.figure(figsize=(16, 10))
# Negated positions put the highest-weighted term at the top of the chart
plt.barh(-np.arange(n), kyle_tfidf_freq[:n, 1].astype(float), height=.8)
plt.yticks(ticks=-np.arange(n), labels=kyle_tfidf_freq[:n, 0])
# Set the title before saving; with savefig first, the PNG on disk was written without it
plt.title('Kyle TFIDF Top Words')
plt.savefig('Kyle_TFIDF.png')
plt.show()

In [None]:
#Create New Dataframe of TFIDF Array
# One row per Kyle line, one column per vocabulary term, holding TF-IDF weights
kyle_vectorize = kyle_vect.fit_transform(kyle['token_string']).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
kyle_features_tfidf = kyle_vect.get_feature_names_out()
kyle_tfidf = pd.DataFrame(kyle_vectorize, columns = kyle_features_tfidf)
kyle_tfidf.head()

# Stan

In [None]:
# Keep only the rows whose Character value is exactly 'Stan'
sta = southpark['Character'].eq('Stan')
stan = southpark.loc[sta]
stan.head()

In [None]:
# Reset index so the row labels run 0..n-1; stan_chat_bow / stan_chat_tfidf
# look a line up with .loc using the argmax position of a similarity column.
stan = stan.reset_index(drop=True)
stan.head()

In [None]:
# Expand contractions into a working column; the raw 'Line' column is left as is
stan['Line_process'] = stan['Line'].map(decontracted)
stan['Line_process'].head(25)

In [None]:
# Lowercase every line and collapse each run of whitespace to a single space
stan['Line_process'] = (
    stan['Line_process']
    .str.lower()
    .str.split()
    .str.join(" ")
)
stan['Line_process'].head(25)

In [None]:
#Remove numbers
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave every digit in place.
stan['Line_process'] = stan['Line_process'].str.replace(r'\d+', '', regex=True)
stan['Line_process'].head(25)

In [None]:
# Spell-correct each whitespace-separated word and re-join the line with single spaces
stan["Line_process"] = stan['Line_process'].map(
    lambda line: ' '.join(spell(word) for word in line.split())
)
stan['Line_process'].head(25)

In [None]:
# Drop every word found in the NLTK English stop-word list `stop`
stan['Line_process'] = stan['Line_process'].map(
    lambda line: " ".join(word for word in line.split() if word not in stop)
)
stan['Line_process'].head(25)

In [None]:
#Remove Punctuation
# Drop every character that is neither a word character nor whitespace.
# regex=True is required: pandas >= 2.0 would otherwise search for the literal text '[^\w\s]'.
stan['Line_process'] = stan['Line_process'].str.replace(r'[^\w\s]', '', regex=True)
stan['Line_process'].head(25)

In [None]:
# Split each cleaned line into a list of word tokens with NLTK
stan['word_tokens'] = stan['Line_process'].map(word_tokenize)

In [None]:
# Preview the first 25 token lists
stan['word_tokens'].head(25)

In [None]:
#Lemmatize
# Store the lemmatized tokens. Previously the result was only displayed, so every
# later step ran on unlemmatized tokens while text_normalization lemmatizes the question.
stan['word_tokens'] = stan['word_tokens'].apply(lemmatize_text)
stan['word_tokens'].head(25)

In [None]:
# Join each token list back into one string; the vectorizers below are fitted on this column
stan['token_string'] = stan['word_tokens'].map(
    lambda tokens: md.detokenize(tokens, return_str=True)
)
stan['token_string'].head(25)

In [None]:
#Wordcloud
wordcloud_stan = WordCloud().generate(stan['word_tokens'].to_string())
plt.figure(figsize=(16, 10))
plt.imshow(wordcloud_stan)
plt.axis("off")
# Set the title before saving; with savefig first, the PNG on disk was written without it
plt.title('Stan Wordcloud')
plt.savefig('Stan_Wordcloud.png')
plt.show()

In [None]:
# Flatten all token lists into one stream; n-grams are therefore also formed
# across the boundary between consecutive lines.
stan_lists = stan['word_tokens']
stan_words = [token for line_tokens in stan_lists for token in line_tokens]
# Count adjacent token pairs and keep the ten most frequent
stan_bigram = ngrams(stan_words, 2)
stan_bigram_top = Counter(stan_bigram).most_common(10)
for word, count in stan_bigram_top:
    print(word, ":", count)

In [None]:
# Bar chart of the ten most common bigrams, saved next to the notebook
stan_bi = pd.DataFrame(stan_bigram_top, columns=['word', 'frequency'])
stan_bi.plot.bar(x='word', figsize=(16, 10), title='Stan Bigram Top Words')
plt.savefig('Stan_Bigram.png')

In [None]:
# Count token triples over the flattened token stream and keep the ten most frequent
stan_trigram = ngrams(stan_words, 3)
stan_trigram_counts = Counter(stan_trigram)
stan_trigram_top = stan_trigram_counts.most_common(10)
for word, count in stan_trigram_top:
    print(word, ":", count)

In [None]:
# Bar chart of the ten most common trigrams, saved next to the notebook
stan_tri = pd.DataFrame(stan_trigram_top, columns=['word', 'frequency'])
stan_tri.plot.bar(x='word', figsize=(16, 10), title='Stan Trigram Top Words')
plt.savefig('Stan_Trigram.png')

In [None]:
#Bag of Words Model

In [None]:
# Fit a bag-of-words model on the detokenized lines and total each term over all lines
stan_vec = CountVectorizer()
stan_bag_of_words = stan_vec.fit_transform(stan['token_string'])
stan_sum_words = stan_bag_of_words.sum(axis=0)
# Pair every vocabulary term with its total, ordered from most to least frequent
stan_words_freq = np.array(
    sorted(
        ((term, stan_sum_words[0, col]) for term, col in stan_vec.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
)

In [None]:
# Plot at most the 50 most frequent terms; min() keeps the cell working on a smaller vocabulary
n = min(50, len(stan_words_freq))
plt.figure(figsize=(16, 10))
# Negated positions put the most frequent term at the top of the chart
plt.barh(-np.arange(n), stan_words_freq[:n, 1].astype(float), height=.8)
plt.yticks(ticks=-np.arange(n), labels=stan_words_freq[:n, 0])
# Set the title before saving; with savefig first, the PNG on disk was written without it
plt.title('Stan BoW Top Words')
plt.savefig('Stan_BOW.png')
plt.show()

In [None]:
#Create New Dataframe of Bag of Words Array
# One row per Stan line, one column per vocabulary term; stan_chat_bow compares against it.
# Note: this rebinds stan_bag_of_words from the fitted matrix to a dense array.
stan_bag_of_words = stan_vec.fit_transform(stan['token_string']).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
stan_bow_features = stan_vec.get_feature_names_out()
stan_bow = pd.DataFrame(stan_bag_of_words, columns = stan_bow_features)
stan_bow.head()

In [None]:
# Fit a TF-IDF model on the detokenized lines and total each term's weight over all lines.
# stan_tfidf holds the fitted matrix here; a later cell rebinds the name to a DataFrame.
stan_vect = TfidfVectorizer()
stan_tfidf = stan_vect.fit_transform(stan['token_string'])
stan_sum_tfidf = stan_tfidf.sum(axis=0)
# Pair every vocabulary term with its summed weight, ordered from highest to lowest
stan_tfidf_freq = np.array(
    sorted(
        ((term, stan_sum_tfidf[0, col]) for term, col in stan_vect.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
)

In [None]:
# Plot at most the 50 highest-weighted terms; min() keeps the cell working on a smaller vocabulary
n = min(50, len(stan_tfidf_freq))
plt.figure(figsize=(16, 10))
# Negated positions put the highest-weighted term at the top of the chart
plt.barh(-np.arange(n), stan_tfidf_freq[:n, 1].astype(float), height=.8)
plt.yticks(ticks=-np.arange(n), labels=stan_tfidf_freq[:n, 0])
# Set the title before saving; with savefig first, the PNG on disk was written without it
plt.title('Stan TFIDF Top Words')
plt.savefig('Stan_TFIDF.png')
plt.show()

In [None]:
#Create New Dataframe of TFIDF Array
# One row per Stan line, one column per vocabulary term, holding TF-IDF weights
stan_vectorize = stan_vect.fit_transform(stan['token_string']).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
stan_tfidf_features = stan_vect.get_feature_names_out()
stan_tfidf = pd.DataFrame(stan_vectorize, columns = stan_tfidf_features)
stan_tfidf.head()

# Chatbots

In [None]:
#Chat Function Using Bag of Words Model
def cartman_chat_bow(question):
    """Return the Cartman line whose bag-of-words vector is most cosine-similar to ``question``.

    The question is cleaned with text_normalization, vectorized with the fitted
    ``cartman_vec`` and compared against every row of ``cartman_bow``.
    """
    normalized = text_normalization(question)
    query_vector = cartman_vec.transform([normalized]).toarray()
    # pairwise_distances yields cosine distance; 1 - distance is the similarity
    similarities = 1 - pairwise_distances(cartman_bow, query_vector, metric = 'cosine')
    best_row = similarities.argmax()
    return cartman['Line'].loc[best_row]

In [None]:
#Chat Function Using TFIDF Model
def cartman_chat_tfidf(question):
    """Return the Cartman line whose TF-IDF vector is most cosine-similar to ``question``.

    The question is cleaned with text_normalization, vectorized with the fitted
    ``cartman_vect`` and compared against every row of ``cartman_tfidf``.
    """
    cartman_lem = text_normalization(question)
    cartman_tf = cartman_vect.transform([cartman_lem]).toarray()
    # Compare against the TF-IDF matrix; this previously used cartman_bow (raw counts),
    # so the TF-IDF weighting of the corpus was never applied.
    cartman_cosine = 1 - pairwise_distances(cartman_tfidf, cartman_tf, metric = 'cosine')
    cartman_index_value = cartman_cosine.argmax()
    return cartman['Line'].loc[cartman_index_value]

In [None]:
#Chat Function Using Bag of Words Model
def kyle_chat_bow(question):
    """Return the Kyle line whose bag-of-words vector is most cosine-similar to ``question``.

    The question is cleaned with text_normalization, vectorized with the fitted
    ``kyle_vec`` and compared against every row of ``kyle_bow``.
    """
    normalized = text_normalization(question)
    query_vector = kyle_vec.transform([normalized]).toarray()
    # pairwise_distances yields cosine distance; 1 - distance is the similarity
    similarities = 1 - pairwise_distances(kyle_bow, query_vector, metric = 'cosine')
    best_row = similarities.argmax()
    return kyle['Line'].loc[best_row]

In [None]:
#Chat Function Using TFIDF Model
def kyle_chat_tfidf(question):
    """Return the Kyle line whose TF-IDF vector is most cosine-similar to ``question``.

    The question is cleaned with text_normalization, vectorized with the fitted
    ``kyle_vect`` and compared against every row of the ``kyle_tfidf`` DataFrame.
    """
    kyle_lem = text_normalization(question)
    kyle_tf = kyle_vect.transform([kyle_lem]).toarray()
    # Compare against the TF-IDF matrix; this previously used kyle_bow (raw counts),
    # so the TF-IDF weighting of the corpus was never applied.
    kyle_cosine = 1 - pairwise_distances(kyle_tfidf, kyle_tf, metric = 'cosine')
    kyle_index_value = kyle_cosine.argmax()
    return kyle['Line'].loc[kyle_index_value]

In [None]:
#Chat Function Using Bag of Words Model
def stan_chat_bow(question):
    """Return the Stan line whose bag-of-words vector is most cosine-similar to ``question``.

    The question is cleaned with text_normalization, vectorized with the fitted
    ``stan_vec`` and compared against every row of ``stan_bow``.
    """
    stan_lemma = text_normalization(question)
    # Use Stan's own vectorizer and the normalized question; the previous
    # `vec.transform([lemma])` referenced two undefined names and raised NameError.
    stan_bagg = stan_vec.transform([stan_lemma]).toarray()
    stan_cos = 1 - pairwise_distances(stan_bow, stan_bagg, metric = 'cosine')
    stan_index_val = stan_cos.argmax()
    return stan['Line'].loc[stan_index_val]

In [None]:
#Chat Function Using TFIDF Model
def stan_chat_tfidf(question):
    """Return the Stan line whose TF-IDF vector is most cosine-similar to ``question``.

    The question is cleaned with text_normalization, vectorized with the fitted
    ``stan_vect`` and compared against every row of the ``stan_tfidf`` DataFrame.
    """
    stan_lem = text_normalization(question)
    stan_tf = stan_vect.transform([stan_lem]).toarray()
    # Compare against the TF-IDF matrix; this previously used stan_bow (raw counts),
    # so the TF-IDF weighting of the corpus was never applied.
    stan_cosine = 1 - pairwise_distances(stan_tfidf, stan_tf, metric = 'cosine')
    stan_index_value = stan_cosine.argmax()
    return stan['Line'].loc[stan_index_value]

In [None]:
#Chatbots Talking to Each Other

In [None]:
# Read a question from the user into `question`; input() blocks until something is entered
question = input('What is your question?')