# Cornell Movie Dialogs ChatBot

## We will develop a simple chatbot by training it on Cornell Movie Dialogs corpus containing a large metadata-rich collection of fictional conversations extracted from raw movie scripts

### Data: http://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html

In [1]:
import nltk
import numpy as np
import pandas as pd
import random
import string
import en_core_web_sm
nlp = en_core_web_sm.load()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import gutenberg
import re
import spacy
import warnings
from sqlalchemy import create_engine
from chatterbot import ChatBot
from chatterbot.trainers import ListTrainer, ChatterBotCorpusTrainer
from chatterbot.conversation import Statement
warnings.filterwarnings("ignore")
nltk.download('gutenberg')
!python -m spacy download en

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\00233270\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
symbolic link created for C:\Users\00233270\Anaconda3\envs\py37\lib\site-packages\spacy\data\en <<===>> C:\Users\00233270\Anaconda3\envs\py37\lib\site-packages\en_core_web_sm
[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
[+] Linking successful
C:\Users\00233270\Anaconda3\envs\py37\lib\site-packages\en_core_web_sm -->
C:\Users\00233270\Anaconda3\envs\py37\lib\site-packages\spacy\data\en
You can now load the model via spacy.load('en')


## Get the data

In [2]:
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'cornell_movie_dialogs'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

df0 = pd.read_sql_query('select * from dialogs', con=engine)

# no need for an open connection, 
# as we're only doing a single query
engine.dispose()

In [3]:
nRow, nCol = df0.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 304446 rows and 2 columns


In [4]:
df0.head(4)

Unnamed: 0,index,dialogs
0,0,Can we make this quick? Roxanne Korrine and A...
1,1,"Well, I thought we'd start with pronunciation,..."
2,2,Not the hacking and gagging and spitting part....
3,3,Okay... then how 'bout we try out some French ...


In [10]:
df1 = df0.sample(200000)
#df1 = df0.copy()

In [11]:
df1.shape

(200000, 2)

In [12]:
# Utility function for standard text cleaning
def text_cleaner(text):
    text = re.sub(r'--','',text)
    text = re.sub("[\[]*[\]]", "", text)
    text = re.sub("[\[]*[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    text = ' '.join(text.split())
    return text

In [13]:
# df1['cleaned'] = df1['dialogs'].astype(str).apply(text_cleaner)
# df1['tokens'] = df1['cleaned'].apply(lambda x: nlp(x))

In [14]:
# Convert the text in column to a body of text
dialogs = df1['dialogs'].tolist()
dialogs_doc = ''.join(dialogs)
len(dialogs_doc)

11030388

In [15]:
nlp.max_length = 12000000 # or even higher

In [1]:
dialogs_doc = nlp(dialogs_doc)

NameError: name 'nlp' is not defined

## Break the body to sentences

In [None]:
dialogs_sents = [sent.text for sent in dialogs_doc.sents if len(sent.text) > 1]
dialogs_sents[:10]

## Now we can create our own chatbot and train it using this corpus

In [None]:
# Create a chatbot
chatbot = ChatBot('MovieExpert')
# This is to remove the accumulated knowledge base
chatbot.storage.drop()

# Create a new trainer for the chatbot
trainer = ListTrainer(chatbot)

# Train the chatbot based on Persuasion
trainer.train(dialogs_sents)

### Here we create a function to engage the user if the text contains any greeting words. And if it contains one of them, the chatbot will respond with another greeting word.

In [None]:
GREETING_INPUTS = ["hello", "hi", "greetings", "what's up","hey"]
GREETING_RESPONSES = ["hello", "hi", "hey", "hi there"]
def greeting(sentence):
    for word in sentence.split():
        if word.lower() in GREETING_INPUTS:
            return random.choice(GREETING_RESPONSES)

## Now we run the chatpot

In [None]:
print("MovieExpert: I will try to respond to you reasonably. If you want to exit, type bye.")

# Below is the chatting
while True:
    
    user_input = input("User: ")
    user_input=user_input.lower()
    
    if(user_input!='bye'):
        if(user_input == 'thanks' or user_input == 'thank you'):
            break
            print("MovieExpert: You're welcome.")
        else:
            if(greeting(user_input) != None):
                print("MovieExpert: " + greeting(user_input))
            else:
                print("MovieExpert: ", end = "")
                print(chatbot.get_response(user_input))
    else:
        print("MovieExpert: Bye! It was a great chat.")
        break

In [None]:
# tokens = []
# lemma = []
# #pos = []

# for doc in nlp.pipe(df1['dialogs'].astype('unicode').values, batch_size=50,
#                         n_threads=3):
#     if doc.is_parsed:
#         tokens.append([n.text for n in doc])
#         lemma.append([n.lemma_ for n in doc])
# #        pos.append([n.pos_ for n in doc])
        
#     else:
#         # We want to make sure that the lists of parsed results have the
#         # same number of entries of the original Dataframe, so add some blanks in case the parse fails
#         tokens.append(None)
#         lemma.append(None)
# #        pos.append(None)

# df1['tokens'] = tokens
# df1['lemma'] = lemma
# #df1['pos'] = pos

## Now we can create our own chatbot and train it using Persuasion:

In [None]:
# Create a chatbot
c.
hatbot = ChatBot('Persuasion')
# This is to remove the accumulated knowledge base
chatbot.storage.drop()

# Create a new trainer for the chatbot
trainer = ListTrainer(chatbot)

# Train the chatbot based on Persuasion
trainer.train(persuasion_sents)

## Next, run the chatbot:

In [None]:
print("Persuasion: I will try to respond to you reasonably. If you want to exit, type bye.")

# Below is the chatting
while True:
    
    user_input = input("User: ")
    user_input=user_input.lower()
    
    if(user_input!='bye'):
        if(user_input == 'thanks' or user_input == 'thank you'):
            break
            print("Persuasion: You're welcome.")
        else:
            if(greeting(user_input) != None):
                print("Persuasion: " + greeting(user_input))
            else:
                print("Persuasion: ", end = "")
                print(chatbot.get_response(user_input))
    else:
        print("Persuasion: Bye! It was a great chat.")
        break