In [0]:
import os
import random
import string
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import spacy
from scipy.stats.mstats import winsorize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sqlalchemy import create_engine

warnings.filterwarnings("ignore")


# Connection settings. Each value can be overridden through an environment
# variable so credentials do not have to live in the notebook; the defaults
# keep the notebook runnable as-is.
postgres_user = os.environ.get('POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('POSTGRES_PW', '7*.8G9QH21')
postgres_host = os.environ.get('POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('POSTGRES_PORT', '5432')
postgres_db = os.environ.get('POSTGRES_DB', 'cornell_movie_dialogs')

engine2 = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

try:
    # Load the whole `dialogs` table into a DataFrame.
    dialogs_df = pd.read_sql_query('select * from dialogs', con=engine2)
finally:
    # No need for an open connection, as we're only doing a single query.
    # Disposing in `finally` releases the engine even when the query fails.
    engine2.dispose()

In [3]:
# Preview the first 10 rows of the loaded dialogs table.
dialogs_df.head(10)

Unnamed: 0,index,dialogs
0,0,Can we make this quick? Roxanne Korrine and A...
1,1,"Well, I thought we'd start with pronunciation,..."
2,2,Not the hacking and gagging and spitting part....
3,3,Okay... then how 'bout we try out some French ...
4,4,You're asking me out. That's so cute. What's ...
5,5,Forget it.
6,6,"No, no, it's my fault -- we didn't have a prop..."
7,7,Cameron.
8,8,"The thing is, Cameron -- I'm at the mercy of a..."
9,9,Seems like she could get a date easy enough...


## Applying NLP data processing on the dataset

Steps include

*   Removing metacharacters
*   Removing extra whitespace
*   Tokenization
*   Removing Stopwords
*   Lemmatization





In [4]:
from collections import Counter
import nltk
import spacy
import re

# Download the English models of SpaCy
!python -m spacy download en

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [5]:
# Pull the `dialogs` column out of the DataFrame as a NumPy array so the
# text-cleaning steps can iterate over plain strings.
dialogs_arr = np.array(dialogs_df['dialogs'])

# Show the first 100 dialog lines, one per output line.
for dialog in dialogs_arr[:100]:
    print(dialog)

Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
Well, I thought we'd start with pronunciation, if that's okay with you.
Not the hacking and gagging and spitting part.  Please.
Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?
You're asking me out.  That's so cute. What's your name again?
Forget it.
No, no, it's my fault -- we didn't have a proper introduction ---
Cameron.
The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.
Seems like she could get a date easy enough...
Why?
Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.
That's a shame.
Gosh, if only we could find Kat a boyfriend...
Let me see what I can do.
C'esc ma tete. This is my head
Right.  See?  You're ready for the quiz.
I don't want to know how to say that thoug

In [6]:
# Using a regular expression to remove some metacharacters.
# The character class strips these characters:  - ? , " ! [ . ]
# A raw string is used so the backslashes reach the regex engine without
# triggering Python's invalid-escape-sequence warnings.
pattern2 = r'[-?,"!\[.\]]'
metachar_re = re.compile(pattern2)  # compiled once instead of once per row

dialogs_arr_cln = []
for dialog in dialogs_arr[0:100]:
    cleaned = metachar_re.sub("", dialog)  # substitute once, reuse for both uses
    dialogs_arr_cln.append(cleaned)
    print(cleaned)

Can we make this quick  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break up on the quad  Again
Well I thought we'd start with pronunciation if that's okay with you
Not the hacking and gagging and spitting part  Please
Okay then how 'bout we try out some French cuisine  Saturday  Night
You're asking me out  That's so cute What's your name again
Forget it
No no it's my fault  we didn't have a proper introduction 
Cameron
The thing is Cameron  I'm at the mercy of a particularly hideous breed of loser  My sister  I can't date until she does
Seems like she could get a date easy enough
Why
Unsolved mystery  She used to be really popular when she started high school then it was just like she got sick of it or something
That's a shame
Gosh if only we could find Kat a boyfriend
Let me see what I can do
C'esc ma tete This is my head
Right  See  You're ready for the quiz
I don't want to know how to say that though  I want to know useful things Like where the goo

In [7]:
# Normalise whitespace: split on any run of whitespace and re-join with single
# spaces, which also trims leading and trailing blanks.
dialogs_arr_cln2 = []
for dialog in dialogs_arr_cln[:100]:
    normalized = ' '.join(dialog.split())
    dialogs_arr_cln2.append(normalized)
    print(normalized)

Can we make this quick Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break up on the quad Again
Well I thought we'd start with pronunciation if that's okay with you
Not the hacking and gagging and spitting part Please
Okay then how 'bout we try out some French cuisine Saturday Night
You're asking me out That's so cute What's your name again
Forget it
No no it's my fault we didn't have a proper introduction
Cameron
The thing is Cameron I'm at the mercy of a particularly hideous breed of loser My sister I can't date until she does
Seems like she could get a date easy enough
Why
Unsolved mystery She used to be really popular when she started high school then it was just like she got sick of it or something
That's a shame
Gosh if only we could find Kat a boyfriend
Let me see what I can do
C'esc ma tete This is my head
Right See You're ready for the quiz
I don't want to know how to say that though I want to know useful things Like where the good stores are Ho

# Creating Chatbot using Processed Movie Dialog
1. Self Coding

In [8]:
# Inspect the cleaned, whitespace-normalised dialog lines; this list is the
# corpus the chatbot's response() function searches.
dialogs_arr_cln2

['Can we make this quick Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break up on the quad Again',
 "Well I thought we'd start with pronunciation if that's okay with you",
 'Not the hacking and gagging and spitting part Please',
 "Okay then how 'bout we try out some French cuisine Saturday Night",
 "You're asking me out That's so cute What's your name again",
 'Forget it',
 "No no it's my fault we didn't have a proper introduction",
 'Cameron',
 "The thing is Cameron I'm at the mercy of a particularly hideous breed of loser My sister I can't date until she does",
 'Seems like she could get a date easy enough',
 'Why',
 'Unsolved mystery She used to be really popular when she started high school then it was just like she got sick of it or something',
 "That's a shame",
 'Gosh if only we could find Kat a boyfriend',
 'Let me see what I can do',
 "C'esc ma tete This is my head",
 "Right See You're ready for the quiz",
 "I don't want to know how to say that

In [0]:
# Greeting Response
GREETING_INPUTS = ["hello", "hi", "greetings", "what's up","hey"]
GREETING_RESPONSES = ["hello", "hi", "hey", "hi there"]
def greeting(sentence):
    """Return a random greeting if ``sentence`` contains a known greeting.

    Matching is case-insensitive and ignores punctuation attached to the
    start or end of a word (so ``"Hi!"`` matches ``"hi"``). Greetings are
    matched as whole words or whole phrases, which lets multi-word entries
    such as ``"what's up"`` match and prevents ``"this"`` matching ``"hi"``.

    Parameters
    ----------
    sentence : str
        The user's message.

    Returns
    -------
    str or None
        A random element of ``GREETING_RESPONSES`` when a greeting is found,
        otherwise ``None``.
    """
    # Lower-case each word and trim edge punctuation; inner apostrophes
    # ("what's") are kept because only the ends are stripped.
    words = [word.strip(string.punctuation).lower() for word in sentence.split()]
    # Pad with spaces so a phrase can only match on word boundaries.
    padded = " " + " ".join(word for word in words if word) + " "

    for phrase in GREETING_INPUTS:
        if " " + phrase + " " in padded:
            return random.choice(GREETING_RESPONSES)
    return None

In [0]:
# Cache of loaded spaCy pipelines, keyed by model name, so the model is loaded
# once per kernel session instead of on every call to response().
_NLP_CACHE = {}


def _get_nlp(model_name='en'):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _NLP_CACHE:
        _NLP_CACHE[model_name] = spacy.load(model_name)
    return _NLP_CACHE[model_name]


def response(user_input):
    """Return the corpus sentence most similar to the user's input.

    The input is split into sentences with spaCy, vectorised together with
    the sentences in ``dialogs_arr_cln2`` using TF-IDF, and the last input
    sentence is compared against every corpus sentence by cosine similarity.

    Parameters
    ----------
    user_input : str
        The user's message.

    Returns
    -------
    str
        The best-matching sentence from ``dialogs_arr_cln2``, or an apology
        message when the input is empty, the corpus is empty, or no corpus
        sentence shares any term with the input (similarity of 0).
    """
    fallback = "I'm sorry! I don't know how to respond :("

    # we parse the user's input using SpaCy, then split it into sentences
    input_doc = _get_nlp()(user_input)
    input_sents = [sent.text for sent in input_doc.sents]

    n_corpus = len(dialogs_arr_cln2)
    if not input_sents or n_corpus == 0:
        # Nothing to compare; bail out before touching the vectorizer.
        return fallback

    # Build a fresh list so the shared corpus is never mutated: corpus first,
    # then the user's sentences.
    documents = list(dialogs_arr_cln2) + input_sents

    # the next step is to vectorize our new corpus using tf-idf
    TfidfVec = TfidfVectorizer(max_df=0.5, min_df=1, use_idf=True, norm=u'l2', smooth_idf=True, lowercase=False)
    tfidf = TfidfVec.fit_transform(documents)

    # cosine similarity between the last input sentence and the corpus rows
    # only (the user's own sentences are excluded from the candidates)
    similarities = cosine_similarity(tfidf[-1], tfidf[:n_corpus])
    # index of the most similar corpus sentence
    idx = int(np.argmax(similarities))

    # Decide on the score, not on the index: index 0 is a legitimate match,
    # while an all-zero row means nothing in the corpus is similar.
    if similarities[0, idx] > 0:
        return dialogs_arr_cln2[idx]
    return fallback

In [21]:
print("Persuasion: I will try to respond you reasonably. If you want to exit, type bye please.")

# Interactive chat loop: runs until the user types "bye", "thanks" or
# "thank you".
while True:
    # Normalise case and surrounding whitespace so that e.g. "Bye " still exits.
    user_input = input("User: ").strip().lower()

    if user_input == 'bye':
        print("Persuasion: Bye! It was a great chat.")
        break

    if user_input in ('thanks', 'thank you'):
        # Print the acknowledgement before breaking so it is actually shown.
        print("Persuasion: You're welcome.")
        break

    # Call greeting() once: it draws a random reply, so the value that is
    # checked should be the same value that is printed.
    greeting_reply = greeting(user_input)
    if greeting_reply is not None:
        print("Persuasion: " + greeting_reply)
    else:
        print("Persuasion: ", end="")
        print(response(user_input))

Persuasion: I will try to respond you reasonably. If you want to exit, type bye please.
User: hi 
Persuasion: hello
User: how are you
Persuasion: I'm kidding You know how sometimes you just become this persona And you don't know how to quit
User: ok what kind of persona you are talking about
Persuasion: So that's the kind of guy she likes Pretty ones
User: Pretty ones?
Persuasion: So that's the kind of guy she likes Pretty ones
User: kind of guys
Persuasion: So that's the kind of guy she likes Pretty ones
User: so
Persuasion: I hope so
User: hope
Persuasion: I hope so
User: what are you talking about
Persuasion: Listen I want to talk to you about the prom
User: prom?
Persuasion: Listen I want to talk to you about the prom
User: ok continue
Persuasion: I'm sorry! I don't know how to respond :(
User: ok
Persuasion: I'm sorry! I don't know how to respond :(
User: fine
Persuasion: I'm sorry! I don't know how to respond :(
User: theek hai
Persuasion: I'm sorry! I don't know how to respond :

Strengths
1. Works seamlessly.
2. Incorporates responses from the movie dialogues.

Weaknesses
1. Matching is case-sensitive: a reply is found only when the case of the input words matches the dialogue text.
2. The data processing applied to the dialogues affects the results.