In [37]:
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
import pickle
import tensorflow as tf
import tensorflow_datasets as tfds
import re

In [38]:

# Movie dialog corpus
# https://www.kaggle.com/datasets/Cornell-University/movie-dialog-corpus
movie_lines = pd.read_csv(
    "movie_lines.tsv", 
    encoding='utf-8-sig', 
    sep='\t', 
    on_bad_lines="skip", 
    header = None,
    names = ['lineID', 'charID', 'movieID', 'charName', 'text'],
    index_col=['lineID']
)

movie_conversations = pd.read_csv(
    "movie_conversations.tsv", 
    sep='\t', 
    encoding='ISO-8859-2',
    names = ['charID_1', 'charID_2', 'movieID', 'conversation']
)

movie_conversations['conversation'] = movie_conversations['conversation'].map(lambda x: re.findall(r"\w+", x))
print(movie_lines.head(5))
print(movie_conversations.head(5))

       charID movieID charName          text
lineID                                      
L1045      u0      m0   BIANCA  They do not!
L1044      u2      m0  CAMERON   They do to!
L985       u0      m0   BIANCA    I hope so.
L984       u2      m0  CAMERON     She okay?
L925       u0      m0   BIANCA     Let's go.
  charID_1 charID_2 movieID              conversation
0       u0       u2      m0  [L194, L195, L196, L197]
1       u0       u2      m0              [L198, L199]
2       u0       u2      m0  [L200, L201, L202, L203]
3       u0       u2      m0        [L204, L205, L206]
4       u0       u2      m0              [L207, L208]


In [39]:

movie_lines = movie_lines[['text']]
movie_lines['response'] = np.nan

def add_responses(conversations_df, lines):
    for convo in conversations_df['conversation']:
        if isinstance(convo, list):
            for lineID in convo:
                line_number = convo.index(lineID)

                if line_number + 1 < len(convo) and lineID in lines.index and convo[line_number + 1] in lines.index:
                    next_line = lines.loc[convo[line_number + 1], 'text']
                    movie_lines.loc[lineID, 'response'] = next_line
        
add_responses(movie_conversations, movie_lines)
movie_lines = movie_lines.reset_index()[['text', 'response']]


In [40]:

# Daily Dialogues corpus
# https://www.kaggle.com/datasets/thedevastator/dailydialog-unlock-the-conversation-potential-in
dialogues = []

with open('dialogues_text.txt', 'r', encoding='utf-8') as file:
    for line in file:
        dialogues.append(line)

print(f'length: {len(dialogues)}')
dialogues[:2]

length: 13118


["The kitchen stinks . __eou__ I'll throw out the garbage . __eou__\n",
 'So Dick , how about getting some coffee for tonight ? __eou__ Coffee ? I don ’ t honestly like that kind of stuff . __eou__ Come on , you can at least try a little , besides your cigarette . __eou__ What ’ s wrong with that ? Cigarette is the thing I go crazy for . __eou__ Not for me , Dick . __eou__\n']

In [41]:

conversation_ids = []
sentences = []

# Create pandas df with texts
def create_text_column(conversation, curr_idx):
    sentences_list = conversation.strip().split("__eou__")
    
    # Add conversation ID and sentences to the lists
    conversation_ids.extend([idx] * (len(sentences_list) - 1))
    sentences.extend(sentences_list[:-1])
    
for idx, conversation in enumerate(dialogues):
    create_text_column(conversation, idx)
    
# Create dataframe with lines from conversations and conversation id numbers
daily_df = pd.DataFrame({"ConversationID": conversation_ids, "text": sentences})


In [42]:

daily_df['response'] = np.nan

# Create rows with a question/statement and a response
def create_daily_response_column(df):
    for index, row in df.iterrows():
        next_index = index + 1
        
        if next_index < len(df) and df.loc[next_index, 'ConversationID'] == row['ConversationID']:
            df.loc[index, 'response'] = df.loc[next_index, 'text']
    return df

daily_conversation_df = create_daily_response_column(daily_df.copy())


In [43]:

# Conversation dataset
conversation_dataset = pd.read_csv('Conversation.csv')
new_column_names = {'question':'text', 'answer': 'response'}
conversation_dataset.rename(columns=new_column_names, inplace=True)
conversation_dataset = conversation_dataset[['text', 'response']]

# Duplicate dataset a few times
conversation_dataset_duplicated = pd.concat([conversation_dataset] * 2, ignore_index=True)
conversation_dataset = pd.concat([conversation_dataset, conversation_dataset_duplicated], ignore_index=True)
conversation_dataset_duplicated = pd.concat([conversation_dataset] * 2, ignore_index=True)
conversation_dataset = pd.concat([conversation_dataset, conversation_dataset_duplicated], ignore_index=True)
conversation_dataset_duplicated = pd.concat([conversation_dataset] * 2, ignore_index=True)
conversation_dataset = pd.concat([conversation_dataset, conversation_dataset_duplicated], ignore_index=True)

conversation_dataset[:2]

Unnamed: 0,text,response
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.


In [44]:

# Human conversation training data corpus: 
# https://www.kaggle.com/datasets/projjal1/human-conversation-training-data/data
dialogues = []

with open('human_chat.txt', 'r', encoding='utf-8') as file:
    for line in file:
        dialogues.append(line)

print(f'length: {len(dialogues)}')

human_conversation_df = pd.DataFrame({ 'text': dialogues })
human_conversation_df.head(5)

length: 1495


Unnamed: 0,text
0,Human 1: Hi!\n
1,Human 2: What is your favorite holiday?\n
2,Human 1: one where I get to meet lots of diffe...
3,Human 2: What was the most number of people yo...
4,Human 1: Hard to keep a count. Maybe 25.\n


In [45]:

human_conversation_df['response'] = np.nan

# Get rid of human 1 and 2 labels
human_conversation_df['text'] = human_conversation_df['text'].str[9:]

# Create rows with a question/statement and a response
def create_response_column(df):
    for index, row in df.iterrows():
        next_index = index + 1
        
        if next_index < len(df):
            df.loc[index, 'response'] = df.loc[next_index, 'text']
    return df

human_conversation_df = create_response_column(human_conversation_df)
human_conversation_df[:2]

Unnamed: 0,text,response
0,Hi!\n,What is your favorite holiday?\n
1,What is your favorite holiday?\n,one where I get to meet lots of different peop...


In [46]:

# NPR Media Dialog Transcripts: 
# https://www.kaggle.com/datasets/shuyangli94/interview-npr-media-dialog-transcripts
npr_df = pd.read_csv('utterances.csv')
npr_df = npr_df.head(1200000)
npr_df = npr_df[['episode_order', 'utterance']]
new_column_name = {'utterance':'text'}
npr_df.rename(columns=new_column_name, inplace=True)
npr_df[:5]

Unnamed: 0,episode_order,text
0,9,"It's a 2,200-mile race. To give some sense of ..."
1,10,So for a top competitor like Lance to try to m...
2,11,"So in every team, presumably there's one star,..."
3,12,That's right. Each team has nine riders. And w...
4,13,"So slipstream, this is like drafting in car ra..."


In [47]:
npr_df['response'] = np.nan

# Create rows with a question/statement and a response
def create_response_column(df):
    for index, row in df.iterrows():
        next_index = index + 1
        
        if next_index < len(df) and df.loc[next_index, 'episode_order'] > df.loc[index, 'episode_order']:
            df.loc[index, 'response'] = df.loc[next_index, 'text']
    return df

npr_conversations = create_response_column(npr_df.copy())
npr_conversations[:10]

Unnamed: 0,episode_order,text,response
0,9,"It's a 2,200-mile race. To give some sense of ...",So for a top competitor like Lance to try to m...
1,10,So for a top competitor like Lance to try to m...,"So in every team, presumably there's one star,..."
2,11,"So in every team, presumably there's one star,...",That's right. Each team has nine riders. And w...
3,12,That's right. Each team has nine riders. And w...,"So slipstream, this is like drafting in car ra..."
4,13,"So slipstream, this is like drafting in car ra...",That's exactly right.
5,14,That's exactly right.,And so the guy who's in back has an easier tim...
6,15,And so the guy who's in back has an easier tim...,That's right. There's a lot of deal making tha...
7,16,That's right. There's a lot of deal making tha...,"We're talking with Loren Mooney, the editor-in..."
8,17,"We're talking with Loren Mooney, the editor-in...",Hello.
9,18,Hello.,"Hi, John."


Here I concatenate all the above dataframes into one and use just the text and response columns.

In [48]:

# Combine all datasets into one df
conversations_df = pd.concat([movie_lines, daily_conversation_df, human_conversation_df, conversation_dataset, npr_conversations], axis=0)
conversations_df[:5]


Unnamed: 0,text,response,ConversationID,episode_order
0,They do not!,,,
1,They do to!,They do not!,,
2,I hope so.,,,
3,She okay?,I hope so.,,
4,Let's go.,,,


In [49]:

# Drop rows with empty responses
conversations_df = conversations_df.dropna(subset = ['response'])

# Take only text and response columns
conversations_df = conversations_df[['text', 'response']]

# Make sure text and response columns are all strings
conversations_df[['text', 'response']] = conversations_df[['text', 'response']].astype(str)
conversations_df[['text', 'response']] = conversations_df[['text', 'response']].astype(str)

print(len(conversations_df))
conversations_df.head(5)


1529863


Unnamed: 0,text,response
1,They do to!,They do not!
3,She okay?,I hope so.
5,Wow,Let's go.
7,No,Okay -- you're gonna need to learn how to lie.
12,I figured you'd get to the good stuff eventually.,What good stuff?


----------
Preprocessing
---

In [50]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
import nltk
import re
import string

stopwords = set(stopwords.words('english')) 
tokenizer = TweetTokenizer(strip_handles = True, reduce_len = True)
lemmatizer = nltk.stem.WordNetLemmatizer()

# Contraction dictionary
contraction_dict = {"i'll":"i will", "lemme":"let me", "i'm": "i am", "btw": "by the way", "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have"}


def replace_words(doc, dictionary):
    ''' Replaces the words in a document with the dictionary's mappings '''
    new_doc = []
    for token in doc: 
        if token in dictionary: 
            token = dictionary[token]
        new_doc.append(token)
    return new_doc


def clean_punctuation_and_numbers(token):
    token = re.sub(r'\d', '', token)
    token = ''.join(char for char in token if char not in puncts)
    return re.sub(r'[^\w\s]', '', token)


puncts = ['','?', '....','..','...','','@','#', ',', '.', '"', ':', ')', '(', '-', '!', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '*', '+', '\\', 
    '•', '~', '£', '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', 
    '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', 
    '—', '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 
    'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', 
    '¹', '≤', '‡', '√', '!','🅰','🅱']
        
# End to end tokenizer and preprocessing function
def tokenize_data(data):    

    output = []
    
    # Converting all to lower case
    data = data.str.lower()
    for doc in data:
        
        # Tokenize data
        doc = tokenizer.tokenize(doc)
        
        # Removing links
        doc = [re.sub(r"http\S+", "", token) for token in doc]
        doc = [re.sub(r"www.\S+", "", token) for token in doc]
        
        # Replacing contractions and slang from contraction dictionary
        doc = replace_words(doc, contraction_dict)

        # Removing the punctuation
        doc = [clean_punctuation_and_numbers(token) for token in doc if token not in string.punctuation and token not in puncts]

        # Removing empty strings
        doc = [token for token in doc if token != '']

        # Replacing contractions and slang again
        doc = replace_words(doc, contraction_dict)

        # Lemmatization
        doc = [lemmatizer.lemmatize(token) for token in doc]

        # Convert doc to one string
        doc = " ".join(doc)
        output.append(doc)
            
    return output


processed_text = tokenize_data(conversations_df['text'])
processed_response = tokenize_data(conversations_df['response'])
print(processed_text[:10])


['they do to', 'she okay', 'wow', 'no', 'i figured you would get to the good stuff eventually', 'me this endless blonde babble i am like boring myself', 'what crap', 'do you listen to this crap', 'but', 'then that is all you had to say']


In [51]:

# Concat processed_text and processed_response
processed_df = pd.DataFrame({"text": processed_text, "response": processed_response})
processed_df.head(15)


Unnamed: 0,text,response
0,they do to,they do not
1,she okay,i hope so
2,wow,let us go
3,no,okay youre gonna need to learn how to lie
4,i figured you would get to the good stuff even...,what good stuff
5,me this endless blonde babble i am like boring...,thank god if i had to hear one more story abou...
6,what crap,me this endless blonde babble i am like boring...
7,do you listen to this crap,what crap
8,but,you always been this selfish
9,then that is all you had to say,but


In [52]:

# Create counters to count appearances of every word
text_words = processed_df['text'].copy().tolist()
text_words = " ".join(text_words)
text_counts = Counter(text_words.split())

response_words = processed_df['response'].copy().tolist()
response_words = " ".join(response_words)
response_counts = Counter(response_words.split())

# Removes words that appear less than six times from corpus
def remove_rare_words(row):
    row_text = []
    for word in row['text'].split():
        if text_counts[word] >= 6:
            row_text.append(word)
    row['text'] = " ".join(row_text)

    row_response = []
    for word in row['response'].split():
        if response_counts[word] >= 6:
            row_response.append(word)
    row['response'] = " ".join(row_response)
    return row

# Remove rare words
filtered_processed_df = processed_df.apply(remove_rare_words, axis=1)
filtered_processed_df.head(15)

Unnamed: 0,text,response
0,they do to,they do not
1,she okay,i hope so
2,wow,let us go
3,no,okay youre gonna need to learn how to lie
4,i figured you would get to the good stuff even...,what good stuff
5,me this endless blonde babble i am like boring...,thank god if i had to hear one more story abou...
6,what crap,me this endless blonde babble i am like boring...
7,do you listen to this crap,what crap
8,but,you always been this selfish
9,then that is all you had to say,but


In [53]:

# Drop rows with empty text
filtered_processed_df = filtered_processed_df[filtered_processed_df['text'] != '']
filtered_processed_df = filtered_processed_df[filtered_processed_df['response'] != '']


In [54]:

new_column_names = {'text':'input_text', 'response':'response_output'}
filtered_processed_df.rename(columns=new_column_names, inplace=True)
data = filtered_processed_df.copy()

data['input_len'] = data['input_text'].apply(lambda x: len(x))
data['response_len'] = data['response_output'].apply(lambda x: len(x.split()))

# Take data with length 1 or more and 40 or less
data = data[(data['input_len'] > 0) & (data['input_len'] < 41)]
data = data[(data['response_len'] > 0) & (data['response_len'] < 41)]
data = data[['input_text', 'response_output']]
data

Unnamed: 0,input_text,response_output
0,they do to,they do not
1,she okay,i hope so
2,wow,let us go
3,no,okay youre gonna need to learn how to lie
6,what crap,me this endless blonde babble i am like boring...
...,...,...
1529802,good to be with you,thank you very much
1529803,thank you very much,former dni the firstever dni john negroponte w...
1529805,soundbite of laughter,central intelligence director of central intel...
1529807,unidentified person richard cordray,applause


In [55]:

# Add rows manually
manual_texts = ['hi how are you', 'hi', 'who are you', 'what is your name', 'hello']
manual_responses = ['hi i am good', 'hello', 'i am a chatbot', 'i do not have a name', 'hello there']

manual_rows = pd.DataFrame({ 'input_text': manual_texts, 'response_output': manual_responses})
manual_rows_duplicated = pd.concat([manual_rows] * 2, ignore_index=True)
manual_rows = pd.concat([manual_rows, manual_rows_duplicated], ignore_index=True)
manual_rows_duplicated = pd.concat([manual_rows] * 2, ignore_index=True)
manual_rows = pd.concat([manual_rows, manual_rows_duplicated], ignore_index=True)
manual_rows_duplicated = pd.concat([manual_rows] * 2, ignore_index=True)
manual_rows = pd.concat([manual_rows, manual_rows_duplicated], ignore_index=True)

# Concat with main df
data = pd.concat([data, manual_rows], axis=0)

data

Unnamed: 0,input_text,response_output
0,they do to,they do not
1,she okay,i hope so
2,wow,let us go
3,no,okay youre gonna need to learn how to lie
6,what crap,me this endless blonde babble i am like boring...
...,...,...
130,hi how are you,hi i am good
131,hi,hello
132,who are you,i am a chatbot
133,what is your name,i do not have a name


In [56]:
import pickle 

print(f'final length of data: {len(data)}')

# Save data to pickle file
with open('data/input_data.pkl', 'wb') as file:
    pickle.dump(data, file)
    

final length of data: 450680
