# Cornell

In [3]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data directory, relative to the directory the kernel was started in.
home = '../../../data/cornell'
# Remember the kernel's starting directory on the first run and always resolve
# `home` against it. A bare os.chdir(home) is not idempotent: re-running the
# cell would resolve the relative path against the already-changed cwd.
_start_dir = globals().get('_start_dir') or os.getcwd()
os.chdir(os.path.join(_start_dir, home))

## Reference

- [#6](https://github.com/at15/snowbot/issues/6) explore the [Cornell Movie-Dialogs Corpus](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html)
- Kaggle seems to have an (unofficial) [cleaner version](https://www.kaggle.com/Cornell-University/movie-dialog-corpus) with a detailed explanation
- [Currie32/Chatbot-from-Movie-Dialogue](https://github.com/Currie32/Chatbot-from-Movie-Dialogue) and [stanford cs20si/chatbot](https://github.com/chiphuyen/stanford-tensorflow-tutorials/blob/master/assignments/chatbot/data.py) show how to preprocess this data.
- [b0noI/dialog_converter](https://github.com/b0noI/dialog_converter) mentions that some dialogs are bad; it is used by [blog: building a chatbot in RNN for 6 hrs](https://blog.kovalevskyi.com/rnn-based-chatbot-for-6-hours-b847d2d92c43)

## Observation

- The raw data is separated by '+++$+++'; see the Python file for how it is converted to CSV
- A conversation is: user1, user2, movie, lines [u1, u2, u1, u2]. NOTE: the number of lines may not be even; some people just [ignore it for one-turn QA](https://github.com/suriyadeepan/datasets/blob/master/seq2seq/cornell_movie_corpus/scripts/prepare_data.py)

In [4]:
# List the CSV files in the current working directory. This is an IPython
# automagic/alias shell command, not plain Python.
ls *.csv

movie_characters_metadata.csv  movie_lines.csv
movie_conversations.csv        movie_titles_metadata.csv


In [5]:
# Load the conversation index (speaker pair, movie, ordered line ids) and the
# per-line table (line id, speaker, movie, character name, utterance).
convs = pd.read_csv('movie_conversations.csv')
lines = pd.read_csv('movie_lines.csv')
# Preview the first rows of each frame as plain text.
for preview in (convs.head(), lines.head()):
    print(preview)

  character_id_1 character_id_2 movie_id                           lines
0             u0             u2       m0  'L194'; 'L195'; 'L196'; 'L197'
1             u0             u2       m0                  'L198'; 'L199'
2             u0             u2       m0  'L200'; 'L201'; 'L202'; 'L203'
3             u0             u2       m0          'L204'; 'L205'; 'L206'
4             u0             u2       m0                  'L207'; 'L208'
      id character_id movie_id character_name     utterance
0  L1045           u0       m0         BIANCA  They do not!
1  L1044           u2       m0        CAMERON   They do to!
2   L985           u0       m0         BIANCA    I hope so.
3   L984           u2       m0        CAMERON     She okay?
4   L925           u0       m0         BIANCA     Let's go.


In [10]:
# Each row's `lines` field is a string such as "'L1'; 'L2'": split on ';', trim
# the surrounding whitespace, then drop the first and last character (the
# quotes) to get the list ['L1', 'L2'].
convs_lines = [
    [quoted_id.strip()[1:-1] for quoted_id in raw_ids.split(';')]
    for raw_ids in convs['lines']
]
# The two characters of each conversation, in the same order as `convs_lines`.
convs_u = [
    [speaker_1, speaker_2]
    for speaker_1, speaker_2 in zip(convs['character_id_1'], convs['character_id_2'])
]

# Replace missing values (NaN) with '' so an absent utterance is an empty string.
lines = lines.fillna('')
# Lookup tables keyed by line id: utterance text and speaking character id.
id2line = dict(zip(lines['id'], lines['utterance']))
id2u = dict(zip(lines['id'], lines['character_id']))

In [15]:
# Check whether our assumptions are wrong: treat every pair of adjacent lines
# in a conversation as a (question, answer) candidate, then count the pairs
# spoken by the same character and the pairs where either utterance is empty.
n_qa_total = n_qa_same_character = n_qa_empty = 0
for turn_ids in convs_lines:
    # zip with the list shifted by one yields (line[i], line[i+1]) pairs.
    for line_q, line_a in zip(turn_ids, turn_ids[1:]):
        n_qa_total += 1
        if id2u[line_q] == id2u[line_a]:
            n_qa_same_character += 1
        if not (id2line[line_q] and id2line[line_a]):
            n_qa_empty += 1

print('total', n_qa_total)
# Report each count together with its share of all adjacent pairs.
for label, count in (('same character', n_qa_same_character), ('empty', n_qa_empty)):
    print(label, count, count / n_qa_total)

total 221616
same character 1037 0.004679265035015523
empty 334 0.0015071113998989243


## Playground

In [4]:
# Strip <u>/</u> markup tags and square brackets from an utterance.
# The pattern is a raw string: in a normal string literal `\[` and `\]` are
# invalid escape sequences, which newer Python versions warn about.
cleaner = re.compile(r'(<u>|</u>|\[|\])')
cleaner.sub('', 'I am <u>a</u> fan of [mr bruce]')

'I am a fan of mr bruce'