In [0]:
import pandas as pd
import string
import re
import nltk

In [2]:
# Download WordNet and stopwords
nltk.download('wordnet');
nltk.download('stopwords');

# Download data
DATA_LINK = "http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"
FILE_NAME = 'cornell_movie_dialogs_corpus.zip'
!test -f $FILE_NAME || (wget -q $DATA_LINK && unzip -q $FILE_NAME && rm -rf $FILE_NAME)

# Read data
data_folder = "./cornell movie-dialogs corpus"
with open('{}/movie_lines.txt'.format(data_folder), 'rb') as movie_lines_file:
    lines_data = movie_lines_file.read().decode(encoding='utf-8', errors='ignore')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Data Gathering

<b>1.</b> First, we split text by endline symbol (`\n`) to get list of utterances.

In [3]:
utterances = lines_data.split(sep='\n')
print('Last line ("{}") is an empty string, so we remove it from list of utterances.'.format(utterances.pop()))
print('Number of utterances:', len(utterances))

Last line ("") is an empty string, so we remove it from list of utterances.
Number of utterances: 304713


<b>2.</b> Observations on first 10 samples suggest the string ` +++$+++ ` acts as seperator between 5 components of an utterance.

In [4]:
print('> First 10 samples:')
print(*utterances[:10], sep='\n')

print()

print('> Is there a line which does not have exactly 4 "+++$+++"?')
print('- {}'.format(any([line.count(' +++$+++ ') != 4 for line in utterances])))

> First 10 samples:
L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No
L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I'm kidding.  You know how sometimes you just become this "persona"?  And you don't know how to quit?
L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?

> Is there a line which does not have exactly 4 "+++$+++"?
- False


<b>3.</b> We then split each utterance by the seperator and convert the whole list to dataframe. Each column in dataframe is renamed regarding its meanings provided in `README.txt`.

In [0]:
utterances_data = pd.DataFrame([line.split(' +++$+++ ') for line in utterances],
                               columns=['lineID', 'characterID', 'movieID', 'chacterter_name', 'text'])

In [6]:
print('Number of characters:', utterances_data['characterID'].nunique())
print('Number of movies:', utterances_data['movieID'].nunique())
print()
utterances_data.sample(5)

Number of characters: 9035
Number of movies: 617



Unnamed: 0,lineID,characterID,movieID,chacterter_name,text
192570,L249908,u5621,m373,WILL,We'll do that.
22783,L141236,u722,m45,SALESGIRL,That's no problem. We have those in stock.
101266,L537949,u3020,m197,SPOCK,You must have faith.
29214,L206559,u910,m59,LINDA,I hear some surfer pulled a knife on Mr. Hand ...
287259,L618420,u8500,m577,BOND,Trust me.


### Data Cleaning

<b>1.</b> We define normal characters consisting of alphabetic letters and basic sentence punctuations. Others are considered as special characters.

In [7]:
characters = set()
utterances_data['text'].apply(lambda text : characters.update(list(text)));

# Set of special characters is the intersection
# between set of letters appeared in the dataset and normal characters
special_characters = characters.difference(string.ascii_letters + '.,!? \'')    

print('List of {} special characters:'.format(len(special_characters)))
print(*special_characters, sep=', ')

List of 35 special characters:
_, &, 9, 7, $, ^, |, `, %, <, 4, 6, #, +, ), 3, [, -, ~, }, =, *, 0, 5, :, >, 	, ;, 1, ", {, 2, ], /, 8


<b>2.</b> Remove special characters from text of utterances.

In [8]:
preprocess = lambda text : ''.join([c for c in text if c not in special_characters])
sample_text = utterances_data['text'].sample().values[0]

print('Sample text:', sample_text)
print('After preprocessed:', preprocess(sample_text))

Sample text: -- understand what you're asking for here. The Massey pre-nup provides that in the event of a dissolution of the marriage for any reason, both parties shall leave it with whatever they brought in, and earned during. No one can profit from the marriage. The pre-nup protects the wealthier party.
After preprocessed:  understand what you're asking for here. The Massey prenup provides that in the event of a dissolution of the marriage for any reason, both parties shall leave it with whatever they brought in, and earned during. No one can profit from the marriage. The prenup protects the wealthier party.


<b>3.</b> Lower casing, lemmatization and repeat removing are utilized to normalize utterance tokens.

In [9]:
from nltk.corpus import wordnet

# Create a simple RepeatReplacer using regex
class RepeatReplacer():
    def __init__(self):
        self.pattern = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
    
    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.pattern.sub(self.repl, word)
        
        # Recursively replace until it can't be replaced anymore
        return repl_word if repl_word == word else self.replace(repl_word)

# Testing RepeatReplacer object
RepeatReplacer().replace('aaaaaaaaaaaaaaarghhhh')

'argh'

In [15]:
from nltk.stem import SnowballStemmer, WordNetLemmatizer

replacer = RepeatReplacer()
stemmer = SnowballStemmer(language='english')
lemmatizer = WordNetLemmatizer()

tokenize = lambda text : [lemmatizer.lemmatize(stemmer.stem(replacer.replace(
                          word.strip(string.punctuation))))
                          for word in text.split()]

sample_text = utterances_data['text'].sample().values[0]
print('Sample text:', sample_text)
print('After tokenized:', tokenize(sample_text))

Sample text: Is this one art deco or art nouveau?
After tokenized: ['is', 'this', 'one', 'art', 'deco', 'or', 'art', 'nouveau']


### Word Embedding

In [0]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

#### Bag of Words

In [16]:
# Naive BoW
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(preprocessor=preprocess,
                                   tokenizer=tokenize,
                                   stop_words=stop_words)
bow_vec = count_vectorizer.fit_transform(utterances_data['text'].values)

print('Shape:', bow_vec.shape)
print(*bow_vec[0].toarray())

  'stop_words.' % sorted(inconsistent))


Shape: (304713, 47101)
[0 0 0 ... 0 0 0]


In [17]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocess,
                                   tokenizer=tokenize,
                                   stop_words=stop_words)
tfidf_vec = tfidf_vectorizer.fit_transform(utterances_data['text'].values)

print('Shape:', tfidf_vec.shape)
print(*tfidf_vec[0].toarray())

  'stop_words.' % sorted(inconsistent))


Shape: (304713, 47101)
[0. 0. 0. ... 0. 0. 0.]
