In [1]:
import pandas as pd
import string
import re
import nltk

# Fetch the WordNet corpus; `nltk.corpus.wordnet` is imported and queried
# later in this notebook.
# NOTE(review): the recorded output shows this download failing with a name
# resolution error, so later cells presumably relied on a cached copy — confirm.
nltk.download('wordnet');

[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


In [2]:
# Folder that holds the corpus files, relative to the notebook.
data_folder = './cornell-movie-dialogs-corpus'

# Read the file as raw bytes and decode as UTF-8. With errors='ignore', any
# byte sequence that is not valid UTF-8 is silently dropped from the text.
movie_lines_path = '{}/movie_lines.txt'.format(data_folder)
with open(movie_lines_path, mode='rb') as movie_lines_file:
    lines_data = movie_lines_file.read().decode(encoding='utf-8', errors='ignore')

### Data Gathering

<b>1.</b> First, we split the text on the newline character (`\n`) to get a list of utterances.

In [3]:
# One list entry per line of the raw text.
utterances = lines_data.split('\n')

# Drop the final entry (empty in the recorded run) and report what was removed.
last_line = utterances.pop()
print('Last line ("{}") is an empty string, so we remove it from list of utterances.'.format(last_line))
print('Number of utterances:', len(utterances))

Last line ("") is an empty string, so we remove it from list of utterances.
Number of utterances: 304713


<b>2.</b> Inspecting the first 10 samples suggests that the string ` +++$+++ ` acts as the separator between the 5 fields of each utterance.

In [4]:
# Show a few raw lines to reveal the field layout.
print('> First 10 samples:')
print('\n'.join(utterances[:10]))

print()

# A line with five fields contains the separator exactly four times.
print('> Is there a line which does not have exactly 4 "+++$+++"?')
has_malformed_line = any(line.count(' +++$+++ ') != 4 for line in utterances)
print('- {}'.format(has_malformed_line))

> First 10 samples:
L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No
L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I'm kidding.  You know how sometimes you just become this "persona"?  And you don't know how to quit?
L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?

> Is there a line which does not have exactly 4 "+++$+++"?
- False


<b>3.</b> We then split each utterance on the separator and convert the resulting list into a DataFrame. Each column is named according to its meaning as described in `README.txt`.

In [5]:
# Column names for the five ' +++$+++ '-separated fields of each line.
# NOTE(review): 'chacterter_name' looks like a typo for 'character_name'; it is
# kept as-is because other cells may reference the column by this name.
utterance_columns = ['lineID', 'characterID', 'movieID', 'chacterter_name', 'text']

utterances_data = pd.DataFrame(
    [line.split(' +++$+++ ') for line in utterances],
    columns=utterance_columns,
)

In [6]:
# Report how many distinct IDs each identifier column holds.
for label, id_column in (('characters', 'characterID'), ('movies', 'movieID')):
    print('Number of {}:'.format(label), utterances_data[id_column].nunique())
print()

# Random peek at the parsed rows.
utterances_data.sample(5)

Number of characters: 9035
Number of movies: 617



Unnamed: 0,lineID,characterID,movieID,chacterter_name,text
105836,L569571,u3129,m205,BETSY,All right. All right. I'm taking a break at f...
288463,L622567,u8570,m581,ROBERT,Drugs begin pouring out of America into every ...
261712,L523606,u7741,m524,EARL,I don't look like no possum!
241586,L452534,u7158,m481,BIALYSTOCK,"Okay, take it off, take it off."
174200,L184673,u5062,m334,THE PATIENT,I see.


### Data Cleaning

<b>1.</b> We define normal characters as ASCII letters, spaces, apostrophes and basic sentence punctuation (`.`, `,`, `!`, `?`). All other characters are considered special characters.

In [7]:
# Gather every distinct character that occurs in any utterance text.
characters = set()
for utterance_text in utterances_data['text']:
    characters.update(utterance_text)

# Special characters are the set difference: characters seen in the dataset
# minus the "normal" ones (ASCII letters, '.', ',', '!', '?', space, apostrophe).
normal_characters = set(string.ascii_letters + '.,!? \'')
special_characters = characters - normal_characters

print('List of {} special characters:'.format(len(special_characters)))
print(*special_characters, sep=', ')

List of 35 special characters:
	, 6, `, |, ", 7, ;, -, %, 5, <, 3, +, &, 1, ], $, >, :, _, }, 2, =, /, *, 0, ), 9, 8, #, ~, {, 4, ^, [


<b>2.</b> Remove the special characters from the text of each utterance.

In [8]:
def remove_special_characters(text):
    """Return `text` with every character found in `special_characters` removed."""
    return ''.join(char for char in text if char not in special_characters)


# Overwrite the text column with its cleaned version, then peek at a few rows.
utterances_data['text'] = utterances_data['text'].apply(remove_special_characters)
utterances_data.sample(5)

Unnamed: 0,lineID,characterID,movieID,chacterter_name,text
133350,L65567,u3865,m256,BARTON,"Well, my pleasure. I could use a little lift m..."
112808,L619257,u3321,m220,MAVERICK,I'm fine.
202350,L294088,u5962,m395,BUZZ,I like to think so! It's this little idea I be...
70993,L389998,u2109,m136,MASSERIA,The kid just called me stupid.
244968,L470209,u7268,m491,CLEANER,This a con?


<b>3.</b> Lowercasing, repeated-character removal, stemming and lemmatization are applied to normalize the utterance tokens.

In [9]:
from nltk.corpus import wordnet

# Create a simple repeat replacer using regex
class RepeatReplacer():
    """Collapse repeated characters in a word until WordNet recognises it.

    A word that already has at least one WordNet synset is returned unchanged.
    Otherwise doubled characters are removed step by step until either the
    intermediate word has a synset or no doubled character is left.
    """

    def __init__(self):
        # Matches a doubled word character (group 2 followed by itself) with
        # whatever word characters surround it; the replacement keeps one copy.
        self.pattern = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'

    def replace(self, word):
        """Return `word` with repeated characters collapsed.

        Args:
            word: The token to normalize.

        Returns:
            The first candidate (starting with `word` itself) that WordNet
            knows, or the fully collapsed word if none is recognised.
        """
        # Loop instead of recursing: one substitution pass can remove as little
        # as a single duplicated character, so a long run of repeated letters
        # would otherwise need one stack frame per character and could exceed
        # Python's recursion limit.
        while not wordnet.synsets(word):
            repl_word = self.pattern.sub(self.repl, word)
            if repl_word == word:
                # Nothing left to collapse.
                break
            word = repl_word
        return word

# Demo: collapse the repeated letters of an exaggerated spelling.
RepeatReplacer().replace('aaaaaaaaaaaaaaarghhhh')

'argh'

In [10]:
from nltk.stem import SnowballStemmer, WordNetLemmatizer

# Helper instances that `preprocess` looks up by name each time it is called.
replacer = RepeatReplacer()
stemmer = SnowballStemmer(language='english')
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalize an utterance and return its tokens joined by single spaces.

    Each whitespace-separated word is lowercased, stripped of leading and
    trailing punctuation, passed through `replacer.replace`, then
    `stemmer.stem`, and finally `lemmatizer.lemmatize`.
    """
    normalized_tokens = []
    for word in text.split():
        token = word.lower().strip(string.punctuation)
        token = replacer.replace(token)
        token = stemmer.stem(token)
        normalized_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(normalized_tokens)

# Draw one random utterance and show it before and after normalization.
sample_text = utterances_data['text'].sample().iloc[0]
normalized_sample = preprocess(sample_text)
print('Sample text:', sample_text)
print('After tokenized:', normalized_sample)

Sample text: What do you want from me?
After tokenized: what do you want from me


In [11]:
# Replace the text column with its normalized form.
# The column is overwritten in place, so re-running this cell would feed
# already-normalized text through `preprocess` a second time.
utterances_data['text'] = utterances_data['text'].apply(preprocess)
utterances_data.sample(5)

Unnamed: 0,lineID,characterID,movieID,chacterter_name,text
44749,L282310,u1327,m89,CONOR,you'r no match for scot mr romirez we'r rais a...
104584,L593013,u3111,m203,SOLLOZZO,i need two million dollar in cash...mor import...
272064,L560627,u8025,m544,ALBERT,he thirtyf year old
155846,L133909,u4504,m298,EMILY,charl your your break this man neck would scar...
1286,L3044,u41,m2,NICOLETTE,okay


In [12]:
# Collect the distinct whitespace-separated tokens across all utterances.
words = set()
for tokens in utterances_data['text'].str.split():
    words.update(tokens)
print('Number of distinct words:', len(words))

Number of distinct words: 47131


### Word Embedding

#### Bag of Words

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a CountVectorizer with default settings on the normalized utterances and
# keep the resulting count matrix. The vectorizer tokenizes the text itself, so
# its vocabulary size need not match a whitespace-split word count.
corpus_texts = utterances_data['text'].values
count_vectorizer = CountVectorizer()
bag = count_vectorizer.fit_transform(corpus_texts)

In [14]:
# Dimensions of the count matrix returned by `fit_transform`.
bag.shape

(304713, 41554)