In [1]:
import csv
import re
import nltk
import random
import nltk
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
wake_words = ['tell',
              'notify',
              'ask',
              'inform',
              'message',
              'text',
              'reply to',
              'mention to',
             ]


In [3]:
file = open("./cornell_movie-dialogs_corpus/movie_lines.txt", "rb") 
#file = open("./sample.txt", "rb")
contacts, messages = [], []
for line in file:
    line = line.decode('utf-8', errors='ignore').strip('\n')
    split_line = line.split("+++$+++")
    recipent = split_line[3].strip()
    sentences = nltk.sent_tokenize(split_line[-1].strip())
    #print(split_line)
    if len(recipent) < 1:
        continue
    else:
        contacts.append(recipent[0]+recipent[1:].lower())
    messages.extend(sentences)

In [4]:
contacts = list(set(contacts))
random.shuffle(messages)

In [5]:
new_contacts = []
for contact in contacts:
    if '\t\t\t' in contact:
        continue
    if ']' in contact:
        continue
    new_contacts.append(' '.join(contact.split()))
contacts = new_contacts    

In [6]:
num_messages = len(messages)
i = 0
dataset = []
recipents = []

while i < num_messages:
    # Wake words
    wake_word = random.sample(wake_words, 1)[0]
    if random.random() < 0.5:
        command = "Can you "\
                  + wake_word\
                  + " "              
    else:
        command = wake_word[0].upper()\
                  + wake_word[1:]\
                  + " "
            
    # Recipent
    contact = random.sample(contacts, 1)[0]
    command += contact
    recipents.append(contact)
    
    # Message(s)
    if random.random() < 0.5:
        command += ' that'
        
    command += " " + messages[i]
    i += 1
    
    if random.random() < 0.25 and i < num_messages:
        command += " and that "\
                   + messages[i]
        i += 1
        
    dataset.append(command)
            

In [7]:
contacts_dict = defaultdict(list)

for contact in contacts:
    contact_split = contact.split()
    contacts_dict[contact_split[0].lower()].append(contact)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(dataset, recipents, test_size=0.2)

In [16]:
import pickle
pickle.dump(dataset, open('command_dataset.pickle', 'wb'))
pickle.dump(recipents, open('recipents_dataset.pickle', 'wb'))

In [9]:
def find_recipent_rule_based(command):
    
    wake_word_said = False
    words = command.split()
    i = 0
    
    # only searching for recipent if wake word is said
    while i < len(words):
        if words[i].lower() in wake_words:
            i += 1
            wake_word_said = True
            break
        elif ' '.join(words[i:i+2]).lower() in wake_words:
            i += 2
            wake_word_said = True
            break
        i += 1
        
    # rearching for first word of recipent name
    if wake_word_said:
        while words[i].lower() not in contacts_dict and i < len(words):
            i += 1
            
    # when found, find the whole recipent name
    if i < len(words):
        if words[i].lower() in contacts_dict:
            sub_sentence = ' '.join(words[i:])
            max_matching_len = 0

            for name in contacts_dict[words[i].lower()]:
                if name in sub_sentence and len(name)>max_matching_len:
                    max_matching_len = len(name)
                    recipent = name
            try:
                return recipent
            except:
                print(contacts_dict[words[i].lower()])

        i += 1
    return None

In [10]:
y_pred_rb = []
for test in X_test:
    y_pred_rb.append(find_recipent_rule_based(test))

correct = 0
for y, y_bar in zip(y_test, y_pred_rb):
    correct += int(y==y_bar)
    if y != y_bar:
        print(y, y_bar)

correct/len(y_test)

1.0

In [11]:
def extract_feature_label(command, label, n):

    words = nltk.word_tokenize(command)
    words=[word.lower() for word in words if word.isalpha()]
    features = []
    labels = []

    num_words = len(words)
    for i, word in enumerate(words):
        feature = []
        for j in range(n, 0, -1):
            k = i-j
            if k >= 0:
                feature.append(words[k])
            else:
                feature.append('NA')
        for j in range(1, n+1):
            k = i+j
            if k < num_words:
                feature.append(words[k])
            else:
                feature.append('NA')
        feature.append(i)
        features.append(feature)

        if word in label.lower().split():
            labels.append(1)
        else:
            labels.append(0)
    return features, labels
        


In [12]:
n_neighbor = 1
features, labels = extract_feature_label(X_train[2], y_train[2], n_neighbor)

X_train_word_token, y_train_word_token = [], []
for command, recipent in zip(X_train, y_train):
    features, labels = extract_feature_label(command, recipent,1)
    X_train_word_token += features
    y_train_word_token += labels

ratio = sum(y_train_word_token)/len(y_train_word_token)

train_word_token_blc = []
for feature, label in zip(X_train_word_token, y_train_word_token):
    if label == 0 and random.random() > ratio:
        continue
    train_word_token_blc.append(feature+[label])
        


In [13]:
fwd, back = [], []
for i in range(n_neighbor):
    fwd.append('-'+str(i+1)+'_loc')
    back.append('+'+str(i+1)+'_loc')
fwd.reverse()
col_names = fwd+back+['loc','label']

In [14]:
train_df = pd.DataFrame(train_word_token_blc, columns=col_names)

In [15]:
train_df.head()

Unnamed: 0,-1_loc,+1_loc,loc,label
0,text,that,3,1
1,know,that,10,0
2,notify,all,1,1
3,all,chicks,3,0
4,ai,no,16,0


In [19]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
stop_words = ENGLISH_STOP_WORDS.union('NA')

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
    

CV = CountVectorizer(stop_words=stop_words, tokenizer=LemmaTokenizer())
X_fwd = CV.fit_transform(train_df['-1_loc'])
X_bck = CV.fit_transform(train_df['+1_loc'])

In [None]:
import numpy as np
foo = np.concatenate((X_fwd.toarray(), X_bck.toarray()), axis=1)

In [23]:
X_fwd.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [21]:
X_bck.shape

(745517, 17398)