In [1]:
import pandas as pd
import numpy as np
import re

In [3]:
# Preprocess dataset
def clean_str(string):
    """
    Tokenization/string cleaning
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    
    return string.strip().lower()


def load_data_and_labels(csv_file, text_col, target_col):
    """
    Loads IMDb plot data from csv, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load CSV file, indicate that the last column represents labels
    df = pd.read_csv(csv_file)
    features = df[text_col]
    labels = df[target_col]

    # Split by words
    x_text = [clean_str(str(sent)) for sent in features]
    
    # Generate labels as [1, 0] for oscar win, [0, 1] otherwise
    y = [[1, 0] if row == 1 else [0, 1] for row in labels]
    y = np.array(y)
    
    return [x_text, y]

x_text, y = load_data_and_labels('movies_plot.csv', 'Plot Summary', 'Won')

print x_text[0], y[0]

the film opens with the rabid ranting and raving of arthur eden 's \( tom wilkinson \) most of this speech is rambling and babbling he begins talking about how one day he was leaving work and he walked out into the street into traffic and suddenly he found he could not move , even though there was traffic incoming essentially he tells us he was frozen in a moment of utter consternation brought on by the shambles he has made of his life doing what it is that he does his work as an attorney helping a corrupt company defend the indefensible has left him where must break the shackles with his old life and try something new in an office a legal team of dozens of people are diligently taking phone calls , shedding documents , comparing notes , etc a lawyer talking to a reporter on the phone and walks over to marty bach \( sydney pollack \) and hands him the phone he tells bach that it 's a reporter from the wall street journal inquiring about the settlement bach looks infuriated but takes th

In [4]:
# Build vocabulary
from tensorflow.contrib import learn

max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))
print("Vocabulary size: {:d}".format(len(vocab_processor.vocabulary_)))

Vocabulary size: 15299


In [5]:
# Randomly shuffle data
np.random.seed(42)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Saves preprocessed data into a pickle file
import pickle

with open('preprocess_x_2.pickle', 'wb') as handle:
    pickle.dump(x_shuffled, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('preprocess_y_2.pickle', 'wb') as handle:
    pickle.dump(y_shuffled, handle, protocol=pickle.HIGHEST_PROTOCOL)

print "Files saved."

Files saved.


In [7]:
# Preprocess test dataset

def preprocess_test_set(csv_file, text_col):
    # Load csv test file
    df = pd.read_csv(csv_file)
    features_test = df[text_col]
    
    # Split by words
    x_text_test = [clean_str(sent) for sent in features_test]
    
    # Convert text to lower representations
    x = np.array(list(vocab_processor.transform(x_text_test)))
    return x

x = preprocess_test_set('movies_plot_pred.csv', 'Plot Summary')

with open('preprocess_test_2.pickle', 'wb') as handle:
    pickle.dump(x, handle, protocol=pickle.HIGHEST_PROTOCOL)

print "File saved."

File saved.
