In [330]:
import re
import pandas as pd
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

pd.options.display.max_columns = 999

### Load Data

In [431]:
# Record tables that the candidate pairs refer to.
ama = pd.read_csv('./data/amazon.csv')
rt = pd.read_csv('./data/rotten_tomatoes.csv')

# Candidate-pair files, each passed to create_features further down.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
holdout = pd.read_csv('./data/holdout.csv')

In [432]:
# Suffix every Amazon column with `_left` and every Rotten Tomatoes column
# with `_right` so the two tables can sit side by side without name clashes.
ama.columns = ['id_left', 'time_left', 'director_left', 'star_left', 'cost_left']

rt.columns = (
    # identification / basic metadata
    ['id_right', 'time_right', 'director_right', 'year_right']
    # one column per listed cast member
    + ['star1_right', 'star2_right', 'star3_right',
       'star4_right', 'star5_right', 'star6_right']
    # scores
    + ['rotten_tomatoes_right', 'audience_rating_right']
    # review snippets
    + ['review1_right', 'review2_right', 'review3_right',
       'review4_right', 'review5_right']
)

### Merge records from Amazon (`ama`) and Rotten Tomatoes (`rt`)

In [726]:
def create_features(dataset):
    """Build the matching features for every candidate pair in ``dataset``.

    Each row of ``dataset`` names an Amazon record (``id1``) and a Rotten
    Tomatoes record (``id 2``). Both records are looked up in the notebook
    globals ``ama`` and ``rt``, placed side by side, and comparison features
    are derived from them.

    Relies on the notebook globals ``ama``, ``rt``, ``compute_time_norm`` and
    ``compute_number_stars_match``; all four must be defined before calling.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``id1`` and ``id 2``.

    Returns
    -------
    pandas.DataFrame
        All ``ama`` and ``rt`` columns (missing values replaced by the string
        ``'0'``) plus ``directors_same``, ``time_norm_left``,
        ``time_norm_right``, ``time_same``, ``time_diff``, one ``star_N``
        column per comma-separated entry of ``star_left`` (at least
        ``star_0`` and ``star_1``) and ``num_match_stars``. The index is a
        fresh 0..n-1 range. A pair whose ids are found in neither table
        contributes no row, so callers that need positional alignment with
        ``dataset`` should compare lengths.

    Raises
    ------
    ValueError
        If ``dataset`` has no rows.
    """
    # Collect the per-pair frames and concatenate once; concatenating inside
    # the loop copies the accumulated frame on every iteration.
    pair_frames = []
    for _, pair in dataset.iterrows():
        amarow = ama[ama['id_left'] == pair['id1']].reset_index(drop=True)
        rtrow = rt[rt['id_right'] == pair['id 2']].reset_index(drop=True)
        pair_frames.append(pd.concat([amarow, rtrow], axis=1))

    if not pair_frames:
        raise ValueError('create_features needs at least one candidate pair')

    # ignore_index gives every row a unique label instead of repeating 0.
    new_data = pd.concat(pair_frames, ignore_index=True)

    # Missing values become the placeholder string '0'.
    # NOTE(review): a field missing on both sides therefore compares equal
    # below (e.g. directors_same == 1) - confirm this is intended.
    new_data = new_data.fillna('0')

    # Compute directors match column
    new_data['directors_same'] = (new_data['director_left'] == new_data['director_right']).astype(int)

    # Compute time columns (durations normalised to whole minutes)
    new_data['time_left'] = new_data['time_left'].astype(str)
    new_data['time_right'] = new_data['time_right'].astype(str)

    # Cast once here so the stored columns are integers rather than a mix of
    # strings and ints.
    new_data['time_norm_left'] = new_data['time_left'].apply(compute_time_norm).astype(int)
    new_data['time_norm_right'] = new_data['time_right'].apply(compute_time_norm).astype(int)
    new_data['time_same'] = (new_data['time_norm_left'] == new_data['time_norm_right']).astype(int)
    new_data['time_diff'] = new_data['time_norm_left'] - new_data['time_norm_right']

    # Compute actors columns: split "A, B, C" into star_0, star_1, star_2, ...
    actors_split = new_data['star_left'].str.split(', ', expand=True)
    actors_split.columns = ['star_' + str(i) for i in range(len(actors_split.columns))]
    # Guarantee the two columns compute_number_stars_match reads by name,
    # even when no row lists a second actor.
    for required in ('star_0', 'star_1'):
        if required not in actors_split.columns:
            actors_split[required] = None
    new_data = pd.concat([new_data, actors_split], axis=1)

    # Hand every left-hand star column to the matcher instead of a fixed
    # 'star_0':'star_1' slice; how many of them are used is up to the matcher.
    left_star_cols = list(actors_split.columns)
    right_star_cols = ['star1_right', 'star2_right', 'star3_right',
                       'star4_right', 'star5_right', 'star6_right']
    cols = left_star_cols + right_star_cols
    new_data['num_match_stars'] = new_data[cols].apply(compute_number_stars_match, axis=1)

    return new_data

In [725]:
train_data_copy = create_features(train)

# Split the comma-separated Amazon cast into star_0, star_1, ... columns.
actors_split = train_data_copy['star_left'].str.split(', ', expand=True)
actors_split_columns = ['star_' + str(i) for i in range(len(actors_split.columns))]
actors_split.columns = actors_split_columns

# create_features already appends star_N columns, so concatenating the full
# split again would leave duplicate column labels. Only add what is missing.
missing_star_cols = [col for col in actors_split_columns if col not in train_data_copy.columns]
train_data_copy = pd.concat([train_data_copy, actors_split[missing_star_cols]], axis=1)

In [724]:
def compute_number_stars_match(row):
    """Count the actors that appear on both sides of a candidate pair.

    Parameters
    ----------
    row : pandas.Series
        One row whose index contains the left-hand cast columns
        (``star_0``, ``star_1``, ...) and the right-hand cast columns
        (``star1_right``, ``star2_right``, ...). Every column following
        those naming patterns is used, so any number of left-hand stars is
        supported.

    Returns
    -------
    int
        Number of distinct names present in both casts. Missing values
        (None/NaN), empty strings and the ``'0'`` fill placeholder are never
        counted as a name.
    """
    def _names(columns):
        # Drop missing entries and placeholders so that "unknown" on both
        # sides is not reported as a shared actor (and so that None is never
        # compared against strings).
        return set(
            value for value in row.loc[columns].tolist()
            if not pd.isnull(value) and value not in ('', '0')
        )

    actors_left = [col for col in row.index if re.match(r'star_[0-9]+$', str(col))]
    actors_right = [col for col in row.index if re.match(r'star[0-9]+_right$', str(col))]

    return len(_names(actors_left) & _names(actors_right))

In [434]:
# Runs of ASCII digits; findall also yields empty strings, which are dropped
# inside compute_time_norm.
regex = re.compile(r'[0-9]*')
# True when the first number in the text is directly followed by an hour unit
# ("2 hours", "2 hr.", "2h").
_hour_first_regex = re.compile(r'^[^0-9]*[0-9]+\s*h(?:ours?|rs?)?\b', re.IGNORECASE)

def compute_time_norm(row):
    """Convert a running-time string to whole minutes.

    Parameters
    ----------
    row : object
        A running time such as ``'2 hours, 9 minutes'``, ``'2 hr. 9 min.'``
        or ``'120 minutes'``. It is converted with ``str()`` first.

    Returns
    -------
    int or None
        * two numbers: ``60 * first + second`` (hours and minutes);
        * one number followed by an hour unit: ``60 * number``;
        * one number otherwise: that number, taken as minutes;
        * anything else (no digits, or three or more numbers): ``None``.
    """
    text = str(row)
    # A list, not filter(): on Python 3 filter() returns an iterator that
    # supports neither len() nor indexing.
    numbers = [int(token) for token in regex.findall(text) if token]

    if len(numbers) == 2:
        return 60 * numbers[0] + numbers[1]
    if len(numbers) == 1:
        if _hour_first_regex.match(text):
            # A bare "2 hours" means 120 minutes, not 2.
            return 60 * numbers[0]
        return numbers[0]
    return None

### Predict with train

In [743]:
features_cols = ['time_diff', 'directors_same', 'num_match_stars']

train_data = create_features(train)

# The labels are attached by position below, so the feature frame must have
# exactly one row per labelled pair; fail loudly instead of silently pairing
# features with the wrong label.
assert len(train_data) == len(train), (
    'create_features returned %d rows for %d training pairs'
    % (len(train_data), len(train))
)

# Reset both indexes on copies so that `train` itself is left untouched.
train_data = pd.concat(
    [train_data.reset_index(drop=True), train.reset_index(drop=True)],
    axis=1,
)

# Remove bad training rows
train_data = train_data[~train_data['id_left'].isin([199, 680])]

# Fixed seeds make the split, and therefore the reported score, repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train_data[features_cols], train_data['gold'], random_state=42)

clf = RandomForestClassifier(random_state=42)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

0.93548387096774188

In [744]:
# Every duration-related column: the raw strings, their normalised values
# and the two comparison features derived from them.
time_cols = [
    'time_left',
    'time_right',
    'time_same',
    'time_diff',
    'time_norm_left',
    'time_norm_right',
]

In [745]:
# Seeded so the list of misclassified rows is the same on every run.
clf = RandomForestClassifier(random_state=42)
clf.fit(train_data[features_cols], train_data['gold'])

preds = clf.predict(train_data[features_cols])

# Missed preds: a non-zero difference marks a row where the prediction and
# the gold label disagree.
errors = preds - train_data['gold']
# Use the underlying array so the positions returned match .iloc below.
error_ind = np.nonzero(errors.values)[0]
train_data.iloc[error_ind]

Unnamed: 0,id_left,time_left,director_left,star_left,cost_left,id_right,time_right,director_right,year_right,star1_right,star2_right,star3_right,star4_right,star5_right,star6_right,rotten_tomatoes_right,audience_rating_right,review1_right,review2_right,review3_right,review4_right,review5_right,directors_same,time_norm_left,time_norm_right,time_same,time_diff,star_0,star_1,star_2,star_3,star_4,num_match_stars,id1,id 2,gold
3,16,"2 hours, 9 minutes",Guy Ritchie,"Robert Downey Jr., Jude Law","Rent HD $3.99,Rent SD $2.99,Buy HD $17.99,Buy ...",2084,2 hr. 9 min.,Guy Ritchie,2009,Robert Downey Jr.,Jude Law,Rachel McAdams,Mark Strong (II),Eddie Marsan,Robert Maillet,70,77,There's plenty of fun and entertainment to be ...,"... Bears the stamp of its director, Guy Ritch...",The plot races and roars through a deliciously...,"Ritchie provides big dumb fun, but it's hard t...",Proof that Guy Richie can still make good film...,1,129,129,1,0,Robert Downey Jr.,Jude Law,,,,2,16,2084,0
60,457,120 minutes,Wes Anderson,Ralph Fiennes,$24.97,5564,1 hr. 39 min.,Wes Anderson,2014,Ralph Fiennes,F Murray Abraham,Edward Norton,Saoirse Ronan,Tilda Swinton,Bill Murray,92,86,"""The Grand Budapest Hotel"" is not his grandest...",The Grand Budapest Hotel is not a callous work...,Wes Anderson's superb eye for visual storytell...,A grownup storybook of a movie spun out of can...,Even a squirrelly director finds a nut once in...,1,120,99,0,21,Ralph Fiennes,,,,,1,457,5564,1
96,756,136 minutes,Michael Bay,Ewan McGregor,$9.14,777,2 hr. 16 min.,Michael Bay,2005,Ewan McGregor,Scarlett Johansson,Djimon Hounsou,Sean Bean,Steve Buscemi,Michael Clarke Dunca...,40,64,"As usual, Bay stages the action at a breakneck...",A grim example of the worst impulses of Hollyw...,"For the second half of the film, it's all chas...","The Island is very much a Michael Bay film, wi...",Only in a Bay movie can we sit for two hours w...,1,136,136,1,0,Ewan McGregor,,,,,1,756,777,0
97,770,87 minutes,Wes Anderson,George Clooney,$22.95,2080,1 hr. 27 min.,Wes Anderson,2009,George Clooney,Meryl Streep,Jason Schwartzman,Bill Murray,Wallace Wolodarsky,Eric Chase Anderson,92,84,"Visually, the movie is a wonder, with its prof...","Sometimes too clever by half, the film, replet...",A boisterous and magnificent treat.,If Fantastic Mr Fox feels like Anderson's fres...,"Fantastic Mr. Fox is gorgeous and fanciful, wi...",1,87,87,1,0,George Clooney,,,,,1,770,2080,0
164,1701,122 minutes,Sam Peckinpah,Steve McQueen,$12.99,552,2 hr. 2 min.,Sam Peckinpah,1972,Steve McQueen,Ali MacGraw,Al Lettieri,Sally Struthers,Ben Johnson,Slim Pickens,85,82,The action and the violence of The Getaway are...,It's like one of those devices for executive d...,0,0,Still the best action movie I've ever seen.,1,122,122,1,0,Steve McQueen,,,,,1,1701,552,0
178,1942,218 minutes,Paul Greengrass,Matt Damon,$10.99,1391,1 hr. 55 min.,Paul Greengrass,2007,Matt Damon,Julia Stiles,David Strathairn,Scott Glenn,Paddy Considine,Edgar Ramirez,93,91,Bourne -- or whatever his name really is -- is...,While the crunchy fights and unflagging pace e...,"As an action film, The Bourne Ultimatum is in ...",[Greengrass'] skillful use of handheld cameras...,"t's full of tension, but it's also disturbing ...",1,218,115,0,103,Matt Damon,,,,,1,1942,1391,0
180,1996,113 minutes,Neil Marshall,Rhona Mitra,$9.58,1559,1 hr. 49 min.,Neil Marshall,2008,Rhona Mitra,Bob Hoskins,Adrian Lester,Alexander Siddig,Malcolm McDowell,David O'Hara,50,44,Marshall's adrenalin-fuelled skill and enthusi...,...a Whitman's sampler of bizarre story ideas ...,"I somehow ended up admiring ""Doomsday"" for bei...","In terms of sheer excitement, it's the best mo...",As it slogs through one hectic yet mundane set...,1,113,109,0,4,Rhona Mitra,,,,,1,1996,1559,1


### Predict with test

In [746]:
features_cols = ['time_same', 'directors_same', 'num_match_stars']
test_data = create_features(test)

# Predictions are written out by position, so there must be exactly one
# feature row per test pair.
assert len(test_data) == len(test), (
    'create_features returned %d rows for %d test pairs'
    % (len(test_data), len(test))
)

# Seeded so the written predictions are repeatable.
clf = RandomForestClassifier(random_state=42)
clf.fit(train_data[features_cols], train_data['gold'])
test_preds = clf.predict(test_data[features_cols])

In [747]:
# Wrap the predictions in a one-column frame named 'gold' and write it out
# without the index column.
test_preds = pd.DataFrame(test_preds, columns=['gold'])
test_preds.to_csv('test_gold.csv', index=False)

### Predict with Holdout

In [748]:
features_cols = ['time_same', 'directors_same', 'num_match_stars']
holdout_data = create_features(holdout)

# Predictions are written out by position, so there must be exactly one
# feature row per holdout pair.
assert len(holdout_data) == len(holdout), (
    'create_features returned %d rows for %d holdout pairs'
    % (len(holdout_data), len(holdout))
)

# Seeded so the written predictions are repeatable.
clf = RandomForestClassifier(random_state=42)
clf.fit(train_data[features_cols], train_data['gold'])
holdout_preds = clf.predict(holdout_data[features_cols])

In [749]:
# Wrap the predictions in a one-column frame named 'gold' and write it out
# without the index column.
holdout_preds = pd.DataFrame(holdout_preds, columns=['gold'])
holdout_preds.to_csv('holdout_gold.csv', index=False)