In [10]:
# import libraries
% matplotlib inline
import numpy as np
import pandas as pd
import io
import math
import string
import statistics
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from functools import partial
from operator import is_not
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support as score

def _duration_to_minutes(durations, sep, hour_token, minute_token):
    """Convert a Series of textual durations into whole minutes.

    Each value is split on ``sep``. The first run of digits in the leading
    piece is counted as hours (x60) when that piece contains ``hour_token``
    and as minutes when it contains ``minute_token``. The first run of digits
    in the second piece, if any, is always added as minutes. Missing values
    and pieces without digits contribute 0.

    Returns an integer Series aligned with ``durations``.
    """
    pieces = durations.str.split(sep, expand=True)
    lead = pieces[0]
    # Raw-string pattern: '\d' inside a normal string literal is an invalid
    # escape sequence.
    lead_number = pd.to_numeric(
        lead.str.extract(r'(\d+)', expand=False), errors='coerce').fillna(0)
    if 1 in pieces.columns:
        rest_number = pd.to_numeric(
            pieces[1].str.extract(r'(\d+)', expand=False),
            errors='coerce').fillna(0)
    else:
        # No value contained the separator, so there is no second piece.
        rest_number = pd.Series(0, index=durations.index)
    is_hours = lead.str.contains(hour_token, regex=False, na=False).astype(bool)
    is_minutes = lead.str.contains(minute_token, regex=False, na=False).astype(bool)
    total = (rest_number
             + 60 * lead_number.where(is_hours, 0)
             + lead_number.where(is_minutes, 0))
    return total.astype(int)


def preprocessing(df1, df2):
    """Clean both raw tables and return them as ``(df1, df2)``.

    Neither input frame is modified; new frames are returned.

    df1:
      * the column at position 4 is dropped;
      * rows whose ``star`` value starts with a digit have that value moved
        into ``time`` and ``star`` replaced by the literal string ``'None'``;
      * ``star`` is split on ',' into ``star1`` .. ``star5`` (as many as the
        split yields) and the column at position 3 is then dropped;
      * ``id`` is cast to int and ``time`` is converted to minutes using the
        ',' separator and the 'hour' / 'minute' tokens.

    df2:
      * the columns at positions 3 and 10-16 are dropped;
      * remaining missing values are replaced with ``None``;
      * ``id`` is cast to int and ``time`` is converted to minutes using the
        '.' separator and the 'hr' / 'min' tokens.
    """
    # ---- first table -----------------------------------------------------
    df1 = df1.drop(df1.columns[[4]], axis=1)
    # .eq(True) turns missing results (non-string star values) into False.
    misplaced = df1['star'].str[0].str.isdigit().eq(True)
    df1.loc[misplaced, 'time'] = df1.loc[misplaced, 'star']
    # The placeholder is the *string* 'None'; it is kept as-is because the
    # split below turns it into the star1 value for these rows.
    df1.loc[misplaced, 'star'] = 'None'
    star_columns = df1['star'].str.split(',', expand=True)
    df1 = pd.concat([df1, star_columns], axis=1)
    df1 = df1.drop(df1.columns[[3]], axis=1)
    df1 = df1.rename(
        columns={0: 'star1', 1: 'star2', 2: 'star3', 3: 'star4', 4: 'star5'})
    df1['id'] = df1['id'].astype(int)
    df1['time'] = _duration_to_minutes(df1['time'], ',', 'hour', 'minute')

    # ---- second table ----------------------------------------------------
    # Drop first so every later assignment lands on a new frame rather than
    # on the caller's object.
    df2 = df2.drop(df2.columns[[3, 10, 11, 12, 13, 14, 15, 16]], axis=1)
    df2 = df2.where(pd.notnull(df2), None)
    df2['id'] = df2['id'].astype(int)
    df2['time'] = _duration_to_minutes(df2['time'], '.', 'hr', 'min')
    return df1, df2

def create_tidy_set(df1, df2, data):
    """Build the three-feature matrix for the record pairs listed in ``data``.

    Parameters:
        df1, df2: tables with an ``id`` column plus ``time`` and ``director``
            columns. After ``id`` becomes the index, positions 2:7 of ``df1``
            and 2:8 of ``df2`` are read as the star-name columns
            (this matches the frames returned by ``preprocessing``).
        data: frame whose ``'id1'`` and ``'id 2'`` columns hold the ids to
            look up in ``df1`` and ``df2`` respectively; row k of the result
            describes the k-th pair.

    Returns:
        DataFrame with columns ``train_time`` (absolute time difference),
        ``train_director_score`` (``fuzz.ratio`` of the two directors) and
        ``train_stars_score`` (see the loop below).
    """
    # Positional 0..n-1 index on both sides so row k lines up with pair k.
    left = df1.set_index(['id']).loc[data['id1']].reset_index(drop=True)
    right = df2.set_index(['id']).loc[data['id 2']].reset_index(drop=True)

    time_gap = (left['time'] - right['time']).abs()

    director_scores = []
    star_scores = []
    for row in range(len(left)):
        director_scores.append(
            fuzz.ratio(left['director'].iat[row], right['director'].iat[row]))

        # pd.notnull rejects None and every NaN object, not only the
        # np.nan singleton that an identity test would catch.
        own_stars = [s for s in left.iloc[row, 2:7] if pd.notnull(s)]
        # Missing candidates are passed along as None, the representation
        # preprocessing() already uses for df2.
        candidates = [c if pd.notnull(c) else None
                      for c in right.iloc[row, 2:8]]
        if not own_stars or not candidates:
            star_scores.append(0)
            continue

        # Best fuzzy-match score of each df1 star against the df2 stars.
        matches = [process.extractOne(s, candidates)[1] for s in own_stars]
        if len(matches) == 1:
            # A single star gets a fixed companion score of 45 so that the
            # max/min combination below still has two values to work with.
            matches.append(45)
        best = float(max(matches)) / 100
        worst = float(min(matches)) / 100
        star_scores.append((best ** 2 + worst) * 50)

    return pd.DataFrame(
        {
            'train_time': time_gap,
            'train_director_score': pd.Series(
                director_scores, index=left.index, dtype=float),
            'train_stars_score': pd.Series(
                star_scores, index=left.index, dtype=float),
        },
        columns=['train_time', 'train_director_score', 'train_stars_score'],
    )

def create_classifier(X_train, y_train, X_test, y_test):
    """Fit a RobustScaler + AdaBoost pipeline and report its test metrics.

    The pipeline is trained on ``X_train`` / ``y_train``; per-class
    precision, recall, f-score and support for its predictions on
    ``X_test`` (compared with ``y_test``) are printed, one metric per line.

    Returns the fitted pipeline.
    """
    model = make_pipeline(RobustScaler(), AdaBoostClassifier())
    model.fit(X_train, y_train)

    metrics = score(y_test, model.predict(X_test))
    templates = ('precision: {}', 'recall: {}', 'fscore: {}', 'support: {}')
    # `score` returns its four arrays in exactly this order.
    for template, values in zip(templates, metrics):
        print(template.format(values))
    return model

def main():
    """Run the whole pipeline: load, clean, featurize, train, predict, save.

    Reads the two source tables and the train / test / holdout pair lists
    from CSV files in the working directory, trains a classifier on a random
    split of the training pairs, and writes the predicted ``gold`` labels
    for the test pairs to ``y_cross.csv`` and for the holdout pairs to
    ``gold.csv``.
    """
    # Load data
    amazon = pd.read_csv('amazon.csv')
    rotten = pd.read_csv('rotten_tomatoes.csv', encoding='ISO-8859-1')
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    holdout = pd.read_csv('holdout.csv')

    # Clean both tables, then turn each pair list into a feature matrix.
    amazon, rotten = preprocessing(amazon, rotten)
    train_features = create_tidy_set(amazon, rotten, train)
    cross_features = create_tidy_set(amazon, rotten, test)
    holdout_features = create_tidy_set(amazon, rotten, holdout)

    # Hold back part of the labelled pairs to evaluate the fitted model.
    fit_X, eval_X, fit_y, eval_y = train_test_split(
        train_features, train['gold'])
    model = create_classifier(fit_X, fit_y, eval_X, eval_y)

    # Predict everything first, then write the results out.
    outputs = [
        (model.predict(cross_features), 'y_cross.csv'),
        (model.predict(holdout_features), 'gold.csv'),
    ]
    for labels, path in outputs:
        pd.DataFrame(labels, columns=['gold']).to_csv(path, index=False)


if __name__ == "__main__":
    main()

precision: [ 0.98275862  1.        ]
recall: [ 1.          0.83333333]
fscore: [ 0.99130435  0.90909091]
support: [57  6]
