In [1]:
import os
import pandas as pd
import numpy as np
from math import sqrt
import datetime
from datetime import date
from itertools import repeat 
from progressbar import ProgressBar
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LinearRegression 
from sklearn import metrics 
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, accuracy_score
import time

In [2]:
start_time = time.time()

In [3]:
os.chdir("../")

In [4]:
#Loading the datasets
train = pd.read_csv('./data/road-train-pre.csv', error_bad_lines=False)
test = pd.read_csv('./data/road-test-pre.csv', error_bad_lines=False)

In [4]:
#Sort the datasets and trainsofm to datetime
train['event time:timestamp'] = pd.to_datetime(train['event time:timestamp'])
test['event time:timestamp'] = pd.to_datetime(test['event time:timestamp'])
train=train.sort_values(by=["case concept:name", "event time:timestamp"])
test=test.sort_values(by=["case concept:name", "event time:timestamp"])

In [5]:
#added new column to calculate time differences
#didn't actually use time diffs, since accuracy went down
train['diff']=train.groupby('case concept:name')['event time:timestamp'].diff().fillna(pd.Timedelta(seconds=0))
test['diff'] = test.groupby('case concept:name')['event time:timestamp'].diff().fillna(pd.Timedelta(seconds=0))

In [6]:
#convert them to negative values, to prevent same labeling as cases
train['diff'] = train['diff'].dt.days *-1
test['diff'] = test['diff'].dt.days *-1

In [7]:
cases = list(train['event concept:name'].unique()) + list(test['event concept:name'].unique())
cases.append('New Case')
cases = list(set(cases))

# Decision Tree

Train Data

In [8]:
train.to_csv("fixed.csv")

file = open('fixed.csv', 'r')
log = dict()
with open('fixed.csv', 'r') as file:
    next(file)
    for line in file:
        line = line.strip()
        if len(line) == 0:
            continue
        parts = line.split(',')
        caseid = parts[2]

        task = parts[3]
        #added two lines
        #first one to have numeric labels for weekdays, +len cases to prevent same labeling
        #last one is only for the predicted_df in the end
        timestamp =  pd.to_datetime(parts[5]).weekday() + len(cases)
        timestamp_full = parts[5]
        if caseid not in log:
            log[caseid] = [[],[],[]]
        
        log[caseid][0].append(task)
        
        #append the timestamps.
        log[caseid][1].append(timestamp_full)
        log[caseid][2].append(timestamp)
        
        
file.close()

os.remove('fixed.csv')

In [9]:
for i in log.keys():
    current = log[i][0]  # recording the current case' events
    
    real_next = current[1:]  # next real event
    real_next.append('New Case')  # adding a 'new case' as real next event for every last event
    
    log[i].append(real_next)  # adding the real next events to the log file

Test Data

In [10]:
test.to_csv("fixed_test.csv")

file = open('fixed_test.csv', 'r')
log_test = dict()
with open('fixed_test.csv', 'r') as file:
    next(file)
    for line in file:
        line = line.strip()
        if len(line) == 0:
            continue
        parts = line.split(',')
        caseid = parts[2]
        #same as training
        task = parts[3]
        timestamp =  pd.to_datetime(parts[5]).weekday() + len(cases)
        timestamp_full = parts[5]

        if caseid not in log_test:
            log_test[caseid] = [[],[],[]]

        log_test[caseid][0].append(task)
        log_test[caseid][1].append(timestamp_full)
        log_test[caseid][2].append(timestamp)
        
file.close()

os.remove('fixed_test.csv')

In [11]:
for i in log_test.keys():
    current = log_test[i][0]  # current case' events
    
    real_next = current[1:]  # next real event
    real_next.append('New Case')  # adding a 'new case' as real next event for every last event
    log_test[i].append(real_next)

Store Data Train

In [12]:
train_data = {} 

for i in log.keys():
    for x in log[i][0]:
        case = log[i][0]
        ind = log[i][0].index(x)
        time = log[i][2]
        
        if ind not in train_data:
            train_data[ind] = [[],[],[]]
        
        
        train_data[ind][0].append(case[:ind+1])  # appending the trace
        
        #append time, only time for last event since appending the whole trace decreased accuracy
        train_data[ind][2].append(time[ind])
        if ind < len(case)-1:
            train_data[ind][1].append(case[ind+1])  # appending the next event of the trace
            
        elif ind == len(case)-1:
            train_data[ind][1].append('New Case')
           



In [13]:
test_data = {} 

for i in log_test.keys():
    for x in log_test[i][0]:
        case = log_test[i][0]
        ind = log_test[i][0].index(x)
        diff = log_test[i][1]
        times = log_test[i][2]
        if ind not in test_data:
            test_data[ind] = [[],[],[],[]]
        
        #same as above
        test_data[ind][0].append(case[:ind+1])  # appending the trace
        test_data[ind][2].append(times[ind])
        if ind < len(case)-1:
            test_data[ind][1].append(case[ind+1])  # appending the next event of the trace
            
        elif ind == len(case)-1:
            test_data[ind][1].append('New Case')

Encoding

In [14]:
cases = list(train['event concept:name'].unique()) + list(test['event concept:name'].unique())
cases.append('New Case')
cases = list(set(cases))
le = preprocessing.LabelEncoder()
le.fit(cases)  # encoding all event names into integers

LabelEncoder()

In [15]:
pbar = ProgressBar()

for i in pbar(train_data.keys()):
    encoded = []
    for trace in train_data[i][0]:  # encoding all strings into integers in the trace
        local_encoded = []
        for event in trace:
            local_encoded.append(int(le.transform([event])))
        encoded.append(local_encoded)
    
    train_data[i][0] = np.array(encoded)
    
    
    encoded_next = []
    for g in train_data[i][1]:
        encoded_next.append(int(le.transform([g])))
                            
                            
    train_data[i][1] = np.array(encoded_next)

100% |########################################################################|


In [16]:
pbar = ProgressBar()

for i in pbar(test_data.keys()):
    
    encoded = []
    for trace in test_data[i][0]:  # encoding all strings into integers in the trace
        local_encoded = []
        for event in trace:
            local_encoded.append(int(le.transform([event])))
        encoded.append(local_encoded)
    
    test_data[i][0] = np.array(encoded)
    
    
    encoded_next = []
    for g in test_data[i][1]:
        encoded_next.append(int(le.transform([g])))
                            
                            
    test_data[i][1] = np.array(encoded_next)

100% |########################################################################|


Training

In [17]:
def decision_tree(pos):
    x_train= train_data[pos][0]
    #combine full trace data with weekday
    x_week = np.array(train_data[pos][2]).reshape(-1,1)
    x_new = np.concatenate((x_train, x_week), axis=1)
    y_train= train_data[pos][1]

    classifier = DecisionTreeClassifier()
    classifier.fit(x_new, y_train)
    
    return classifier

In [18]:
predictors = {}

for i in range(len(test_data)):
    if i >= len(train_data) - 1:
        predictors[i] = decision_tree(len(train_data) - 1)
        
    else:
        predictors[i] = decision_tree(i)

Adding Predictions

In [19]:
pbar = ProgressBar()

for i in pbar(log_test.keys()):
    current = log_test[i][0]
    
    
    encoded = []
    for g in current:
        encoded.append(int(le.transform([g])))
    encoded = np.array(encoded)
    log_test[i].append(encoded)

100% |########################################################################|


In [20]:
pbar = ProgressBar()
for i in pbar(log_test.keys()):
    
    current_encoded = log_test[i][4]
    times = log_test[i][1]
    weeks = log_test[i][2]
    predictions = []
    
    for x in current_encoded:
        ind = list(current_encoded).index(x)
        
        if ind >= len(train_data) - 1:
            
            tree = predictors[len(train_data) - 1]
            p_trace = current_encoded[:(len(train_data))].reshape(-1, 11)
            #Create new array with full trace and weekdays, same as with train
            p_weeks = np.array(weeks[len(train_data)-1]).reshape(-1,1)
            p_new = np.concatenate((p_trace, p_weeks), axis=1)
            pred = tree.predict(p_new)
            pred_string = le.inverse_transform(pred)[0]
            predictions.append(pred_string)
            
            
            
        else:
            tree = predictors[ind]
            p_trace = current_encoded[:ind+1].reshape(-1,ind+1)
            #same as above
            p_weeks = np.array(weeks[ind]).reshape(-1,1)
            p_new = np.concatenate((p_trace, p_weeks), axis=1)
            pred = tree.predict(p_new)
            pred_string = le.inverse_transform(pred)[0]
            predictions.append(pred_string)
        
    log_test[i].append(predictions)

100% |########################################################################|


Evaluation

In [21]:
case_names = []
event_names = []
timestamp = []
p_event = []
current_real = []

for i in log_test.keys():
    for x in range(len(log_test[i][0])):
        case_names.append(i)
        event_names.append(log_test[i][0][x])
        #here we use the full timestamp, to ensure datetimes remain the same
        timestamp.append(log_test[i][1][x])
        p_event.append(log_test[i][len(log_test[i])-1][x])
        current_real.append(log_test[i][3][x])


frame_dict = {'Case_ID': case_names, 'Event_Name': event_names,
              'TimeStamp': timestamp, 'Next_Event': current_real, 'Predicted_Event': p_event}
predicted_df = pd.DataFrame.from_dict(frame_dict)

event_real = np.array(predicted_df['Next_Event'])
event_pred = np.array(predicted_df['Predicted_Event'])

acc = accuracy_score(event_real, event_pred)
print('Accuracy for event prediction TEST SET: {}%'.format(round(acc, 2) * 100))

Accuracy for event prediction TEST SET: 69.0%


In [22]:
predicted_df

Unnamed: 0,Case_ID,Event_Name,TimeStamp,Next_Event,Predicted_Event
0,A28905,Create Fine,2009-09-25,Send Fine,Send Fine
1,A28905,Send Fine,2010-01-19,Insert Fine Notification,Insert Fine Notification
2,A28905,Insert Fine Notification,2010-08-02,Add penalty,Add penalty
3,A28905,Add penalty,2010-09-04,Send for Credit Collection,Send for Credit Collection
4,A28905,Send for Credit Collection,2012-03-26,New Case,New Case
...,...,...,...,...,...
96346,V19305,Send Appeal to Prefecture,2012-02-24,New Case,New Case
96347,V19308,Create Fine,2011-07-10,Send Fine,Send Fine
96348,V19308,Send Fine,2012-01-30,Insert Fine Notification,Insert Fine Notification
96349,V19308,Insert Fine Notification,2012-02-24,Add penalty,Add penalty


# Linear Regression 

Train Data Preprocessing

In [23]:
#Add new useful columns for the model train
train['position_event']=train.groupby('case concept:name').cumcount()
train['position_event']=train['position_event']+1
train['week_day']=train['event time:timestamp'].dt.dayofweek

In [24]:
#Encoding all event names into integers
cases = train['event concept:name'].unique().tolist()
cases.insert(0, 'New Case')
le_case = preprocessing.LabelEncoder()
le_case.fit(cases)

LabelEncoder()

In [25]:
#Encoding lifecycle into integers
life = train['event lifecycle:transition'].unique().tolist()
le_life = preprocessing.LabelEncoder()
le_life.fit(life)

LabelEncoder()

In [26]:
#Preprocess data for model train
#Event poistion
x_train_position = np.array(train['position_event']).reshape(-1,1)[:]
#Previous event
x_train_prev = list(train['event concept:name'])
x_train_prev= le_case.transform(x_train_prev)
x_train_prev = np.array(x_train_prev).reshape(-1,1)[:]
# Event
x_train_event = list(train['event concept:name'])
x_train_event.insert(len(train), 'New Case')
x_train_event= le_case.transform(x_train_event)
x_train_event = np.array(x_train_event).reshape(-1,1)[1:]
#Day of the week previous event event
x_train_week = list(train['week_day'])
x_train_week = np.array(x_train_week).reshape(-1,1)[:]
#Timestamp event
train[['event time:timestamp']] = train[['event time:timestamp']].astype(str)
x_train_date = list(train['event time:timestamp'])
x_train_date.insert(len(train), None)
x_train_date=np.array(x_train_date).reshape(-1,1)[1:]
#Timestamp previous event
x_train_date_prev = list(train['event time:timestamp'])
x_train_date_prev=np.array(x_train_date_prev).reshape(-1,1)[:]
#Event Lifecycle
x_train_life = list(train['event lifecycle:transition'])
x_train_life= le_life.transform(x_train_life)
x_train_life = np.array(x_train_life).reshape(-1,1)[:]

In [27]:
#Length case for train set
cases = train.groupby(['case concept:name'])
per_case = pd.DataFrame({'no of events':cases['eventID '].count()})
lst_per_case = per_case["no of events"].tolist()
case_length = []
for length in lst_per_case:
    case_length.extend(repeat(length, length))
x_train_length_case=np.array(case_length).reshape(-1,1)[:]

In [28]:
#Combine features for the model train
x_train_new = np.concatenate((x_train_position,x_train_prev, x_train_event, x_train_week, x_train_date, x_train_date_prev, x_train_length_case, x_train_life), axis=1)

In [29]:
#Add features to new dataframe train
df_train = pd.DataFrame(data=x_train_new, columns=['position_event', 'prev_event', 'event', 'week_day_prev', 'date', 'date_prev', 'case_length', 'lifecycle'])
df_train.loc[df_train['position_event'] == df_train['case_length'], 'event'] = 5
df_train[['date','date_prev']] = df_train[['date','date_prev']].apply(pd.to_datetime)
df_train.loc[df_train['event'] == 5, 'date'] = None
df_train['in_between'] = (df_train['date'] - df_train['date_prev']).dt.days
df_train.loc[df_train['event'] == 5, 'in_between'] = 0
df_train

Unnamed: 0,position_event,prev_event,event,week_day_prev,date,date_prev,case_length,lifecycle,in_between
0,1,10,2,4,2006-07-24,2006-05-12,2,0,73.0
1,2,2,5,0,NaT,2006-07-24,2,0,0.0
2,1,2,10,2,2006-12-12,2006-02-08,5,0,307.0
3,2,10,4,1,2007-01-15,2006-12-12,5,0,34.0
4,3,4,0,0,2007-03-16,2007-01-15,5,0,60.0
...,...,...,...,...,...,...,...,...,...
455366,1,4,2,3,2002-07-09,2002-04-11,5,0,89.0
455367,2,2,10,1,2002-10-25,2002-07-09,5,0,108.0
455368,3,10,0,4,2003-03-01,2002-10-25,5,0,127.0
455369,4,0,11,5,2004-10-01,2003-03-01,5,0,580.0


Train Dummies

In [30]:
#Implementing dummies train
df_train=pd.get_dummies(df_train, columns=['event', 'prev_event', 'week_day_prev', 'position_event', 'lifecycle'])
df_train = df_train.drop(['date', 'date_prev'], 1)
df_train

Unnamed: 0,case_length,in_between,event_0,event_1,event_2,event_3,event_4,event_5,event_6,event_7,...,position_event_9,position_event_10,position_event_11,position_event_12,position_event_13,position_event_14,position_event_15,position_event_16,position_event_17,lifecycle_0
0,2,73.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,0.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,5,307.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,5,34.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,60.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455366,5,89.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
455367,5,108.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
455368,5,127.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
455369,5,580.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Test Data Preprocessing

In [31]:
#Add new useful columns for the model test
test['position_event']=test.groupby('case concept:name').cumcount()
test['position_event']=test['position_event']+1
test['week_day']=test['event time:timestamp'].dt.dayofweek
predicted_events=predicted_df['Predicted_Event'][:].tolist()
test['pred_event']=predicted_events

In [32]:
#Preprocess data for model test
#Event poistion
x_test_position = np.array(test['position_event']).reshape(-1,1)[:]
#Previous event
x_test_prev = test['event concept:name'].tolist()
x_test_prev = le_case.transform(x_test_prev)
x_test_prev = np.array(x_test_prev).reshape(-1,1)[:]
#Predicted Event
x_test_event = test['pred_event'].tolist()
x_test_event= le_case.transform(x_test_event)
x_test_event = np.array(x_test_event).reshape(-1,1)[:]
#Day of the week previous event
x_test_week = test['week_day'].tolist()
x_test_week = np.array(x_test_week).reshape(-1,1)[:]
#Timestamp event
test[['event time:timestamp']] = test[['event time:timestamp']].astype(str)
x_test_date = list(test['event time:timestamp'])
x_test_date.insert(len(test), None)
x_test_date=np.array(x_test_date).reshape(-1,1)[1:]
#Timestamp previous event
x_test_date_prev = list(test['event time:timestamp'])
x_test_date_prev=np.array(x_test_date_prev).reshape(-1,1)[:]
#Event Lifecycle
x_test_life = test['event lifecycle:transition'].tolist()
x_test_life= le_life.transform(x_test_life)
x_test_life = np.array(x_test_life).reshape(-1,1)[:]

In [33]:
#Length case for test set
test_cases = test.groupby(['case concept:name'])
per_case_test = pd.DataFrame({'no of events':test_cases['eventID '].count()})
lst_per_case_test = per_case_test["no of events"].tolist()
case_length_test = []
for length in lst_per_case_test:
    case_length_test.extend(repeat(length, length))
x_test_length_case=np.array(case_length_test).reshape(-1,1)[:]

In [34]:
#Combine features for the model test
x_test_new = np.concatenate((x_test_position ,x_test_prev, x_test_event, x_test_week, x_test_date, x_test_date_prev, x_test_length_case, x_test_life), axis=1)

In [35]:
#Add features to new dataframe test
df_test = pd.DataFrame(data=x_test_new, columns=['position_event', 'prev_event', 'event', 'week_day_prev', 'date', 'date_prev', 'case_length', 'lifecycle'])
df_test.loc[df_test['position_event'] == df_test['case_length'], 'date'] = None
df_test[['date','date_prev']] = df_test[['date','date_prev']].apply(pd.to_datetime)
df_test['in_between'] = (df_test['date'] - df_test['date_prev']).dt.days
df_test.loc[df_test['position_event'] == df_test['case_length'], 'in_between'] = 0
df_test

Unnamed: 0,position_event,prev_event,event,week_day_prev,date,date_prev,case_length,lifecycle,in_between
0,1,2,10,4,2010-01-19,2009-09-25,5,0,116.0
1,2,10,4,1,2010-08-02,2010-01-19,5,0,195.0
2,3,4,0,0,2010-09-04,2010-08-02,5,0,33.0
3,4,0,11,5,2012-03-26,2010-09-04,5,0,569.0
4,5,11,5,0,NaT,2012-03-26,5,0,0.0
...,...,...,...,...,...,...,...,...,...
96346,6,9,5,4,NaT,2012-02-24,6,0,0.0
96347,1,2,10,6,2012-01-30,2011-07-10,4,0,204.0
96348,2,10,4,0,2012-02-24,2012-01-30,4,0,25.0
96349,3,4,0,4,2012-04-24,2012-02-24,4,0,60.0


In [36]:
#Remove cases with more events than the cases in the train set
df_test=df_test[df_test['case_length']<=max(df_train['case_length'])]
df_test

Unnamed: 0,position_event,prev_event,event,week_day_prev,date,date_prev,case_length,lifecycle,in_between
0,1,2,10,4,2010-01-19,2009-09-25,5,0,116.0
1,2,10,4,1,2010-08-02,2010-01-19,5,0,195.0
2,3,4,0,0,2010-09-04,2010-08-02,5,0,33.0
3,4,0,11,5,2012-03-26,2010-09-04,5,0,569.0
4,5,11,5,0,NaT,2012-03-26,5,0,0.0
...,...,...,...,...,...,...,...,...,...
96346,6,9,5,4,NaT,2012-02-24,6,0,0.0
96347,1,2,10,6,2012-01-30,2011-07-10,4,0,204.0
96348,2,10,4,0,2012-02-24,2012-01-30,4,0,25.0
96349,3,4,0,4,2012-04-24,2012-02-24,4,0,60.0


Test Dummies

In [37]:
#Implementing dummies test
df_test=pd.get_dummies(df_test, columns=['event', 'prev_event', 'week_day_prev', 'position_event', 'lifecycle'])
df_test = df_test.drop(['date', 'date_prev'], 1)
df_test

Unnamed: 0,case_length,in_between,event_0,event_1,event_2,event_3,event_4,event_5,event_6,event_7,...,position_event_7,position_event_8,position_event_9,position_event_10,position_event_11,position_event_12,position_event_13,position_event_14,position_event_15,lifecycle_0
0,5,116.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,5,195.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,5,33.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,5,569.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96346,6,0.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
96347,4,204.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
96348,4,25.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
96349,4,60.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Feature selection and model training

In [38]:
col_train=df_train.columns
col_test=df_test.columns
features=set(col_train).intersection(col_test)
features.discard('in_between')
X_train = df_train[features] # Features
y_train = df_train['in_between'] # Target variable
X_test = df_test[features] # Features
y_test = df_test['in_between'] # Target variable

#Training the algorithm
regressor = LinearRegression()  
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [39]:
print(regressor.intercept_)
print(regressor.coef_)

-4019684381030.4917
[ 5.09120620e+12  1.03415453e+12 -2.54724469e+01  5.09120620e+12
 -2.57414544e+01  5.09120620e+12  1.03415453e+12  1.03415453e+12
 -2.80863346e+01 -2.10573304e+12  1.03415453e+12  5.09120620e+12
 -2.10573304e+12  1.66919093e+00  1.03415453e+12  1.03415453e+12
 -7.00545127e+01 -2.10573304e+12 -2.10573304e+12  5.09120620e+12
 -7.40265535e+01  5.09120620e+12 -3.50667282e+01 -5.40064505e+00
  5.09120620e+12 -2.10573304e+12  5.09120620e+12  8.99751860e+00
 -7.96461844e+01 -2.10573304e+12  1.03415453e+12 -2.10573304e+12
  1.03415453e+12 -1.70448865e+01  1.03415453e+12  1.03415453e+12
  5.09120620e+12  1.03415453e+12 -1.44793125e+01  5.09120620e+12
  5.09120620e+12 -2.19391876e+01  5.66819296e+07  5.09120620e+12
  2.36453561e+00 -2.96133271e+01  1.79014571e+01]


Evaluation

In [40]:
y_pred = regressor.predict(X_test)
df_predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_predictions

Unnamed: 0,Actual,Predicted
0,116.0,99.180176
1,195.0,60.246582
2,33.0,66.788086
3,569.0,576.601562
4,0.0,-0.291504
...,...,...
96346,0.0,-4.975586
96347,204.0,104.351562
96348,25.0,61.404297
96349,60.0,73.202148


In [41]:
#R-squared value for the model train
regressor.score(X_train, y_train)

0.7025703424636746

In [42]:
#R-squared value for the model test
regressor.score(X_test, y_test)

-0.3210899050031637

In [43]:
#Root Mean Squared Error of the model
rmse = sqrt(mean_squared_error(y_test, y_pred))
rmse

166.3949663609857

In [44]:
print("--- %s seconds ---" % (time.time() - start_time))

AttributeError: 'list' object has no attribute 'time'