In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import  accuracy_score, confusion_matrix

## Adjust the following blocks
so that they load the template file I provided and the CSV files containing your submissions.

In [3]:
# For really scoring your submissions a file containing the true labels will be used.
# For checking your submission formally now, we check against a table derived from the
# test set template by filling its label column with random labels.
reference = pd.read_csv('tip25_testdata_template.csv')
n = len(reference.index)
print('Size of test set: ', n)
# A seeded generator keeps the placeholder labels (and therefore every score printed
# further down) identical across "Restart Kernel -> Run All".
# Bracket assignment always writes a real column; attribute assignment
# (`reference.tip = ...`) would only set a plain Python attribute if the template
# had no 'tip' column yet.
reference['tip'] = np.random.default_rng(0).choice([False, True], size=n)
reference.head()

Size of test set:  39419


Unnamed: 0.1,Unnamed: 0,order_id,tip
0,17,2196797,False
1,34,1827621,False
2,64,2461523,True
3,156,2614670,True
4,175,3110252,False


In [4]:
# Deliverable 1 of the main task: the submission file holding the True/False tip classification.
submission_A = pd.read_csv(filepath_or_buffer='Team_2_submission_A.csv')
# Show the first five rows as a quick visual check of the loaded columns.
submission_A.head(n=5)

Unnamed: 0.1,Unnamed: 0,order_id,tip
0,17,2196797,False
1,34,1827621,True
2,64,2461523,True
3,156,2614670,False
4,175,3110252,False


In [5]:
# Deliverable 2 of the main task: the submission file holding the predicted tip probabilities.
submission_B = pd.read_csv(filepath_or_buffer='Team_2_submission_B.csv')
# Show the first five rows as a quick visual check of the loaded columns.
submission_B.head(n=5)

Unnamed: 0.1,Unnamed: 0,order_id,tip
0,17,2196797,0.200934
1,34,1827621,0.902289
2,64,2461523,0.758839
3,156,2614670,0.25758
4,175,3110252,0.191941


In [6]:
def evaluate_A(submitted_df):
    """Run the formal checks for a True/False submission and simulate its scoring.

    The submission is compared against the module-level ``reference`` frame.
    Both frames are expected to have an ``order_id`` and a ``tip`` column.

    Parameters
    ----------
    submitted_df : pandas.DataFrame
        Submission with one ``tip`` prediction (True/False) per ``order_id``.

    Returns
    -------
    float
        Accuracy of the submitted labels over the orders that occur in both
        ``reference`` and ``submitted_df``.
    """
    print('Submission A:')
    print('Size of submission: ', len(submitted_df.index))
    print('All the subsequent checks should return 0 for a correct submission:')
    print('Missing orders:\t'+str((~reference.order_id.isin(submitted_df.order_id)).sum()))
    print('Superfluous orders:\t'+str((~submitted_df.order_id.isin(reference.order_id)).sum()))
    invalid_tip = submitted_df.tip.isna() | (~submitted_df.tip.isin([True, False]))
    print('Wrong values or NaNs:'+str(invalid_tip.sum()))
    print('')
    print('Here the accuracy computation is simulated: If this code throws errors, then probably the column with the tip predictions contains invalid entries.')
    print('If it shows an accuracy close to 0.5 everything seems to be fine.')
    # The inner join pairs every prediction with its label via order_id, so the
    # row order of the submission does not matter.
    join = pd.merge(reference, submitted_df, how='inner', on=['order_id'], suffixes=('_true','_pred'))
    accuracy = accuracy_score(join.tip_true, join.tip_pred)
    n_correct = accuracy_score(join.tip_true, join.tip_pred, normalize=False)
    # Accuracy is a fraction of the *joined* rows, so the denominator reported
    # next to it must be the join size; using the reference size would make the
    # two numbers disagree whenever orders are missing from the submission.
    print('Accuracy: ', accuracy, '\t =', n_correct,' of ', len(join.index),' samples are classified correctly.' )
    print('Confusion:')
    print(confusion_matrix(join.tip_true, join.tip_pred,normalize='true'))
    print('')

    return accuracy

In [7]:
def evaluate_B(submitted_df):
    """Run the formal checks for a probability submission and compute its log-loss.

    The submission is compared against the module-level ``reference`` frame.
    Both frames are expected to have an ``order_id`` and a ``tip`` column.

    Parameters
    ----------
    submitted_df : pandas.DataFrame
        Submission with one predicted tip probability (``tip``) per ``order_id``.

    Returns
    -------
    float
        Mean log-loss over the orders that occur in both ``reference`` and
        ``submitted_df``.
    """
    print('Submission B:')
    print('Size of submission: ', len(submitted_df.index))
    print('All the subsequent checks should return 0 for a correct submission:')
    print('Missing orders:\t'+str((~reference.order_id.isin(submitted_df.order_id)).sum()))
    print('Superfluous orders:\t'+str((~submitted_df.order_id.isin(reference.order_id)).sum()))
    out_of_range = submitted_df.tip.isna() | (submitted_df.tip > 1) | (submitted_df.tip < 0)
    print('Wrong values or NaNs:'+str(out_of_range.sum()))
    num_01 = (submitted_df.tip == 0).sum() + (submitted_df.tip == 1).sum()
    print('Wrong probabilities:\t'+str(num_01))
    print('Probability 0 or 1 is predicted '+ str(num_01) + ' times.')
    if num_01>0:
        print('It is dangerous to predict probability 0 or 1: If only one of such predictions is wrong, then your log-loss will be infinity.')
    print('')
    print('Here the log-loss is computed. If this code throws errors, then probably the column with the tip probabilities contains invalid entries.')
    join = pd.merge(reference, submitted_df, how='inner', on=['order_id'], suffixes=('_true','_pred'))
    # Use the merged columns so each probability is scored against the label of
    # the SAME order_id. Pairing `reference` and `submitted_df` by row position
    # gives a wrong loss as soon as the submission is ordered differently, and
    # fails outright when the two frames differ in length.
    join['ll'] = np.where(join['tip_true'].eq(True), -np.log(join['tip_pred']), -np.log(1 - join['tip_pred']))
    ll = join['ll'].mean()
    print('Log-loss: ', ll)
    print('')

    return ll

In [8]:
evaluate_A(submission_A)

Submission A:
Size of submission:  39419
All the subsequent checks should return 0 for a correct submission:
Missing orders:	0
Superfluous orders:	0
Wrong values or NaNs:0

Here the accuracy computation is simulated: If this code throws errors, then probably the column with the tip predictions contains invalid entries.
If it shows an accuracy close to 0.5 everything seems to be fine.
Accuracy:  0.5028539536771608 	 = 19822.0  of  39419  samples are classified correctly.
Confusion:
[[0.55615594 0.44384406]
 [0.55095071 0.44904929]]



0.5028539536771608

In [9]:
evaluate_B(submission_B)

Submission B:
Size of submission:  39419
All the subsequent checks should return 0 for a correct submission:
Missing orders:	0
Superfluous orders:	0
Wrong values or NaNs:0
Wrong probabilities:	0
Probability 0 or 1 is predicted 0 times.

Here the log-loss is computed. If this code throws errors, then probably the column with the tip probabilities contains invalid entries.
Log-loss:  0.872070631499428



0.872070631499428