Importing the required libraries

In [4]:
import numpy as np
import pandas as pd
import sklearn


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
training = pd.read_csv('/content/drive/MyDrive/SYNERGEN Exercise_Train.csv')

Data Exploration

In [7]:
training.tail(3)

Unnamed: 0,UNIQUE_ID,LIST_OF_INGREDIENTS,PREPARATION_METHOD,MANUFACTURED_DATE,MANUFACTURED_LOCATION,QUANTITY,APPETIZING_COLOR,ATTRACTIVE_PACKAGING,SUBMISSION_DATE,QUALITY_ASSURANCE_ENTITY,RESPONSE
182,NopCApkw4nCCwQw,X6,D10,2017-02-02,Y1,50,0,1,2017-02-03,Z1,0
183,hntvWJULUiJSybI,"X1, X2, X3, X4, X5","D2, D1, D3",2017-02-05,Y3,150,0,1,2017-02-06,Z1,0
184,w7WNrNenF4h9C5,"X1, X2, X3, X4, X5","D1, D2, D3",2017-12-31,Y1,50,1,1,2018-01-01,Z1,0


In [8]:
training.columns

Index(['UNIQUE_ID', 'LIST_OF_INGREDIENTS', 'PREPARATION_METHOD',
       'MANUFACTURED_DATE', 'MANUFACTURED_LOCATION', 'QUANTITY',
       'APPETIZING_COLOR', 'ATTRACTIVE_PACKAGING', 'SUBMISSION_DATE',
       'QUALITY_ASSURANCE_ENTITY', 'RESPONSE'],
      dtype='object')

We use tabular data in Machine Learning (usually, load data as CSV files.) If we take a single cell from the 'LIST OF INGREDIENTS' and PREPARATION METHOD, each cell represents a list of properties. As a result, we must process and create colours to represent those properties.

In [9]:
#identifying distinctive properties in those two columns

def find_unique_items(series):
    unique = set()
    for elements in series.iteritems():
        ingre = elements[1]
        for element in ingre.split(','):
            unique.add(element.strip())
    return unique

In [10]:
print('Unique values in LIST_OF_INGREDIENTS: {}'.format(find_unique_items(training['LIST_OF_INGREDIENTS'])))
print('Unique values in PREPARATION_METHOD: {}'.format(find_unique_items(training['PREPARATION_METHOD'])))

Unique values in LIST_OF_INGREDIENTS: {'X8', 'X1', 'X6', 'X2', 'X3', 'X9', 'X10', 'X5', 'X7', 'X4'}
Unique values in PREPARATION_METHOD: {'D10', 'D8', 'D3', 'D5', 'D15', 'D2', 'D11', 'D4', 'D1'}


In [11]:
# Next, those list of properties in LIST_OF_INGREDIENTS and PREPARATION_METHOD
# replaces with a set of columns
def ingregient_extractor(x, ingredient):
    for element in x.split(','):
        y = element.strip()
        if y == ingredient:
            return 1
    return 0

In [12]:
training['X_1'] = training['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X1',))
training['X_2'] = training['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X2',))
training['X_3'] = training['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X3',))
training['X_4'] = training['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X4',))
training['X_5'] = training['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X5',))
training['X_6'] = training['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X6',))
training['X_7'] = training['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X7',))
training['X_8'] = training['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X8',))
training['X_9'] = training['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X9',))
training['X_10'] = training['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X10',))

training['D_1'] = training['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D1',)) 
training['D_2'] = training['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D2',)) 
training['D_3'] = training['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D3',)) 
training['D_4'] = training['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D4',)) 
training['D_5'] = training['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D5',)) 

training['D_8'] = training['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D8',)) 
training['D_10'] = training['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D10',)) 
training['D_11'] = training['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D11',)) 
training['D_15'] = training['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D15',)) 

Data Cleaning

In [13]:
# finally, we drop LIST_OF_INGREDIENTS and PREPARATION_METHOD
del training['LIST_OF_INGREDIENTS']
del training['PREPARATION_METHOD']

In [14]:
training.groupby('RESPONSE').count()['UNIQUE_ID']

RESPONSE
0    90
1    95
Name: UNIQUE_ID, dtype: int64

Encoding categorical data create our training features

In [15]:
training_features = pd.concat([training[['X_1', 'X_2', 'X_3', 'X_4', 'X_5', 
                                         'X_6', 'X_7', 'X_8', 'X_9', 'X_10', 
                                         'D_1', 'D_2', 'D_3', 'D_4', 'D_5', 
                                         'D_8', 'D_10', 'D_11', 'D_15',
                                         'QUANTITY', 'ATTRACTIVE_PACKAGING']], 
                               pd.get_dummies(training[['MANUFACTURED_LOCATION']]),
                               pd.get_dummies(training[['QUALITY_ASSURANCE_ENTITY']])], 
                               axis=1)

Using Logistic Regression and Random Forest for building our model and hypertuning 

In [16]:
%%time 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import scale

X_train = training_features.values
y_train = training['RESPONSE'].values

folds = KFold(n_splits=3, shuffle=True)
cv_accuracies = []
for trining_idx, testing_idx in folds.split(X_train):
    X_train_cv = X_train[trining_idx]
    y_train_cv = y_train[trining_idx]
    
    X_test_cv = X_train[testing_idx]
    y_test_cv = y_train[testing_idx]
    
    logistic_regression = LogisticRegression()
    logistic_regression.fit(scale(X_train_cv), y_train_cv)
    y_predict_cv = logistic_regression.predict(scale(X_test_cv))
    current_accuracy = accuracy_score(y_test_cv, y_predict_cv)
    cv_accuracies.append(current_accuracy)
    print('cross validation accuracy: {}'.format(current_accuracy))
    
print( '---------------------------------------')
print( 'average corss validation accuracy: %f' %(sum(cv_accuracies)/len(cv_accuracies)))

cross validation accuracy: 0.8870967741935484
cross validation accuracy: 0.6774193548387096
cross validation accuracy: 0.7049180327868853
---------------------------------------
average corss validation accuracy: 0.756478
CPU times: user 122 ms, sys: 21.1 ms, total: 143 ms
Wall time: 389 ms


In [17]:
%%time 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

X_train = training_features.values
y_train = training['RESPONSE'].values

folds = KFold(n_splits=3, shuffle=True)
cv_accuracies = []
for trining_idx, testing_idx in folds.split(X_train):
    X_train_cv = X_train[trining_idx]
    y_train_cv = y_train[trining_idx]
    
    X_test_cv = X_train[testing_idx]
    y_test_cv = y_train[testing_idx]
    
    random_forest = RandomForestClassifier(n_estimators = 100)
    random_forest.fit(scale(X_train_cv), y_train_cv)
    y_predict_cv = random_forest.predict(scale(X_test_cv))
    current_accuracy = accuracy_score(y_test_cv, y_predict_cv)
    cv_accuracies.append(current_accuracy)
    print( 'cross validation accuracy: %f' %(current_accuracy))

    
print('---------------------------------------')
print('average corss validation accuracy: %f' %(sum(cv_accuracies)/len(cv_accuracies))) 
print( '---------------------------------------\n')

cross validation accuracy: 0.822581
cross validation accuracy: 0.790323
cross validation accuracy: 0.770492
---------------------------------------
average corss validation accuracy: 0.794465
---------------------------------------

CPU times: user 825 ms, sys: 11.4 ms, total: 836 ms
Wall time: 1.5 s


Going ahead with random forest because it works better 

In [18]:
X_train = training_features.values
y_train = training['RESPONSE'].values
random_forest = RandomForestClassifier(n_estimators = 100)
random_forest.fit(scale(X_train_cv), y_train_cv)

RandomForestClassifier()

In [20]:
#Cleaning the testing dataset
# Next, we clean the testing dataset
testing = pd.read_csv('/content/drive/MyDrive/SYNERGEN Exercise_Prediction.csv')

testing['X_1'] = testing['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X1',))
testing['X_2'] = testing['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X2',))
testing['X_3'] = testing['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X3',))
testing['X_4'] = testing['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X4',))
testing['X_5'] = testing['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X5',))
testing['X_6'] = testing['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X6',))
testing['X_7'] = testing['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X7',))
testing['X_8'] = testing['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X8',))
testing['X_9'] = testing['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X9',))
testing['X_10'] = testing['LIST_OF_INGREDIENTS'].apply(ingregient_extractor, args=('X10',))


testing['D_1'] = testing['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D1',)) 
testing['D_2'] = testing['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D2',)) 
testing['D_3'] = testing['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D3',)) 
testing['D_4'] = testing['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D4',)) 
testing['D_5'] = testing['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D5',)) 

testing['D_8'] = testing['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D8',)) 
testing['D_10'] = testing['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D10',)) 
testing['D_11'] = testing['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D11',)) 
testing['D_15'] = testing['PREPARATION_METHOD'].apply(ingregient_extractor, args=('D15',)) 

del testing['LIST_OF_INGREDIENTS']
del testing['PREPARATION_METHOD']

testing_features = pd.concat([testing[['X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 'X_7', 'X_8', 'X_9', 'X_10', 
                                         'D_1', 'D_2', 'D_3', 'D_4', 'D_5', 'D_8', 'D_10', 'D_11', 'D_15',
                                         'QUANTITY', 'ATTRACTIVE_PACKAGING']], 
                                         pd.get_dummies(testing[['MANUFACTURED_LOCATION']]),
                                         pd.get_dummies(testing[['QUALITY_ASSURANCE_ENTITY']])], axis=1)

In [21]:
X_test = testing_features.values
output = random_forest.predict(X_test)
unique_indices = training['UNIQUE_ID'].values
for i, j in zip(output, unique_indices):
    print('index: {} prediction: {}'.format(j, i))

index: monUvMmr95OP05e prediction: 0
index: 1xbRcd2JbUuZ0IK prediction: 1
index: 8FMJ6YMJYbTC4yp prediction: 0
index: fuovowqPpCHv3W9 prediction: 0
index: monUvMmr95OP05e prediction: 0
index: monUvMmr95OP05e prediction: 1
index: monUvMmr95OP05e prediction: 1
index: monUvMmr95OP05e prediction: 0
index: monUvMmr95OP05e prediction: 1
index: monUvMmr95OP05e prediction: 0
index: winhsDL92bKQS4x prediction: 1
index: xWc5x0sOguKgkJa prediction: 1
index: 8jE26hwkyWzOOpV prediction: 0
index: j1KqeiH7KrzuW9N prediction: 1
index: lBLYpZi7P5Fbs1N prediction: 0
index: tossSzrrzX43iqu prediction: 0
index: ZIIvtO5hcTg8Tg4 prediction: 1
index: w7WNrNenF4h9C5 prediction: 1
index: z8lmAhwtP3ehr63 prediction: 1
index: QibP7kHXVqO8Ve7 prediction: 0
index: SuNat7oTPLjWsXD prediction: 1
index: 9KXp0XrXblr7bxy prediction: 0
index: w7WNrNenF4h9C5 prediction: 0
index: b8HQmdEN4W8VMfj prediction: 0
index: Cb1ZysE3Vb0BmRc prediction: 1
index: jbsQ6vrRg4FK2ea prediction: 1
index: RcECLtfRYf0pIvi prediction: 0
ind

Key points:

1. Learning from previous data is central to machine learning. The lack of data available for building the model greatly aids in generalising the model. So having more data helps to make better predictions. Unfortunately, this dataset only contains 200 data points. As a result, building a generalise model with such a small dataset is impossible.

2. Even this such a small dataset and relatively simple algorithm, we managed to achieve just above 80 percent accuracy