In [46]:
from sklearn.preprocessing import LabelEncoder
import joblib
import pandas as pd

def encodeCategoricalColumns(df, categories):
    d = {}
    for col in categories:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        d[col] = le
        
    return d

def reEncodeCategoricalColumns(df, categories, encoders):
    for col in categories:
        df[col] = encoders[col].transform(df[col])

def decodeCategoricalColumns(df, categories, encoders):
    for col in categories:
        print(df[col].name, encoders[col].classes_)
        df[col] = encoders[col].inverse_transform(df[col])
        
def printEncoders(encoders):
    for key, val in encoders.items():
        print(key, val.classes_) 

In [47]:
def generateClfModel(target_col, timer):
    import pandas as pd
    import os, sys
    from tpot import TPOTClassifier
    from sklearn.preprocessing import LabelEncoder
    from sklearn.model_selection import train_test_split

    df = pd.read_csv('./bike-buyer_clf.csv')

    categories = df.select_dtypes(include='object').columns
    
    encoders = encodeCategoricalColumns(df, categories)
    X = df.loc[:, df.columns != target_col].values
    Y = df.loc[:, [target_col]].values

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, random_state = 0)

    print(f'Running for {timer} mins')
    tpot = TPOTClassifier(generations=5, population_size=50, verbosity=10, max_time_mins = timer)
    tpot.fit(X_train, Y_train)

    acc = tpot.score(X_test, Y_test)
    print('################\n R2 Score: ', acc, '\n################')
    joblib.dump(tpot.fitted_pipeline_, './pipelineClf.pkl')
    joblib.dump(encoders, './encodersClf.pkl')

    return acc

In [48]:
generateClfModel('Bike Buyer', 20/60)

Running for 0.3333333333333333 mins
31 operators have been imported by TPOT.


  y = column_or_1d(y, warn=True)


                                                                                                                       
0.35 minutes have elapsed. TPOT will close down.                                                                       
TPOT closed during evaluation in one generation.
                                                                                                                       
                                                                                                                       
TPOT closed prematurely. Will use the current best pipeline.
################                                                                                                       
 R2 Score:  0.8958031837916064 
################


  y = column_or_1d(y, warn=True)


0.8958031837916064

In [49]:
pipeline = joblib.load('./pipelineClf.pkl')
encoders = joblib.load('./encodersClf.pkl')

In [50]:
printEncoders(encoders)

Marital Status ['Married' 'Single']
Gender ['Female' 'Male']
Education ['Bachelors' 'Graduate Degree' 'High School' 'Partial College'
 'Partial High School']
Occupation ['Clerical' 'Management' 'Manual' 'Professional' 'Skilled Manual']
Home Owner ['No' 'Yes']
Region ['Europe' 'North America' 'Pacific']
Bike Buyer ['No' 'Yes']


In [54]:
# dataset must not have the predictive column
df2 = pd.read_csv('bike-buyer_clf_test.csv')
df2.head()

Unnamed: 0,ID,Marital Status,Gender,Yearly Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Bike Buyer
0,22774.0,Married,Male,60000,1.0,Partial College,Professional,Yes,2,5.0,North America,56,No
1,17517.0,Single,Male,80000,0.0,Bachelors,Management,Yes,1,7.0,North America,33,No
2,11941.0,Single,Male,60000,0.0,Partial College,Skilled Manual,Yes,2,1.0,North America,29,No
3,19776.0,Married,Female,40000,0.0,High School,Skilled Manual,Yes,2,6.0,North America,31,No
4,27295.0,Single,Male,70000,0.0,Graduate Degree,Professional,Yes,0,7.0,North America,40,Yes


In [55]:
categories = df2.select_dtypes(include='object').columns
categories

Index(['Marital Status', 'Gender', 'Education', 'Occupation', 'Home Owner',
       'Region', 'Bike Buyer'],
      dtype='object')

In [56]:
reEncodeCategoricalColumns(df2, categories, encoders)

In [72]:
df2.columns

Index(['ID', 'Marital Status', 'Gender', 'Yearly Income', 'Children',
       'Education', 'Occupation', 'Home Owner', 'Cars', 'Commute Distance',
       'Region', 'Age', 'Bike Buyer'],
      dtype='object')

In [74]:
df2.shape

(100, 13)

In [75]:
X.shape

(100, 12)

In [76]:
pipeline.predict(X)

array([0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [79]:
df2.head()

Unnamed: 0,ID,Marital Status,Gender,Yearly Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Bike Buyer,Target (Predicted)
0,22774.0,0,1,60000,1.0,3,3,1,2,5.0,1,56,0,No
1,17517.0,1,1,80000,0.0,0,1,1,1,7.0,1,33,0,No
2,11941.0,1,1,60000,0.0,3,4,1,2,1.0,1,29,0,No
3,19776.0,0,0,40000,0.0,2,4,1,2,6.0,1,31,0,No
4,27295.0,1,1,70000,0.0,1,3,1,0,7.0,1,40,1,Yes


In [78]:
#print('DF: ',df.shape,'\tX: ',X.shape)

pred = encoders['Bike Buyer'].inverse_transform(pipeline.predict(X))

df2['Target (Predicted)'] = pd.Series(pred, index = df2.index)
df2.to_csv('./test_predicted.csv')

In [21]:
predict_csv_clf('Bike Buyer', 20/60)



ValueError: y contains previously unseen labels: ['Married' 'Single']