In [1]:
from sklearn.preprocessing import LabelEncoder
import joblib
import pandas as pd

def encodeCategoricalColumns(df, categories):
    """Label-encode the listed columns of ``df`` in place.

    A separate ``LabelEncoder`` is fitted for every column name in
    ``categories`` and the column is overwritten with its integer codes.

    Args:
        df: DataFrame to modify; the listed columns are replaced.
        categories: Iterable of column names to encode.

    Returns:
        dict mapping each column name to the encoder fitted on that column,
        so the same mapping can be re-applied or inverted later.
    """
    fitted = {}
    for name in categories:
        encoder = LabelEncoder()
        codes = encoder.fit_transform(df[name])
        df[name] = codes
        fitted[name] = encoder

    return fitted

def reEncodeCategoricalColumns(df, categories, encoders):
    """Apply already-fitted encoders to the listed columns of ``df`` in place.

    Args:
        df: DataFrame to modify; each listed column is overwritten.
        categories: Iterable of column names to encode.
        encoders: Mapping of column name -> object exposing ``transform``.

    Returns:
        None. ``df`` is mutated.
    """
    for name in categories:
        encoder = encoders[name]
        raw_values = df[name]
        df[name] = encoder.transform(raw_values)

def decodeCategoricalColumns(df, categories, encoders):
    """Turn encoded columns of ``df`` back into their original labels in place.

    For every listed column the column name and the encoder's known classes
    are printed, then the column is overwritten with the decoded labels.

    Args:
        df: DataFrame to modify; each listed column is overwritten.
        categories: Iterable of column names to decode.
        encoders: Mapping of column name -> object exposing ``classes_`` and
            ``inverse_transform``.

    Returns:
        None. ``df`` is mutated.
    """
    for name in categories:
        column = df[name]
        encoder = encoders[name]
        print(column.name, encoder.classes_)
        df[name] = encoder.inverse_transform(column)
        
def printEncoders(encoders):
    """Print each encoded column name followed by its encoder's classes.

    Args:
        encoders: Mapping of column name -> object exposing ``classes_``.
    """
    for name, encoder in encoders.items():
        known_classes = encoder.classes_
        print(name, known_classes)

In [4]:
def generateClfModel(target_col, timer, csv_path='./bike-buyer_clf.csv', output_dir='.'):
    """Train a TPOT classifier on a CSV file and persist the resulting artifacts.

    Every object-dtype column (including the target, if it is text) is
    label-encoded, 10% of the rows are held out for scoring, and TPOT searches
    for a pipeline within the given time budget. The fitted pipeline and the
    per-column encoders are written to ``output_dir`` as ``pipelineClf.pkl``
    and ``encodersClf.pkl``.

    Args:
        target_col: Name of the column to predict.
        timer: Time budget in minutes, passed to TPOT as ``max_time_mins``.
        csv_path: Training CSV to read. Defaults to the original hard-coded file.
        output_dir: Directory that receives the two ``.pkl`` files. Defaults to
            the current directory, matching the original behaviour.

    Returns:
        The score of the best pipeline on the held-out rows, as returned by
        ``tpot.score``.

    Raises:
        KeyError: If ``target_col`` is not a column of the CSV.
    """
    import os
    import pandas as pd
    from tpot import TPOTClassifier
    from sklearn.model_selection import train_test_split

    df = pd.read_csv(csv_path)

    # Encode text columns in place and keep the encoders so that prediction
    # code can apply exactly the same label -> integer mapping.
    categories = df.select_dtypes(include='object').columns
    encoders = encodeCategoricalColumns(df, categories)

    X = df.loc[:, df.columns != target_col].values
    Y = df.loc[:, target_col].values

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=0)

    print(f'Running for {timer} mins')
    tpot = TPOTClassifier(generations=5, population_size=50, verbosity=10, max_time_mins=timer)
    tpot.fit(X_train, Y_train)

    # This is a classifier score, not R^2, so label it accordingly.
    # NOTE(review): TPOTClassifier's default scoring is accuracy per the TPOT docs.
    acc = tpot.score(X_test, Y_test)
    print('################\n Accuracy: ', acc, '\n################')

    joblib.dump(tpot.fitted_pipeline_, os.path.join(output_dir, 'pipelineClf.pkl'))
    joblib.dump(encoders, os.path.join(output_dir, 'encodersClf.pkl'))

    return acc

def predict_csv_clf(folderName, target_col):
    """Predict ``target_col`` for ``./datasets/<folderName>/test.csv``.

    Loads ``pipelineClf.pkl`` and ``encodersClf.pkl`` from the dataset folder,
    encodes the text columns of ``test.csv`` with the saved encoders, predicts,
    and writes ``test_predicted.csv`` (the input plus a
    ``'<target_col> (Predicted)'`` column) into the same folder.

    Args:
        folderName: Sub-folder of ``./datasets`` holding the artifacts and
            ``test.csv``.
        target_col: Name of the predicted column. If an encoder exists for it,
            predictions are mapped back to the original labels.

    Returns:
        False if either saved artifact is missing, True after the predictions
        file has been written.
    """
    import pandas as pd

    # NOTE(security): joblib.load unpickles arbitrary objects, and folderName is
    # interpolated straight into the path. Only load artifacts this project
    # wrote itself and do not pass untrusted input as folderName.
    try:
        pipeline = joblib.load(f'./datasets/{folderName}/pipelineClf.pkl')
        encoders = joblib.load(f'./datasets/{folderName}/encodersClf.pkl')
    except FileNotFoundError:
        return False

    df = pd.read_csv(f'./datasets/{folderName}/test.csv')

    # The input is expected not to contain the target column. If it does, drop
    # it from the feature matrix so it is not passed to the pipeline as an
    # extra feature. drop() returns a new frame, so df keeps its raw values.
    X = df.drop(columns=[target_col], errors='ignore')
    categories = X.select_dtypes(include='object').columns
    reEncodeCategoricalColumns(X, categories, encoders)

    pred = pipeline.predict(X.values)
    if target_col in encoders:
        # Map integer class ids back to the original labels.
        pred = encoders[target_col].inverse_transform(pred)

    df[f'{target_col} (Predicted)'] = pd.Series(pred, index=df.index)
    df.to_csv(f'./datasets/{folderName}/test_predicted.csv')

    return True

In [5]:
# Train for the 'Bike Buyer' target with a time budget of 20/60 minutes (20 seconds).
generateClfModel('Bike Buyer', 20/60)

Running for 0.3333333333333333 mins
31 operators have been imported by TPOT.
                                                                                                                       
0.33 minutes have elapsed. TPOT will close down.                                                                       
TPOT closed during evaluation in one generation.
                                                                                                                       
                                                                                                                       
TPOT closed prematurely. Will use the current best pipeline.
################                                                                                                       
 R2 Score:  0.9117221418234442 
################


0.9117221418234442

In [6]:
# Reload the pipeline and encoders that generateClfModel wrote to the working directory.
pipeline = joblib.load('./pipelineClf.pkl')
encoders = joblib.load('./encodersClf.pkl')

In [7]:
# Show, per encoded column, the class labels its encoder knows.
printEncoders(encoders)

Marital Status ['Married' 'Single']
Gender ['Female' 'Male']
Education ['Bachelors' 'Graduate Degree' 'High School' 'Partial College'
 'Partial High School']
Occupation ['Clerical' 'Management' 'Manual' 'Professional' 'Skilled Manual']
Home Owner ['No' 'Yes']
Region ['Europe' 'North America' 'Pacific']
Bike Buyer ['No' 'Yes']


In [41]:
# Load the rows to predict on; this file must not contain the target column.
df2 = pd.read_csv('bike-buyer_clf_test.csv')
df2.head()

Unnamed: 0,ID,Marital Status,Gender,Yearly Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age
0,22774,Married,Male,60000,1,Partial College,Professional,Yes,2,5,North America,56
1,17517,Single,Male,80000,0,Bachelors,Management,Yes,1,7,North America,33
2,11941,Single,Male,60000,0,Partial College,Skilled Manual,Yes,2,1,North America,29
3,19776,Married,Female,40000,0,High School,Skilled Manual,Yes,2,6,North America,31
4,27295,Single,Male,70000,0,Graduate Degree,Professional,Yes,0,7,North America,40


In [42]:
# Object-dtype (text) columns are the ones that have to be label-encoded.
categories = df2.select_dtypes(include='object').columns
categories

Index(['Marital Status', 'Gender', 'Education', 'Occupation', 'Home Owner',
       'Region'],
      dtype='object')

In [43]:
# Work on a copy so df2 keeps the original, human-readable values.
X = df2.copy()
X.head()

Unnamed: 0,ID,Marital Status,Gender,Yearly Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age
0,22774,Married,Male,60000,1,Partial College,Professional,Yes,2,5,North America,56
1,17517,Single,Male,80000,0,Bachelors,Management,Yes,1,7,North America,33
2,11941,Single,Male,60000,0,Partial College,Skilled Manual,Yes,2,1,North America,29
3,19776,Married,Female,40000,0,High School,Skilled Manual,Yes,2,6,North America,31
4,27295,Single,Male,70000,0,Graduate Degree,Professional,Yes,0,7,North America,40


In [44]:
# View of df2 before the copy X is encoded.
df2.head()

Unnamed: 0,ID,Marital Status,Gender,Yearly Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age
0,22774,Married,Male,60000,1,Partial College,Professional,Yes,2,5,North America,56
1,17517,Single,Male,80000,0,Bachelors,Management,Yes,1,7,North America,33
2,11941,Single,Male,60000,0,Partial College,Skilled Manual,Yes,2,1,North America,29
3,19776,Married,Female,40000,0,High School,Skilled Manual,Yes,2,6,North America,31
4,27295,Single,Male,70000,0,Graduate Degree,Professional,Yes,0,7,North America,40


In [45]:
# Encode the text columns of X in place with the loaded encoders (no return value to display).
reEncodeCategoricalColumns(X, categories, encoders)

In [46]:
# df2 still holds the original strings; only the copy X was encoded.
df2.head()

Unnamed: 0,ID,Marital Status,Gender,Yearly Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age
0,22774,Married,Male,60000,1,Partial College,Professional,Yes,2,5,North America,56
1,17517,Single,Male,80000,0,Bachelors,Management,Yes,1,7,North America,33
2,11941,Single,Male,60000,0,Partial College,Skilled Manual,Yes,2,1,North America,29
3,19776,Married,Female,40000,0,High School,Skilled Manual,Yes,2,6,North America,31
4,27295,Single,Male,70000,0,Graduate Degree,Professional,Yes,0,7,North America,40


In [47]:
# Column names of the prediction input.
df2.columns

Index(['ID', 'Marital Status', 'Gender', 'Yearly Income', 'Children',
       'Education', 'Occupation', 'Home Owner', 'Cars', 'Commute Distance',
       'Region', 'Age'],
      dtype='object')

In [48]:
# (rows, columns) of the prediction input.
df2.shape

(100, 12)

In [50]:
# Predict integer class labels for every row of the encoded copy X.
pipeline.predict(X)

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [51]:
# df2 is unchanged by the prediction call.
df2.head()

Unnamed: 0,ID,Marital Status,Gender,Yearly Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age
0,22774,Married,Male,60000,1,Partial College,Professional,Yes,2,5,North America,56
1,17517,Single,Male,80000,0,Bachelors,Management,Yes,1,7,North America,33
2,11941,Single,Male,60000,0,Partial College,Skilled Manual,Yes,2,1,North America,29
3,19776,Married,Female,40000,0,High School,Skilled Manual,Yes,2,6,North America,31
4,27295,Single,Male,70000,0,Graduate Degree,Professional,Yes,0,7,North America,40


In [25]:
# Predict on the encoded copy X: df2 still holds the raw string columns, which
# were never encoded. Then map the integer labels back to the original
# 'Bike Buyer' classes.
pred = encoders['Bike Buyer'].inverse_transform(pipeline.predict(X))

# Attach the decoded predictions to the readable frame and save it.
df2['Target (Predicted)'] = pd.Series(pred, index = df2.index)
df2.to_csv('./test_predicted.csv')