## Decode variables

In [25]:
#import modules
import pandas as pd
import numpy as np

#import data
train_dummies = pd.read_csv('train_data.csv')
test_dummies = pd.read_csv('test_data.csv')
train_dummies.columns[:50]

Index(['destination_Home', 'destination_No Urgent Place', 'destination_Work',
       'passenger_Alone', 'passenger_Friend(s)', 'passenger_Kid(s)',
       'passenger_Partner', 'weather_Rainy', 'weather_Snowy', 'weather_Sunny',
       'temperature', 'time_10AM', 'time_10PM', 'time_2PM', 'time_6PM',
       'time_7AM', 'coupon_Bar', 'coupon_Carry out & Take away',
       'coupon_Coffee House', 'coupon_Restaurant(20-50)',
       'coupon_Restaurant(<20)', 'expiration_1d', 'expiration_2h',
       'gender_Female', 'gender_Male', 'age_21', 'age_26', 'age_31', 'age_36',
       'age_41', 'age_46', 'age_50plus', 'age_below21',
       'maritalStatus_Divorced', 'maritalStatus_Married partner',
       'maritalStatus_Single', 'maritalStatus_Unmarried partner',
       'maritalStatus_Widowed', 'has_children', 'education_Associates degree',
       'education_Bachelors degree',
       'education_Graduate degree (Masters or Doctorate)',
       'education_High School Graduate', 'education_Some High School',

In [26]:
train_dummies.columns[50:]

Index(['occupation_Computer & Mathematical',
       'occupation_Construction & Extraction',
       'occupation_Education&Training&Library',
       'occupation_Farming Fishing & Forestry',
       'occupation_Food Preparation & Serving Related',
       'occupation_Healthcare Practitioners & Technical',
       'occupation_Healthcare Support',
       'occupation_Installation Maintenance & Repair', 'occupation_Legal',
       'occupation_Life Physical Social Science', 'occupation_Management',
       'occupation_Office & Administrative Support',
       'occupation_Personal Care & Service',
       'occupation_Production Occupations', 'occupation_Protective Service',
       'occupation_Retired', 'occupation_Sales & Related',
       'occupation_Student', 'occupation_Transportation & Material Moving',
       'occupation_Unemployed', 'income_$100000 or More',
       'income_$12500 - $24999', 'income_$25000 - $37499',
       'income_$37500 - $49999', 'income_$50000 - $62499',
       'income_$62500 

In [27]:
#decode into categorical variables
def undummify(df, prefix_sep="_"):
    cols2collapse = {
        item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
    }
    series_list = []
    for col, needs_to_collapse in cols2collapse.items():
        if needs_to_collapse:
            undummified = (
                df.filter(like=col)
                .idxmax(axis=1)
                .apply(lambda x: x.split(prefix_sep, maxsplit=1)[1])
                .rename(col)
            )
            series_list.append(undummified)
        else:
            series_list.append(df[col])
    undummified_df = pd.concat(series_list, axis=1)
    return undummified_df

train_decoded = undummify(train_dummies)

test_decoded = undummify(test_dummies)

train_decoded.head()

Unnamed: 0,destination,passenger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,occupation,income,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon,direction,accept
0,No Urgent Place,Alone,Sunny,80,10AM,Coffee House,2h,Female,50plus,Married partner,...,Retired,$75000 - $87499,never,never,4~8,less1,1~3,GEQ5min,opp,0
1,No Urgent Place,Partner,Sunny,80,10AM,Restaurant(20-50),2h,Male,below21,Unmarried partner,...,Student,$12500 - $24999,less1,gt8,4~8,1~3,less1,GEQ5min,opp,0
2,Work,Alone,Snowy,30,7AM,Restaurant(20-50),1d,Male,36,Married partner,...,Management,$100000 or More,1~3,1~3,1~3,1~3,1~3,GEQ5min,opp,1
3,No Urgent Place,Friend(s),Sunny,55,2PM,Carry out & Take away,1d,Female,31,Married partner,...,Sales & Related,$12500 - $24999,less1,1~3,1~3,1~3,less1,GEQ5min,opp,1
4,Home,Alone,Snowy,30,10PM,Carry out & Take away,1d,Male,46,Single,...,Sales & Related,Less than $12500,1~3,1~3,4~8,4~8,4~8,GEQ5min,opp,1


## Re-Encode Variables

In [28]:
#times dictionary
times = {'10AM':10, '10PM': 22, '2PM': 14, '7AM': 7, '6PM': 18}

#expiration dictionary
expirations = {'2h': 2, '1d': 24}

#gender dictionary
genders = {'Female': 1, 'Male': 0}

#convert times and expirations
train_decoded.time = train_decoded['time'].map(times)
train_decoded.expiration = train_decoded.expiration.map(expirations)
train_decoded.gender = train_decoded.gender.map(genders)

test_decoded.time = test_decoded['time'].map(times)
test_decoded.expiration = test_decoded.expiration.map(expirations)
test_decoded.gender = test_decoded.gender.map(genders)

train_decoded.time.unique()

array([10,  7, 14, 22, 18])

In [29]:
train_decoded.expiration.unique()

array([ 2, 24])

In [30]:
#Split into predictors and target
y_train = train_decoded.accept
X_train = train_decoded.drop('accept', axis=1)

y_test = test_decoded.accept
X_test = test_decoded.drop('accept', axis=1)

X_train.head()

Unnamed: 0,destination,passenger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,education,occupation,income,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon,direction
0,No Urgent Place,Alone,Sunny,80,10,Coffee House,2,1,50plus,Married partner,...,High School Graduate,Retired,$75000 - $87499,never,never,4~8,less1,1~3,GEQ5min,opp
1,No Urgent Place,Partner,Sunny,80,10,Restaurant(20-50),2,0,below21,Unmarried partner,...,Some college - no degree,Student,$12500 - $24999,less1,gt8,4~8,1~3,less1,GEQ5min,opp
2,Work,Alone,Snowy,30,7,Restaurant(20-50),24,0,36,Married partner,...,Associates degree,Management,$100000 or More,1~3,1~3,1~3,1~3,1~3,GEQ5min,opp
3,No Urgent Place,Friend(s),Sunny,55,14,Carry out & Take away,24,1,31,Married partner,...,Bachelors degree,Sales & Related,$12500 - $24999,less1,1~3,1~3,1~3,less1,GEQ5min,opp
4,Home,Alone,Snowy,30,22,Carry out & Take away,24,0,46,Single,...,Some college - no degree,Sales & Related,Less than $12500,1~3,1~3,4~8,4~8,4~8,GEQ5min,opp


In [31]:
#!pip install --upgrade category_encoders

In [32]:
#import module
from category_encoders.cat_boost import CatBoostEncoder

#get list of features to convert
conversion_cols = ['destination', 'passenger', 'weather', 'Coupon', 'has', 'education', 'occupation', 'income', 'Bar', 'CoffeeHouse', 'CarryAway', 'RestrauntLessThan20', 'Restaraunt20To50', 'toCoupon', 'direction']
conversion_cols = X_train.columns

#define encoder
encoder = CatBoostEncoder()

#fit and transform data
cbe_encoder = encoder.fit(X_train[conversion_cols], y_train)
train_cbe = cbe_encoder.transform(X_train[conversion_cols])
test_cbe = cbe_encoder.transform(X_test[conversion_cols])

train_cbe.head()

Unnamed: 0,destination,passenger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,education,occupation,income,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon,direction
0,0.645675,0.520571,0.592084,80,10,0.497905,2,1,0.519939,0.549563,...,0.57815,0.388996,0.462872,0.590598,0.447421,0.5873,0.56168,0.582969,0.569418,0.571378
1,0.645675,0.588725,0.592084,80,10,0.458499,2,0,0.564733,0.52589,...,0.583001,0.583124,0.553061,0.591882,0.609782,0.5873,0.551401,0.558093,0.569418,0.571378
2,0.511615,0.520571,0.50834,30,7,0.458499,24,0,0.55046,0.549563,...,0.604904,0.565418,0.59389,0.636555,0.64358,0.578512,0.551401,0.582969,0.569418,0.571378
3,0.645675,0.68209,0.592084,55,14,0.733793,24,1,0.561031,0.549563,...,0.560836,0.563272,0.553061,0.591882,0.64358,0.578512,0.551401,0.558093,0.569418,0.571378
4,0.477175,0.520571,0.50834,30,22,0.733793,24,0,0.555625,0.611811,...,0.583001,0.563272,0.606135,0.636555,0.64358,0.5873,0.575955,0.738535,0.569418,0.571378


In [33]:
train_cbe.destination.max()

0.6456750514784412

In [34]:
train_cbe.to_csv('train_cbe.csv')

test_cbe.to_csv('test_cbe.csv')

## Model Creation

In [None]:
#import module
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
rf = RandomForestClassifier()

rf.fit(X_train, y_train)

roc_auc_score(y_test, rf.predict(X_test))

ValueError: could not convert string to float: 'No Urgent Place'