In [116]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.pipeline import make_pipeline
import eli5
from eli5.sklearn import PermutationImportance

In [46]:


df = pd.read_csv('https://raw.githubusercontent.com/fivethirtyeight/data/master/thanksgiving-2015/thanksgiving-2015-poll-data.csv')



df.columns = ['ID', 'Celebrate', 'Main_Dish', 'Main_Dish_Other', 'Main_Dish_Cooked', 'Main_Dish_Cooked_Other', 'Stuffing',
              'Stuffing_Other', 'Cranberry_Sauce', 'Cranberry_Sauce_Other', 'Gravy', 'Brussel_Sprouts', 'Carrots', 'Cauliflower',
              'Corn', 'Cornbread', 'Fruit_Salad', 'Green_Beans', 'Mac_and_Cheese', 'Mashed_Potatoes', 'Rolls_Biscuits', 'Squash',
              'Salad', 'Sweet_Potatoes', 'Side_Dish_Other1', 'Side_Dish_Other2', 'Apple_Pie', 'Buttermilk_Pie', 'Cherry_Pie', 
              'Chocolate_Pie', 'Coconut_Cream_Pie', 'Key_Lime_Pie', 'Peach_Pie', 'Pecan_Pie', 'Pumpkin_Pie', 'Sweet_Potato_Pie', 
              'No_Pie', 'Other_Pie1', 'Other_Pie2', 'Apple_Cobbler', 'Blondies', 'Brownies', 'Carrot_Cake', 'Cheesecake', 'Cookies',
              'Fudge', 'Ice_Cream', 'Peach_Cobbler', 'No_Dessert', 'Other_Dessert1', 'Other_Dessert2', 'Prayer', 'Travel_Distance', 'Parade',
              'Kids_Table_Age', 'Old_Friends', 'Friendsgiving', 'Black_Friday_Shopper', 'Retail_Worker', 'Black_Friday_Worker', 'Neighborhood_Type',
              'Age', 'Gender', 'Household_Earnings', 'US_Region']

In [37]:
side_dishes = ['Stuffing',
              'Stuffing_Other', 'Cranberry_Sauce', 'Cranberry_Sauce_Other', 'Gravy', 'Brussel_Sprouts', 'Carrots', 'Cauliflower',
              'Corn', 'Cornbread', 'Fruit_Salad', 'Green_Beans', 'Mac_and_Cheese', 'Mashed_Potatoes', 'Rolls_Biscuits', 'Squash',
              'Salad', 'Sweet_Potatoes', 'Side_Dish_Other1', 'Side_Dish_Other2', 'Apple_Pie', 'Buttermilk_Pie', 'Cherry_Pie', 
              'Chocolate_Pie', 'Coconut_Cream_Pie', 'Key_Lime_Pie', 'Peach_Pie', 'Pecan_Pie', 'Pumpkin_Pie', 'Sweet_Potato_Pie', 
              'No_Pie', 'Other_Pie1', 'Other_Pie2', 'Apple_Cobbler', 'Blondies', 'Brownies', 'Carrot_Cake', 'Cheesecake', 'Cookies',
              'Fudge', 'Ice_Cream', 'Peach_Cobbler', 'No_Dessert', 'Other_Dessert1', 'Other_Dessert2', 'Casserole', 'Meat_Pie', 'Chess_Pie']

vegetables = ['Brussel_Sprouts', 'Carrots', 'Cauliflower', 'Corn', 'Green_Beans',
              'Squash', 'Salad']
starches = ['Stuffing', 'Cornbread', 'Mac_and_Cheese', 'Mashed_Potatoes', 'Rolls_Biscuits',
            'Sweet_Potatoes']
sweet_sides = ['Cranberry_Sauce', 'Fruit_Salad']
desserts = ['Apple_Pie', 'Buttermilk_Pie', 'Cherry_Pie', 
              'Chocolate_Pie', 'Coconut_Cream_Pie', 'Key_Lime_Pie', 'Peach_Pie', 
               'Pecan_Pie', 'Pumpkin_Pie', 'Sweet_Potato_Pie',
            'Apple_Cobbler', 
               'Blondies', 'Brownies', 'Carrot_Cake', 'Cheesecake', 'Cookies',
              'Fudge', 'Ice_Cream', 'Peach_Cobbler', 'Chess_Pie']
pies = ['Apple_Pie', 'Buttermilk_Pie', 'Cherry_Pie', 
              'Chocolate_Pie', 'Coconut_Cream_Pie', 'Key_Lime_Pie', 'Peach_Pie', 
               'Pecan_Pie', 'Pumpkin_Pie', 'Sweet_Potato_Pie', 'Chess_Pie', 'Meat_Pie']
food = ['Stuffing', 'Cranberry_Sauce', 'Gravy', 'Brussel_Sprouts', 'Carrots', 'Cauliflower',
       'Corn', 'Cornbread', 'Fruit_Salad', 'Green_Beans', 'Mac_and_Cheese',
       'Mashed_Potatoes', 'Rolls_Biscuits', 'Squash', 'Salad',
       'Sweet_Potatoes', 'Apple_Pie', 'Buttermilk_Pie', 'Cherry_Pie',
       'Chocolate_Pie', 'Coconut_Cream_Pie', 'Key_Lime_Pie', 'Peach_Pie',
       'Pecan_Pie', 'Pumpkin_Pie', 'Sweet_Potato_Pie',
       'Apple_Cobbler', 'Blondies', 'Brownies', 'Carrot_Cake', 'Cheesecake',
       'Cookies', 'Fudge', 'Ice_Cream', 'Peach_Cobbler']

In [51]:
#create regional groups

northeast = ['Squash', 'Apple_Pie', 'Homemade_Cranberry_Sauce', 'Cauliflower', 'Corn', 'Brussel_Sprouts', 'Roast_Turkey']
southern = ['Mac_and_Cheese', 'Cornbread', 'Sweet_Potato_Pie', 'Pecan_Pie', 'Fried_Turkey', 'Sweet_Potatoes',
           'Cornbread_Stuffing', 'Chess_Pie', 'Peach_Pie', 'Key_Lime_Pie', 'Canned_Cranberry_Sauce']
midwest_and_plains = ['Cherry_Pie', 'Pumpkin_Pie', 'Rolls_Biscuits', 'Mashed_Potatoes', 'Rice', 'Rice_Stuffing',
                      'Green_Beans', 'Casserole']
west = ['Fruit_Salad', 'Salad', 'Brussel_Sprouts', 'Apple_Pie', 'Cherry_Pie', 'Canned_Cranberry_Sauce']
hawaii = ['Smoked_Turkey', 'Rolls_Biscuits', 'Salad', 'Rice_Stuffing']

In [91]:
def wrangle(df):
    #remove any rows where they do not celebrate
    df = df.drop(df[df.Celebrate == 'No'].index)
    df = df.drop('Celebrate', axis = 1)
    #remove earnings, is creating a map of earning potential by region rather than anything to do with food
    df = df.drop('Household_Earnings', axis=1)
    
    #many columns formatted so that a column has the name of the dish if true and NaN if false
#so replace all the NaN with No and all values with Yes
#also for Parade column
    for column in df.columns:
        if 'Other' in column:
            df[column] = df[column].fillna('No')
        elif (column in side_dishes) & (column != 'Cranberry_Sauce') & (column != 'Stuffing'):
            df[column] = df[column].replace(r'.*', 'Yes', regex=True)
            df[column] = df[column].fillna('No')
        elif (column == 'Cranberry_Sauce') | (column == 'Stuffing'):
            df[column] = df[column].fillna('None')
        elif column == 'Parade':
            df[column] = df[column].replace(r'.*', 'Yes', regex=True)
            df[column] = df[column].fillna('No')
    df = df.replace('YesYes', 'Yes')
    df['Cranberry_Sauce'] = np.where((df['Cranberry_Sauce_Other'].str.contains('homemade', case=False) 
                                      & df['Cranberry_Sauce_Other'].str.contains('canned', case=False)),
                                     'Both', df['Cranberry_Sauce'])
    df['Salad'] = np.where(df['Side_Dish_Other2'].str.contains('salad', case=False), 'Yes', df['Salad'])
    df['Casserole'] = 'No'
    df['Casserole'] = np.where(df['Side_Dish_Other2'].str.contains('casserole', case=False), 'Yes', df['Casserole'])
    df['Stuffing'] = np.where((df['Stuffing_Other'].str.contains('cornbread', case=False) 
                                      | df['Stuffing_Other'].str.contains('corn bread', case=False)), 
                                        'Cornbread', df['Stuffing'])
    df['Meat_Pie'] = df['Other_Pie2'].apply(lambda x: 'Yes' if 'meat' in x.lower() else 'No')
    df['Chess_Pie'] = df['Other_Pie2'].apply(lambda x: 'Yes' if ('chess' in x.lower()) | ('cornmeal' in x.lower()) else 'No')
    df['Main_Dish_Cooked'] = np.where(df['Main_Dish_Cooked_Other'].str.contains('grilled', case=False), 'Grilled', df['Main_Dish_Cooked'])
    df['Main_Dish'] = np.where(df['Main_Dish_Other'].str.contains('turkey', case=False), 'Turkey', df['Main_Dish'])
    df['Main_Dish_Cooked'] = np.where(df['Main_Dish_Cooked_Other'].str.contains('smoked', case=False), 'Smoked', df['Main_Dish_Cooked'])
    df['Broccoli'] = df['Side_Dish_Other2'].apply(lambda x: 'Yes' if 'broccoli' in x.lower() else 'No')
    df['Pumpkin_Pie'] = np.where(df['Other_Dessert2'].str.contains('pumpkin', case=False), 'Yes', df['Pumpkin_Pie'])
    df['Rice'] = df['Side_Dish_Other2'].apply(lambda x: 'Yes' if 'rice' in x.lower() else 'No')
    for column in df.columns:
        if 'Other' in column:
            df = df.drop(column, axis=1)
#simplify main dish options
    main_dishes = ('Turkey', 'Ham/Pork', 'Tofurkey', 'Chicken')
    df['Main_Dish'] = df['Main_Dish'].apply(lambda x : x if x in main_dishes else 'Other')

#remove the ID column, it's meaningless
    df = df.drop('ID', axis=1)
#replace empty values in Black_Friday_Worker with 'No' if person isn't in retail
    df['Black_Friday_Worker'] = df['Retail_Worker'].apply(lambda x : 'No' if x == 'No' else x)

#fix NaN values in other columns
    df['Kids_Table_Age'] = df['Kids_Table_Age'].fillna('No Kids Table')
    mode = df['Age'].mode()[0]
    df['Age'] = df['Age'].fillna(mode)
    mode = df['Travel_Distance'].mode()[0]
    df['Travel_Distance'] = df['Travel_Distance'].fillna(mode)
    df = df.fillna('Other')
    
    df['Kids_Table'] = df['Kids_Table_Age'].apply(lambda x: 'No' if x == 'No Kids Table' else 'Yes')
    
    df['Fried_Turkey'] = df['Main_Dish_Cooked'].apply(lambda x: 'Yes' if x == 'Fried' else 'No')
    df['Homemade_Cranberry_Sauce'] = df['Cranberry_Sauce'].apply(lambda x: 'Yes' if (x == 'Homemade') | (x == 'Both') else 'No') 
    df['Cornbread_Stuffing'] = df['Stuffing'].apply(lambda x: 'Yes' if x == 'Cornbread' else 'No')
    df['Canned_Cranberry_Sauce'] = df['Cranberry_Sauce'].apply(lambda x: 'Yes' if x == 'Canned' else 'No')
    df['Smoked_Turkey'] = df['Main_Dish_Cooked'].apply(lambda x: 'Yes' if x == 'Smoked' else 'No')
    df['Rice_Stuffing'] = df['Stuffing'].apply(lambda x: 'Yes' if 'rice' in x.lower() else 'No')
    df['Roast_Turkey'] = df['Main_Dish_Cooked'].apply(lambda x: 'Yes' if x == 'Roasted' else 'No')
    
    northeast_df = df[df[northeast] == 'Yes']
    southern_df = df[df[southern] == 'Yes']
    flyover_df = df[df[midwest_and_plains] == 'Yes']
    west_df = df[df[west] == 'Yes']
    hawaiian_df = df[df[hawaii] == 'Yes']
    
    northeast_df['NE_Total'] = northeast_df.count(axis=1)
    southern_df['South_Total'] = southern_df.count(axis=1)
    flyover_df['MW_Total'] = flyover_df.count(axis=1)
    west_df['W_Total'] = west_df.count(axis=1)
    hawaiian_df['H_total'] = hawaiian_df.count(axis=1)
    df['NE_Total'] = northeast_df['NE_Total']
    df['South_Total'] = southern_df['South_Total']
    df['MW_Total'] = flyover_df['MW_Total']
    df['W_Total'] = west_df['W_Total']
    df['H_Total'] = hawaiian_df['H_total']
    totals = ['NE_Total', 'South_Total', 'MW_Total', 'W_Total', 'H_Total']
    df['Highest_Region'] = df[totals].idxmax(axis=1)
    df['Highest_Region'] = df['Highest_Region'].replace(['NE_Total', 'South_Total', 'MW_Total', 'W_Total', 'H_Total'],
                                                           ['New England', 'Southern', 'Middle America', 'West Coast', 'Hawaiian'])
    #removing data leaks
    df = df.drop(totals, axis=1)
    df = df.drop('US_Region', axis=1)
    return df


In [96]:
train, test = train_test_split(df)
train = wrangle(train)
test = wrangle(test)


In [97]:
train.columns

Index(['Main_Dish', 'Main_Dish_Cooked', 'Stuffing', 'Cranberry_Sauce', 'Gravy',
       'Brussel_Sprouts', 'Carrots', 'Cauliflower', 'Corn', 'Cornbread',
       'Fruit_Salad', 'Green_Beans', 'Mac_and_Cheese', 'Mashed_Potatoes',
       'Rolls_Biscuits', 'Squash', 'Salad', 'Sweet_Potatoes', 'Apple_Pie',
       'Buttermilk_Pie', 'Cherry_Pie', 'Chocolate_Pie', 'Coconut_Cream_Pie',
       'Key_Lime_Pie', 'Peach_Pie', 'Pecan_Pie', 'Pumpkin_Pie',
       'Sweet_Potato_Pie', 'No_Pie', 'Apple_Cobbler', 'Blondies', 'Brownies',
       'Carrot_Cake', 'Cheesecake', 'Cookies', 'Fudge', 'Ice_Cream',
       'Peach_Cobbler', 'No_Dessert', 'Prayer', 'Travel_Distance', 'Parade',
       'Kids_Table_Age', 'Old_Friends', 'Friendsgiving',
       'Black_Friday_Shopper', 'Retail_Worker', 'Black_Friday_Worker',
       'Neighborhood_Type', 'Age', 'Gender', 'Casserole', 'Meat_Pie',
       'Chess_Pie', 'Broccoli', 'Rice', 'Kids_Table', 'Fried_Turkey',
       'Homemade_Cranberry_Sauce', 'Cornbread_Stuffing',
       '

In [98]:
target = 'Highest_Region'

X_train = train.drop(target, axis=1)
y_train = train[target]
X_test = test.drop(target, axis=1)
y_test = test[target]

In [125]:
pipeline = make_pipeline(OrdinalEncoder(), RandomForestClassifier(n_jobs=-1))

In [126]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(categories='auto',
                                dtype=<class 'numpy.float64'>)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=10, n_jobs=-1,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbo

In [102]:
def loocv(X_train, y_train, model):
    num_folds = 10
    num_instances = len(X_train)
    loocv = LeaveOneOut()
    results = cross_val_score(model, X_train, y_train, cv=loocv)
    print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

In [103]:
loocv(X_train, y_train, pipeline)

Accuracy: 72.011% (44.895%)


In [105]:
y_train.value_counts(normalize=True)

Middle America    0.494565
New England       0.241848
Southern          0.236413
West Coast        0.027174
Name: Highest_Region, dtype: float64

In [108]:
pipeline_log = make_pipeline(OrdinalEncoder(), LogisticRegression())

In [109]:
pipeline_log.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(categories='auto',
                                dtype=<class 'numpy.float64'>)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [110]:
loocv(X_train, y_train, pipeline_log)

Accuracy: 91.168% (28.375%)


In [112]:
import xgboost
model = xgboost.XGBClassifier(n_jobs=-1)

pipeline_xg = make_pipeline(OrdinalEncoder(), model)

pipeline_xg.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(categories='auto',
                                dtype=<class 'numpy.float64'>)),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing=None,
                               n_estimators=100, n_jobs=-1, nthread=None,
                               objective='multi:softprob', random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               seed=None, silent=None, subsample=1,
                               verbosity=1))],
         verbose=False)

In [113]:
loocv(X_train, y_train, pipeline_xg)

Accuracy: 86.957% (33.678%)


In [122]:
print(y_train.iloc[[30]])
pipeline_log.predict(X_train.iloc[[30]])

592    Southern
Name: Highest_Region, dtype: object


array(['Southern'], dtype=object)

In [174]:
encoder = OrdinalEncoder()
X_train_encoded = encoder.fit_transform(X_train)
X_train_encoded = pd.DataFrame(X_train_encoded, columns = X_train.columns)

In [176]:
import eli5
from eli5.sklearn import PermutationImportance
model = LogisticRegression()
model.fit(X_train_encoded, y_train)

perm = PermutationImportance(model).fit(X_train_encoded, y_train)
eli5.show_weights(perm)

Weight,Feature
0.1043  ± 0.0099,x26
0.1016  ± 0.0108,x11
0.0821  ± 0.0154,x58
0.0807  ± 0.0170,x18
0.0774  ± 0.0169,x8
0.0772  ± 0.0237,x14
0.0682  ± 0.0128,x12
0.0639  ± 0.0160,x15
0.0601  ± 0.0156,x1
0.0543  ± 0.0060,x5


In [179]:
X_train.iloc[:,58]

539     No
411     No
422     No
709    Yes
56     Yes
      ... 
894     No
133     No
105     No
305     No
115     No
Name: Homemade_Cranberry_Sauce, Length: 736, dtype: object