### Load Modules

In [1]:
import pandas as pd 
import numpy as np
import pickle
from matplotlib import pyplot as plt
import seaborn as sns
import time

### Load Data 

In [2]:
# Read the raw Starbucks dataset CSV into the working DataFrame `df_raw`.
df_raw = pd.read_csv(
    filepath_or_buffer="../Data/Raw/starbucks_dataset_raw.csv",
)

### Cleaning

In [3]:
# Remove the 'Timestamp' column from df_raw in place.
# Author's rationale: time is dropped for the cluster analysis (short period).
df_raw.drop(columns=['Timestamp'], inplace=True)

In [4]:
# Rename every column of df_raw to the project taxonomy.
# The assignment is positional: the i-th label below replaces the i-th
# existing column label, so the list order must match the frame's column
# order and its length must equal the number of columns.
df_raw.columns = [
    'GENDER',
    'AGE',
    'EMPLOYMENT',
    'INCOME',
    'FREQUENCY_VISIT',
    'HOW_DO_YOU_ENJOY_STARBUCKS',
    'TIME_PER_VISIT',
    'DISTANCE_TO_NEAREST_STORE',
    'MEMBER',
    'PRODUCT_CATEGORY',
    'SPEND_PER_VISIT',
    'QUALITY_EV',
    'PRICE_EV',
    'PROMOTIONS_EV',
    'AMBIANCE_EV',
    'WIFI_EV',
    'SERVICE_EV',
    'BUSINESS_OR_FRIENDS',
    'PROMOTIONS_KNOW',
    'POTENTIAL_CLIENT',
]

#### CLEANING MULTIPLE ANSWER QUESTIONS (categories)

In [6]:
# Inspect the distinct raw answers given for PRODUCT_CATEGORY before cleaning.
df_raw.loc[:, 'PRODUCT_CATEGORY'].unique()

array(['Coffee', 'Cold drinks;Pastries', 'Coffee;Sandwiches',
       'Cold drinks', 'Coffee;Cold drinks',
       'Cold drinks;Pastries;Sandwiches',
       'Coffee;Juices;Pastries;Sandwiches', 'Coffee;Pastries;Sandwiches',
       'Coffee;Pastries', 'Cold drinks;Juices;Pastries',
       'Coffee;Cold drinks;Pastries;Sandwiches', 'Never', 'Never buy any',
       'Jaws chip ', 'cake ', 'Pastries', 'Cold drinks;Never', 'never',
       'Nothing ', 'Coffee;Cold drinks;Juices;Pastries;Sandwiches'],
      dtype=object)

In [7]:
# Normalise the multi-answer PRODUCT_CATEGORY text: remove every space and
# lower-case it, so spelling variants such as 'Never'/'never' or 'cake '/'cake'
# are treated alike. The vectorised .str methods leave a missing answer as NaN
# instead of raising the AttributeError that calling a str method on it would.
df_raw['PRODUCT_CATEGORY'] = (
    df_raw['PRODUCT_CATEGORY']
    .str.replace(" ", "", regex=False)
    .str.lower()
)

# Pass answers that have a negative meaning (they contain 'never' or 'noth',
# e.g. 'nothing') to the single special category 'never'.
# The whole cell is overwritten, so a mixed answer such as 'colddrinks;never'
# also becomes just 'never'.
# na=False keeps missing answers out of the boolean mask used by .loc.
is_negative_answer = (
    df_raw['PRODUCT_CATEGORY'].str.contains('never', regex=False, na=False)
    | df_raw['PRODUCT_CATEGORY'].str.contains('noth', regex=False, na=False)
)
df_raw.loc[is_negative_answer, 'PRODUCT_CATEGORY'] = 'never'

In [8]:
 # Preview the indicator columns produced by splitting PRODUCT_CATEGORY on ';'.
 df_raw.loc[:, 'PRODUCT_CATEGORY'].str.get_dummies(sep=';').columns

Index(['cake', 'coffee', 'colddrinks', 'jawschip', 'juices', 'never',
       'pastries', 'sandwiches'],
      dtype='object')

In [9]:
# One-hot encode the ';'-separated PRODUCT_CATEGORY answers: one 0/1 column
# per distinct answer, named 'FG_<answer>'.
# The column names are derived from the dummy frame itself rather than typed
# by hand, so each label always belongs to the indicator it names even if the
# set of answers in the data changes.
product_flags = (
    df_raw['PRODUCT_CATEGORY']
    .str.get_dummies(sep=';')
    .add_prefix('FG_')
)
df_raw[product_flags.columns.tolist()] = product_flags

In [10]:
# Remove the PRODUCT_CATEGORY text column from df_raw in place.
df_raw.drop(columns=['PRODUCT_CATEGORY'], inplace=True)

In [11]:
# Cast the PROMOTIONS_KNOW answers to str so string operations can be applied
# to every row of the column.
df_raw['PROMOTIONS_KNOW'] = df_raw.loc[:, 'PROMOTIONS_KNOW'].astype(str)

In [12]:
# Build one 0/1 indicator column per promotion channel; a flag is set when the
# PROMOTIONS_KNOW answer contains that channel's keyword(s):
#   social media / deals, Starbucks site or app, email, friends,
#   store / billboards.

# Normalise first (remove every space, lower-case) so keyword matching ignores
# spacing and capitalisation.
df_raw['PROMOTIONS_KNOW'] = (
    df_raw['PROMOTIONS_KNOW']
    .str.replace(" ", "", regex=False)
    .str.lower()
)


def _mentions(keyword):
    """Return a boolean Series, True where PROMOTIONS_KNOW contains `keyword`.

    The match is a literal substring test; missing answers count as False.
    """
    return df_raw['PROMOTIONS_KNOW'].str.contains(keyword, regex=False, na=False)


df_raw['FG_DIGITAL_MEDIA'] = (_mentions('social') | _mentions('deal')).astype(int)
# The parentheses are required: '&' binds tighter than '|', so without them
# the expression reads site | (app & starb) and any answer containing 'site'
# is flagged even when it never mentions Starbucks. The intended rule is
# "site or app, together with starbucks".
df_raw['FG_STARBUCKS_WEBSITE'] = (
    (_mentions('site') | _mentions('app')) & _mentions('starb')
).astype(int)
df_raw['FG_EMAIL'] = _mentions('email').astype(int)
df_raw['FG_FRIENDS'] = _mentions('friend').astype(int)
df_raw['FG_FISIC'] = (_mentions('store') | _mentions('billboards')).astype(int)

In [13]:
# Remove the PROMOTIONS_KNOW text column from df_raw in place.
df_raw.drop(columns=['PROMOTIONS_KNOW'], inplace=True)

#### Cleaning NAs

In [17]:
# Columns handled as numeric when missing values are filled.
cols_num = [
    'QUALITY_EV',
    'PRICE_EV',
    'PROMOTIONS_EV',
    'AMBIANCE_EV',
    'WIFI_EV',
    'SERVICE_EV',
    'BUSINESS_OR_FRIENDS',
]

In [18]:
def _fill_with_median(column):
    """Return `column` with its missing values replaced by its own median."""
    return column.fillna(column.median())


# Impute each column listed in cols_num independently of the others.
df_raw[cols_num] = df_raw[cols_num].apply(_fill_with_median)

In [19]:
# Indicator (flag) columns: first those derived from PRODUCT_CATEGORY, then
# those derived from PROMOTIONS_KNOW.
fgs = [
    'FG_cake',
    'FG_coffee',
    'FG_colddrinks',
    'FG_jawschip',
    'FG_juices',
    'FG_never',
    'FG_pastries',
    'FG_sandwiches',
    'FG_DIGITAL_MEDIA',
    'FG_STARBUCKS_WEBSITE',
    'FG_EMAIL',
    'FG_FRIENDS',
    'FG_FISIC',
]

In [20]:
# Any missing value in a flag column is treated as "flag not set" (0).
df_raw[fgs] = df_raw[fgs].replace(to_replace=np.nan, value=0)

In [21]:
# Columns handled as categorical when missing values are filled.
cols_cat = [
    'GENDER',
    'AGE',
    'EMPLOYMENT',
    'INCOME',
    'FREQUENCY_VISIT',
    'HOW_DO_YOU_ENJOY_STARBUCKS',
    'TIME_PER_VISIT',
    'DISTANCE_TO_NEAREST_STORE',
    'MEMBER',
    'SPEND_PER_VISIT',
]

In [22]:
# Missing values in the categorical columns get the explicit label
# "Not Answered".
df_raw[cols_cat] = df_raw[cols_cat].replace(to_replace=np.nan, value="Not Answered")

### Exporting to CSV

In [23]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df_raw.to_csv(
    path_or_buf="../Data/Cleaned/df.csv",
    index=False,
)