# Feature Engineering - Business Attributes

In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [26]:
# Load the reviews CSV (parsing its "date" column as datetimes) and the business subset CSV.
rev_busi_Pho= pd.read_csv('../data/filtered_reviews_in_Phonex.csv', parse_dates=["date"])
busi = pd.read_csv('../data/business_data_subset.csv')
# Preview a single business row to see which columns are available.
busi.head(1)

Unnamed: 0,business_id,business_name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,CsLQLiRoafpJPJSkNX2h5Q,Middle East Deli,4508 E Independence Blvd,Charlotte,NC,28205,35.194894,-80.767442,3.0,5,0,"{'RestaurantsGoodForGroups': 'True', 'OutdoorS...","Food, Restaurants, Grocery, Middle Eastern",


### Change dict attribute to dummy variables

In [4]:
def convert_dict_into_dummy(data,feature):
    """
    Expand a column of stringified dicts into one column per dict key.

    Every non-missing cell of ``feature`` is parsed from its string form into a
    Python object; missing cells are parsed as ``None`` and produce an all-NaN row.

    data: DataFrame that holds the column.
    feature: name of the column whose cells are dict literals stored as str.
    return: dataframe with dict keys as columns, indexed like ``data``. The
            placeholder strings "None" and "nan" are returned as NaN.
    raises: ValueError / SyntaxError when a cell is not a plain Python literal.
    """
    import ast  # local import: keeps the function usable without a notebook-level import

    col_index = data.columns.get_loc(feature)
    raw_cells = data.iloc[:, col_index].fillna("None")
    # literal_eval accepts Python literals only, so text read from a CSV can
    # never execute code the way eval() would.
    parsed_cells = raw_cells.apply(ast.literal_eval)
    expanded = parsed_cells.apply(pd.Series)
    # The placeholders live inside the dicts, so they can only be matched once
    # the dicts have been expanded into scalar cells.
    return expanded.replace(["None", "nan"], np.nan)

In [12]:
# Expand the stringified dicts in busi["attributes"] into one column per dict key.
attr = convert_dict_into_dummy(busi,"attributes")
# Preview the first expanded row.
attr.head(1)

Unnamed: 0,RestaurantsGoodForGroups,OutdoorSeating,BusinessAcceptsCreditCards,RestaurantsTakeOut,GoodForKids,BusinessParking,RestaurantsPriceRange2,RestaurantsReservations,RestaurantsAttire,BikeParking,...,BYOBCorkage,CoatCheck,Corkage,BYOB,AgesAllowed,RestaurantsCounterService,DietaryRestrictions,Open24Hours,AcceptsInsurance,HairSpecializesIn
0,True,False,True,True,True,"{'garage': False, 'street': False, 'validated'...",1,False,u'casual',,...,,,,,,,,,,


In [13]:
# The "Ambience" column of attr holds stringified dicts too, so expand it the same way.
ambience = convert_dict_into_dummy(attr,"Ambience")
# dropna() limits the preview to rows where every ambience column has a value.
ambience.dropna().head(1)

Unnamed: 0,romantic,intimate,touristy,hipster,divey,classy,trendy,upscale,casual
1,False,False,False,False,False,False,False,False,False


In [14]:
# Prefix the expanded keys so they remain identifiable as ambience flags.
ambience.columns = ["Ambience_" + name for name in ambience.columns]
# Swap the raw dict column for its expanded flags. `columns=` is used because
# DataFrame.drop no longer accepts the axis as a positional argument (pandas >= 2.0).
attr = pd.concat([attr.drop(columns=["Ambience"]), ambience], axis=1)

### GoodForMeal (dessert, latenight, lunch, dinner, brunch, breakfast)

In [10]:
# Expand the "GoodForMeal" dict strings into one column per dict key.
goodformeal = convert_dict_into_dummy(attr,"GoodForMeal")
# Preview the first five rows that have a value in every expanded column.
goodformeal.dropna().head(5)

Unnamed: 0,dessert,latenight,lunch,dinner,brunch,breakfast
6,False,False,True,True,False,False
9,False,False,False,False,False,False
13,False,False,True,False,True,False
14,False,False,True,False,False,False
15,False,False,False,False,False,False


### Change dict into boolean

In [9]:
def convert_dict_into_boolean(data,feature,new_name):
    """
    Collapse a sparse dict-like column into a single boolean flag column.

    For features that are mostly NaN but still carry several values: a row is
    flagged True when its cell is not missing and contains the text "True",
    and False otherwise.

    data: DataFrame to annotate; it is modified in place.
    feature: name of the column to inspect.
    new_name: name of the boolean column to add (or overwrite).
    return: the same ``data`` object, with ``new_name`` added.
    """
    col_index = data.columns.get_loc(feature)
    cells = data.iloc[:, col_index]
    # Build the flags positionally and assign them as a plain list: writing with
    # label-based .loc[i, ...] only works when the index happens to be 0..n-1 and
    # otherwise appends rows or flags the wrong ones.
    data[new_name] = [(not pd.isna(cell)) and ("True" in cell) for cell in cells]
    return data

In [15]:
# Reduce each sparse dict column to one boolean flag, then discard the raw column.
for source_col, flag_col in (("BusinessParking", "Parking"), ("Music", "music")):
    attr = convert_dict_into_boolean(attr, source_col, flag_col).drop(source_col, axis=1)

### Hours

In [16]:
# Expand the stringified dicts in busi["hours"] into one column per dict key.
hours = convert_dict_into_dummy(busi,"hours")
# notnull() shows, per column, whether a value is present for that row.
hours.notnull().head()

Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,False,False,False,False,False,False,False
1,True,True,True,True,True,True,True
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,True,True,True,True,True,True,True


### Concatenate to form final business features

In [17]:
# Replace the raw dict-string columns with their expanded forms: the attribute
# columns plus one has-a-value flag per `hours` column.
# `columns=` is used because DataFrame.drop no longer accepts the axis as a
# positional argument (pandas >= 2.0).
bus_df = pd.concat(
    [busi.drop(columns=['attributes', 'hours']), attr, hours.notnull()],
    axis=1,
)
bus_df.head(1)

Unnamed: 0,business_id,business_name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,Ambience_casual,Parking,music,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,CsLQLiRoafpJPJSkNX2h5Q,Middle East Deli,4508 E Independence Blvd,Charlotte,NC,28205,35.194894,-80.767442,3.0,5,...,,False,False,False,False,False,False,False,False,False


### Data cleaning

In [18]:
def delete_u(data,feature):
    """
    Strip the quoting left over from stringified values such as "u'casual'".

    For every string cell of ``feature`` the text between the first pair of
    single quotes is kept ("u'casual'" -> "casual"). The placeholders "None"
    and "none" (the latter occurs for Alcohol) become NaN. Cells without
    quotes and non-string cells are left untouched, so running the function
    again on an already cleaned column is harmless.

    data: DataFrame to clean; it is modified in place.
    feature: name of the column to clean.
    return: the same ``data`` object.
    """
    def _clean(cell):
        # Map one raw cell to its cleaned value.
        if not isinstance(cell, str):
            return cell
        pieces = cell.split("'")
        text = pieces[1] if len(pieces) > 1 else cell
        return np.nan if text in ("None", "none") else text

    # Assign the cleaned column back explicitly: an in-place replace on an
    # iloc slice is not guaranteed to write through to the parent frame.
    data[feature] = data[feature].map(_clean)
    return data

In [19]:
# Strip the leftover quoting from each categorical attribute column in turn.
for feature in ("RestaurantsAttire", "Alcohol", "NoiseLevel", "Smoking", "WiFi"):
    bus_df = delete_u(bus_df, feature)

In [20]:
# Preview the first row after the quoted string columns were cleaned.
bus_df.head(1)

Unnamed: 0,business_id,business_name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,Ambience_casual,Parking,music,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,CsLQLiRoafpJPJSkNX2h5Q,Middle East Deli,4508 E Independence Blvd,Charlotte,NC,28205,35.194894,-80.767442,3.0,5,...,,False,False,False,False,False,False,False,False,False


In [21]:
# Spot-check one cleaned column: list its distinct values and how often each occurs.
bus_df["RestaurantsAttire"].value_counts()

casual    18700
dressy      647
formal       32
Name: RestaurantsAttire, dtype: int64

### Drop unrelated columns

In [22]:
# Remove the attribute columns that are not used as features.
bus_df = bus_df.drop(
    columns=[
        "DietaryRestrictions",
        "BYOB",
        "GoodForMeal",
        "AgesAllowed",
        "Open24Hours",
        "AcceptsInsurance",
        "HairSpecializesIn",
        "BYOBCorkage",
    ]
)

In [23]:
# Turn the stringified booleans into real booleans and the textual
# missing-value placeholders into NaN, one replacement after another.
bus_df = (
    bus_df
    .replace('True', True)
    .replace('False', False)
    .replace('None', np.nan)
    .replace('nan', np.nan)
)
bus_df.shape

(24401, 58)

### Keep restaurants in Phoenix

In [27]:
# Keep only the businesses whose id occurs in the review data.
bus_df_subset = bus_df.loc[
    bus_df["business_id"].isin(rev_busi_Pho["business_id"].unique())
]
bus_df_subset.shape

(1728, 58)

In [28]:
# Every column from "RestaurantsGoodForGroups" to the end is treated as a feature.
features_ind = bus_df_subset.columns.get_loc("RestaurantsGoodForGroups")
features = bus_df_subset.columns[features_ind:]
# Index the rows by business id and keep the feature columns only.
bus_df_subset = (
    bus_df_subset
    .set_index("business_id")
    .filter(items=features)
)

In [29]:
# Preview the first row of the feature table, now indexed by business_id.
bus_df_subset.head(1)

Unnamed: 0_level_0,RestaurantsGoodForGroups,OutdoorSeating,BusinessAcceptsCreditCards,RestaurantsTakeOut,GoodForKids,RestaurantsPriceRange2,RestaurantsReservations,RestaurantsAttire,BikeParking,Alcohol,...,Ambience_casual,Parking,music,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
lpQziF9QCVZQRkxac1xzcw,True,False,True,True,False,2,True,casual,True,full_bar,...,True,True,False,True,True,True,True,True,True,True


### Impute Missing Values

In [30]:
# Treat a missing attribute as False, then multiply by 1 so that boolean
# values become 0/1.
bus_df_subset = bus_df_subset.fillna(False) * 1

### Correct object data types:

In [31]:
# Cast every feature column to the type of the value found in its first row.
feature_dtypes = []  # NOTE(review): never read in the visible cells; looks like a leftover
for i in features:
#     print(i)
    # The target type is taken from row 0 only, so the outcome depends on row order.
    # NOTE(review): confirm this is intended for columns holding mixed types.
    type_to_convert = type(bus_df_subset[i].iloc[0])
#     print(type_to_convert)
    bus_df_subset[i] = bus_df_subset[i].astype(type_to_convert)

In [32]:
# Keep only the columns whose dtype is not "object" after the casting step.
col_index = bus_df_subset.dtypes[bus_df_subset.dtypes != "object"].index
bus_df_subset = bus_df_subset.loc[:, col_index]

In [33]:
# Attach the basic business columns to the feature table by matching on the
# business_id index of both frames.
bus_df_subset = (
    busi[["business_id", "latitude", "longitude", "stars", "review_count", "is_open"]]
    .set_index("business_id")
    .merge(bus_df_subset, left_index=True, right_index=True)
)

### Standardize non-boolean variables

In [36]:
# Standardize the continuous columns, then write the finished table to disk.
vars_to_scale = ["latitude", "longitude", "stars", "review_count"]
scaler = StandardScaler()
scaler.fit(bus_df_subset[vars_to_scale])
bus_df_subset[vars_to_scale] = scaler.transform(bus_df_subset[vars_to_scale])
bus_df_subset.to_csv("../data/business_subset_cleaned.csv")



In [37]:
# Preview the first four rows of the table that was written to disk.
bus_df_subset.head(4)

Unnamed: 0_level_0,latitude,longitude,stars,review_count,is_open,RestaurantsGoodForGroups,OutdoorSeating,BusinessAcceptsCreditCards,RestaurantsTakeOut,GoodForKids,...,Ambience_casual,Parking,music,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
lpQziF9QCVZQRkxac1xzcw,-0.155783,-0.475781,0.426501,-0.446236,1,1,0,1,1,0,...,1,1,0,1,1,1,1,1,1,1
Hgy5MrIKrwRJDXnwKaCpbA,-0.467969,1.270411,-1.23902,-0.391622,1,1,0,1,1,1,...,1,1,0,1,1,1,1,1,1,1
YOD9dXrnpu8HTRILpF0onw,1.373208,-0.249526,0.426501,0.147695,1,1,0,1,1,1,...,1,1,0,1,1,1,1,1,1,1
Wsb_rH2xNmMOdmzyb7eB7w,-0.800074,-0.515673,-0.40626,-0.534985,1,1,0,1,1,1,...,1,1,0,1,1,1,1,1,0,0
