# Preprocessing and Training Data Development

## 1-Imports

In [28]:
#Import pandas
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from library.sb_utils import save_file

## 2-Load the data


In [52]:
# Read the merged dataset, which contains order, fulfilment-center, and meal information
merged_csv_path = '../Data/food_data_merged.csv'
food_orders = pd.read_csv(merged_csv_path)

In [42]:
# Summarize the frame: column names, non-null counts, and dtypes
food_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456548 entries, 0 to 456547
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     456548 non-null  int64  
 1   week                   456548 non-null  int64  
 2   center_id              456548 non-null  int64  
 3   meal_id                456548 non-null  int64  
 4   checkout_price         456548 non-null  float64
 5   base_price             456548 non-null  float64
 6   emailer_for_promotion  456548 non-null  int64  
 7   homepage_featured      456548 non-null  int64  
 8   num_orders             456548 non-null  int64  
 9   city_code              456548 non-null  int64  
 10  region_code            456548 non-null  int64  
 11  center_type            456548 non-null  object 
 12  op_area                456548 non-null  float64
 13  meal_category          456548 non-null  object 
 14  cuisine                456548 non-nu

In [43]:
# Preview the first five rows (head's default) to sanity-check the loaded values
food_orders.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,region_code,center_type,op_area,meal_category,cuisine
0,1379560,1,55,1885,136.83,152.29,0,0,177,647,56,TYPE_C,2.0,Beverages,Thai
1,1018704,2,55,1885,135.83,152.29,0,0,323,647,56,TYPE_C,2.0,Beverages,Thai
2,1196273,3,55,1885,132.92,133.92,0,0,96,647,56,TYPE_C,2.0,Beverages,Thai
3,1116527,4,55,1885,135.86,134.86,0,0,163,647,56,TYPE_C,2.0,Beverages,Thai
4,1343872,5,55,1885,146.5,147.5,0,0,215,647,56,TYPE_C,2.0,Beverages,Thai


## 3-Feature Engineering

#### Dummy Encoding for Categorical Features

In [44]:
# Collect the names of the categorical (object-dtype) columns
cat_features = food_orders.select_dtypes(include='object').columns
cat_features

Index(['center_type', 'meal_category', 'cuisine'], dtype='object')

In [45]:
# One-hot encode the categorical columns as 0/1 integers. drop_first=True omits
# one level per source column so the dummies are not perfectly collinear.
# NOTE(review): every dummy shares the prefix 'C', so a column name alone does not
# reveal which source column (center_type, meal_category, cuisine) it came from.
food_orders_c = pd.get_dummies(
    food_orders,
    columns=cat_features,
    prefix='C',
    drop_first=True,
    dtype=int,
)
food_orders_c.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,...,C_Pizza,C_Rice Bowl,C_Salad,C_Sandwich,C_Seafood,C_Soup,C_Starters,C_Indian,C_Italian,C_Thai
0,1379560,1,55,1885,136.83,152.29,0,0,177,647,...,0,0,0,0,0,0,0,0,0,1
1,1018704,2,55,1885,135.83,152.29,0,0,323,647,...,0,0,0,0,0,0,0,0,0,1
2,1196273,3,55,1885,132.92,133.92,0,0,96,647,...,0,0,0,0,0,0,0,0,0,1
3,1116527,4,55,1885,135.86,134.86,0,0,163,647,...,0,0,0,0,0,0,0,0,0,1
4,1343872,5,55,1885,146.5,147.5,0,0,215,647,...,0,0,0,0,0,0,0,0,0,1


In [46]:
# Confirm the encoded frame's columns and dtypes (the object columns are replaced by integer dummies)
food_orders_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456548 entries, 0 to 456547
Data columns (total 30 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     456548 non-null  int64  
 1   week                   456548 non-null  int64  
 2   center_id              456548 non-null  int64  
 3   meal_id                456548 non-null  int64  
 4   checkout_price         456548 non-null  float64
 5   base_price             456548 non-null  float64
 6   emailer_for_promotion  456548 non-null  int64  
 7   homepage_featured      456548 non-null  int64  
 8   num_orders             456548 non-null  int64  
 9   city_code              456548 non-null  int64  
 10  region_code            456548 non-null  int64  
 11  op_area                456548 non-null  float64
 12  C_TYPE_B               456548 non-null  int64  
 13  C_TYPE_C               456548 non-null  int64  
 14  C_Biryani              456548 non-nu

In [47]:
# (rows, columns) of the encoded frame
food_orders_c.shape

(456548, 30)

## 4-Standardize the magnitude of numeric features

In [48]:
# Separate the predictors from the target.
# 'num_orders' is the target; 'id' is dropped as well (presumably a row identifier
# with no predictive value -- confirm against the data dictionary).
X = food_orders_c.drop(columns=['num_orders', 'id'])
y = food_orders_c['num_orders']

# Continuous columns whose magnitudes are standardized
numerical_features = ['checkout_price', 'base_price', 'op_area']

# Fit the scaler on the training rows only, so that test-set statistics do not leak
# into the features (fitting on the full dataset would let the held-out rows influence
# the mean/std used for training).
# train_test_split chooses rows from the sample count and random_state alone, so
# splitting the row positions here with the SAME test_size and random_state as the
# Train/Test Split cell below selects exactly the rows that end up in X_train.
# Keep these two values in sync with that cell.
train_rows = train_test_split(np.arange(len(X)), test_size=0.3, random_state=47)[0]

scaler = StandardScaler()
scaler.fit(X[numerical_features].iloc[train_rows])

# Apply the train-fitted transformation to every row; the later split then yields
# a scaled X_train and an X_test scaled with training statistics only.
X[numerical_features] = scaler.transform(X[numerical_features])

## 5-Train/Test Split

In [49]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=47,
)

In [50]:
# Feature matrix sizes as (rows, columns) for the train and test partitions
X_train.shape, X_test.shape

((319583, 28), (136965, 28))

In [51]:
# Target vector sizes for the train and test partitions; row counts should match X_train / X_test
y_train.shape, y_test.shape

((319583,), (136965,))

## Summary

The categorical features have been identified, and dummy features were generated to enable their utilization in machine learning models. 

Furthermore, the numeric features ('checkout_price', 'base_price', 'op_area') underwent standardization. 

Additionally, in order to assess model performance effectively, the dataset was divided into training and testing sets.