# Preprocessing and Training Data Development

## 1-Imports

In [55]:
#Import pandas
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from library.sb_utils import save_file
from sklearn.model_selection import TimeSeriesSplit

## 2-Load the data


In [56]:
# Read the merged dataset, which contains orders, fulfilment centers, and meal info
merged_csv = '../Data/food_data_merged.csv'
food_orders = pd.read_csv(merged_csv)

In [57]:
# Use the info method to see a summary of the data: column names, non-null counts, and dtypes
food_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423727 entries, 0 to 423726
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     423727 non-null  int64  
 1   week                   423727 non-null  int64  
 2   center_id              423727 non-null  int64  
 3   meal_id                423727 non-null  int64  
 4   checkout_price         423727 non-null  float64
 5   base_price             423727 non-null  float64
 6   emailer_for_promotion  423727 non-null  int64  
 7   homepage_featured      423727 non-null  int64  
 8   num_orders             423727 non-null  int64  
 9   city_code              423727 non-null  int64  
 10  region_code            423727 non-null  int64  
 11  center_type            423727 non-null  object 
 12  op_area                423727 non-null  float64
 13  meal_category          423727 non-null  object 
 14  cuisine                423727 non-nu

In [58]:
# Use the head method to preview the first several rows of the data
food_orders.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,region_code,center_type,op_area,meal_category,cuisine
0,1379560,1,55,1885,136.83,152.29,0,0,177,647,56,TYPE_C,2.0,Beverages,Thai
1,1044078,1,39,2290,301.73,299.73,0,0,419,526,34,TYPE_C,3.8,Rice Bowl,Indian
2,1272063,1,65,2290,305.58,306.58,0,0,878,602,34,TYPE_A,4.8,Rice Bowl,Indian
3,1364972,1,43,2290,279.39,279.39,0,0,944,590,56,TYPE_A,5.1,Rice Bowl,Indian
4,1363806,1,80,2290,311.43,311.43,0,0,446,604,56,TYPE_C,5.1,Rice Bowl,Indian


## 3-Feature Engineering

#### Dummy Encoding for Categorical Features

In [59]:
# Select the categorical (object-dtype) features and keep their column names
object_columns = food_orders.select_dtypes(include='object')
cat_features = object_columns.columns
cat_features

Index(['center_type', 'meal_category', 'cuisine'], dtype='object')

In [70]:
# Dummy-encode the categorical columns as 0/1 integers.
# drop_first=True drops the first level of each column (it becomes the baseline).
# Every generated column shares the same 'C_' prefix regardless of its source column.
food_orders_c = pd.get_dummies(
    food_orders,
    columns=cat_features,
    prefix='C',
    drop_first=True,
    dtype=int,
)
food_orders_c.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,city_code,...,C_Pizza,C_Rice Bowl,C_Salad,C_Sandwich,C_Seafood,C_Soup,C_Starters,C_Indian,C_Italian,C_Thai
0,1379560,1,55,1885,136.83,152.29,0,0,177,647,...,0,0,0,0,0,0,0,0,0,1
1,1044078,1,39,2290,301.73,299.73,0,0,419,526,...,0,1,0,0,0,0,0,1,0,0
2,1272063,1,65,2290,305.58,306.58,0,0,878,602,...,0,1,0,0,0,0,0,1,0,0
3,1364972,1,43,2290,279.39,279.39,0,0,944,590,...,0,1,0,0,0,0,0,1,0,0
4,1363806,1,80,2290,311.43,311.43,0,0,446,604,...,0,1,0,0,0,0,0,1,0,0


In [61]:
# Summarize the encoded frame: column names, non-null counts, and dtypes
food_orders_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423727 entries, 0 to 423726
Data columns (total 30 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   id                     423727 non-null  int64  
 1   week                   423727 non-null  int64  
 2   center_id              423727 non-null  int64  
 3   meal_id                423727 non-null  int64  
 4   checkout_price         423727 non-null  float64
 5   base_price             423727 non-null  float64
 6   emailer_for_promotion  423727 non-null  int64  
 7   homepage_featured      423727 non-null  int64  
 8   num_orders             423727 non-null  int64  
 9   city_code              423727 non-null  int64  
 10  region_code            423727 non-null  int64  
 11  op_area                423727 non-null  float64
 12  C_TYPE_B               423727 non-null  int64  
 13  C_TYPE_C               423727 non-null  int64  
 14  C_Biryani              423727 non-nu

In [62]:
# (rows, columns) of the encoded frame
food_orders_c.shape

(423727, 30)

## 4-Standardize the magnitude of numeric features

In [71]:
# Extract features and target variable
# Features: every column except the target and the row identifier
X = food_orders_c.drop(columns=['num_orders', 'id'])
# Target variable
y = food_orders_c['num_orders']

# Standardize numerical features
numerical_features = ['checkout_price', 'base_price', 'op_area']

# Fit the scaler on the training rows only, so that the mean/variance of the
# held-out rows never leak into the training features. The boundary uses the
# same formula as the sequential split defined below (default test size 0.3);
# keep the two in sync if the split proportion changes.
# NOTE(review): like the sequential split, this assumes rows are ordered by week - confirm.
test_fraction = 0.3
n_train_rows = int(len(X) * (1 - test_fraction))

# Making a Scaler object
scaler = StandardScaler()
# Learn the scaling statistics from the training rows ...
scaler.fit(X[numerical_features].iloc[:n_train_rows])
# ... then apply that same transformation to every row (train and test alike)
X[numerical_features] = scaler.transform(X[numerical_features])

## 5-Train/Test Split

When working with time-series data, the sequential order of data points holds significance. Unlike traditional machine learning models, where a random train-test split is common, time-series data requires a sequential split to maintain the temporal order of observations. This consideration arises from the inherent dependence of future data points on past ones in time series data. Random splits can potentially lead to using future information to predict past events, which is inappropriate.

To address this, we adopt a sequential split for time series, dividing it into training and test sets based on a specified proportion of the data. A custom function is defined to facilitate this, allowing us to choose the percentage of data for the test set while using the remainder for the training set. Because the split is positional, it relies on the rows already being sorted chronologically (by `week`).

In [72]:
#Split the data into testing and training datasets 

def trian_test_split_T(x, y, testsize=0.3):
    """Split features and target sequentially (no shuffling) into train and test sets.

    The leading ``1 - testsize`` share of the rows becomes the training set and
    the remaining rows become the test set, so the original row order is kept.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature rows (any object supporting ``len`` and ``.iloc`` slicing).
    y : pandas.Series or pandas.DataFrame
        Target values, one per row of ``x``. A Series is converted to a
        single-column DataFrame; a DataFrame is used as-is.
    testsize : float, default 0.3
        Fraction of rows, between 0 and 1 inclusive, placed in the test set.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``testsize`` is outside ``[0, 1]`` or ``x`` and ``y`` differ in length.
    """
    # A fraction outside [0, 1] would yield a negative or oversized slice index
    # and silently produce a meaningless partition.
    if not 0 <= testsize <= 1:
        raise ValueError(f"testsize must be between 0 and 1, got {testsize!r}")
    # Positional slicing only keeps features and target aligned when both have
    # the same number of rows.
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same number of rows, got {len(x)} and {len(y)}"
        )

    # Series -> one-column DataFrame; objects without to_frame (e.g. DataFrame) pass through.
    y_frame = y.to_frame() if hasattr(y, "to_frame") else y

    # Index of the first test row; int() truncates toward zero.
    split = int(len(x) * (1 - testsize))

    X_train = x.iloc[:split]
    X_test = x.iloc[split:]
    y_train = y_frame.iloc[:split]
    y_test = y_frame.iloc[split:]
    return X_train, X_test, y_train, y_test

In [73]:
# Sequential split using the function's default test size
X_train, X_test, y_train, y_test=trian_test_split_T (X,y)

In [74]:
# (rows, columns) of the feature partitions
X_train.shape, X_test.shape

((296608, 28), (127119, 28))

In [75]:
# (rows, columns) of the target partitions
y_train.shape, y_test.shape

((296608, 1), (127119, 1))

In [77]:
# Save the encoded dataset and the train/test partitions (data files, not a model)

path = '../data'
outputs = [
    (food_orders_c, 'food_data_merged_c.csv'),
    (X_train, 'X_train.csv'),
    (X_test, 'X_test.csv'),
    (y_train, 'y_train.csv'),
    (y_test, 'y_test.csv'),
]
# Write each frame in turn, in the listed order
for frame, filename in outputs:
    save_file(frame, filename, path)

A file already exists with this name.



Do you want to overwrite? (Y/N) y


Writing file.  "../data/food_data_merged_c.csv"
A file already exists with this name.



Do you want to overwrite? (Y/N) y


Writing file.  "../data/X_train.csv"
A file already exists with this name.



Do you want to overwrite? (Y/N) y


Writing file.  "../data/X_test.csv"
A file already exists with this name.



Do you want to overwrite? (Y/N) y


Writing file.  "../data/y_train.csv"
A file already exists with this name.



Do you want to overwrite? (Y/N) y


Writing file.  "../data/y_test.csv"


### Summary

The categorical features have been identified, and dummy features were generated to enable their utilization in machine learning models. 

Furthermore, the numeric features ('checkout_price', 'base_price', 'op_area') underwent standardization. 

Additionally, in order to assess model performance effectively, the dataset was divided into training and testing sets.