# Capstone Two - Pre-processing and Training Data Development

### To complete this step, you'll do the following:

- Create dummy or indicator features for categorical variables
- Standardize the magnitude of numeric features using a scaler
- Split your data into testing and training datasets

### Creation of indicator features for categorical variables

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [2]:
#Read the csv file from the Data Wrangling step
df = pd.read_csv('walmart_sales-Copy1.csv', index_col = 0)

In [3]:
#See how the df looks like
df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,CPI,Unemployment,Close,Type,Size
0,1,1,2010-02-05,24924.5,False,42.31,2.572,211.096358,8.106,53.450001,A,151315
1,1,2,2010-02-05,50605.27,False,42.31,2.572,211.096358,8.106,53.450001,A,151315
2,1,3,2010-02-05,13740.12,False,42.31,2.572,211.096358,8.106,53.450001,A,151315
3,1,4,2010-02-05,39954.04,False,42.31,2.572,211.096358,8.106,53.450001,A,151315
4,1,5,2010-02-05,32229.38,False,42.31,2.572,211.096358,8.106,53.450001,A,151315


In [4]:
#See value counts per store 
df['Store'].value_counts(ascending = False)

13    10178
10    10024
4      9984
1      9956
2      9949
24     9941
27     9938
34     9935
20     9928
6      9923
32     9916
19     9861
31     9857
28     9827
41     9807
11     9780
23     9769
14     9759
40     9733
15     9620
8      9617
39     9600
17     9586
18     9582
26     9577
25     9532
7      9487
12     9434
22     9415
45     9366
21     9314
35     9262
29     9189
16     9180
3      8782
5      8745
9      8620
38     7157
37     7000
44     6971
30     6951
42     6757
43     6563
33     6307
36     6048
Name: Store, dtype: int64

In [5]:
#See the value counts per department
pd.set_option('max_rows', None)
pd.DataFrame(df['Dept'].value_counts(ascending = False))

Unnamed: 0,Dept
1,6255
10,6255
38,6255
21,6255
67,6255
16,6255
14,6255
13,6255
79,6255
81,6255


In [6]:
#Work with the categorical features
df_dummies = df[['Date','Weekly_Sales','IsHoliday','Store', 'Dept', 'Type']]

In [7]:
# Convert the df into dummies variable using pandas get dummies function
df_dummies = pd.get_dummies(df_dummies, columns = ['IsHoliday','Store', 'Dept', 'Type'])

In [8]:
#See how the data frame with categorial features looks like
df_dummies.head()

Unnamed: 0,Date,Weekly_Sales,IsHoliday_False,IsHoliday_True,Store_1,Store_2,Store_3,Store_4,Store_5,Store_6,...,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99,Type_A,Type_B,Type_C
0,2010-02-05,24924.5,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2010-02-05,50605.27,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2010-02-05,13740.12,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,2010-02-05,39954.04,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,2010-02-05,32229.38,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
#Specify the df for just the categorical features for future concating
df_dummies = df_dummies.drop(columns = {'Date', 'Weekly_Sales'}, axis = 1)

In [10]:
#Specify the sales and the date data frame for future concating
df_sales = df['Weekly_Sales']
df_date = df['Date']

### Standardize the magnitude of numeric features using a scaler

In [11]:
#Extract the numeric features from the main data frame
df_numeric = df[['Date', 'Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Close']]

In [12]:
#See first rows of the data
df_numeric.head()

Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Close
0,2010-02-05,24924.5,42.31,2.572,211.096358,8.106,53.450001
1,2010-02-05,50605.27,42.31,2.572,211.096358,8.106,53.450001
2,2010-02-05,13740.12,42.31,2.572,211.096358,8.106,53.450001
3,2010-02-05,39954.04,42.31,2.572,211.096358,8.106,53.450001
4,2010-02-05,32229.38,42.31,2.572,211.096358,8.106,53.450001


In [13]:
#Import necessary libraries
from sklearn.preprocessing import StandardScaler

In [14]:
#Create X and y variables
X = df_numeric.drop(columns = ['Weekly_Sales', 'Date'])
y = df_numeric['Weekly_Sales']

In [15]:
# Making a Scaler object
scaler = StandardScaler()
# Fitting data to the scaler object
scaler.fit(X)
#Transform the data
scaler_transformed = scaler.transform(X)

In [16]:
#Create a data frame with the scaled data
df_scaled = pd.DataFrame(scaler_transformed, columns = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Close'])

In [17]:
#See how the scaled function looks like
df_scaled.head()

Unnamed: 0,Temperature,Fuel_Price,CPI,Unemployment,Close
0,-0.973901,-1.724658,1.017898,0.080282,-0.578806
1,-0.973901,-1.724658,1.017898,0.080282,-0.578806
2,-0.973901,-1.724658,1.017898,0.080282,-0.578806
3,-0.973901,-1.724658,1.017898,0.080282,-0.578806
4,-0.973901,-1.724658,1.017898,0.080282,-0.578806


In [18]:
#Time to concat the scaled data frame with the date and the categorical features
df = pd.concat([df_date, df_sales, df_scaled, df_dummies], axis = 1)

In [19]:
#See how the final data frame looks like
df.head()

Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Close,IsHoliday_False,IsHoliday_True,Store_1,...,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99,Type_A,Type_B,Type_C
0,2010-02-05,24924.5,-0.973901,-1.724658,1.017898,0.080282,-0.578806,1,0,1,...,0,0,0,0,0,0,0,1,0,0
1,2010-02-05,50605.27,-0.973901,-1.724658,1.017898,0.080282,-0.578806,1,0,1,...,0,0,0,0,0,0,0,1,0,0
2,2010-02-05,13740.12,-0.973901,-1.724658,1.017898,0.080282,-0.578806,1,0,1,...,0,0,0,0,0,0,0,1,0,0
3,2010-02-05,39954.04,-0.973901,-1.724658,1.017898,0.080282,-0.578806,1,0,1,...,0,0,0,0,0,0,0,1,0,0
4,2010-02-05,32229.38,-0.973901,-1.724658,1.017898,0.080282,-0.578806,1,0,1,...,0,0,0,0,0,0,0,1,0,0


### Split your data into testing and training datasets

In [20]:
#Import necessary libraries
from sklearn.model_selection import train_test_split

In [21]:
#Define the X and the Y variables for the numeric variables
X = df.drop(columns = {'Date','Weekly_Sales'}).values
y = df[['Date', 'Weekly_Sales']].values

In [22]:
#Divide into train and test data
X_train = X[:int(X.shape[0]*0.75)]
y_train = y[:int(y.shape[0]*0.75)]
X_test = X[int(X.shape[0]*0.75):]
y_test = y[int(y.shape[0]*0.75):]

In [23]:
#Save the column names
column_names = df.drop(columns = {'Date', 'Weekly_Sales'}).columns
y_columns = df[['Date', 'Weekly_Sales']].columns

In [24]:
#Creation of the train data frame
df_train1 = pd.DataFrame(X_train, columns = column_names)
df_train2 = pd.DataFrame(y_train, columns = y_columns)
df_train = pd.concat([df_train2, df_train1], axis = 1)

In [25]:
#See the first 5 rows of our new train data frame
df_train.head()

Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Close,IsHoliday_False,IsHoliday_True,Store_1,...,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99,Type_A,Type_B,Type_C
0,2010-02-05,24924.5,-0.973901,-1.724658,1.017898,0.080282,-0.578806,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2010-02-05,50605.3,-0.973901,-1.724658,1.017898,0.080282,-0.578806,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2010-02-05,13740.1,-0.973901,-1.724658,1.017898,0.080282,-0.578806,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2010-02-05,39954.0,-0.973901,-1.724658,1.017898,0.080282,-0.578806,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2010-02-05,32229.4,-0.973901,-1.724658,1.017898,0.080282,-0.578806,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [26]:
#Convert the Date column into date type and the weekly sales into float numbers
df_train['Date'] = pd.to_datetime(df_train['Date'])
df_train['Weekly_Sales'] = df_train['Weekly_Sales'].astype(float)

In [27]:
#Create a new data frame to convert the time into categorical variables
df_date = pd.DataFrame()

In [28]:
#Convert to the year
df_date['Year'] = pd.Series(df_train['Date'])
df_date['Year'] = pd.DatetimeIndex(df_date['Year']).year

In [29]:
#Convert to the month
df_date['Month'] = pd.Series(df_train['Date'])
df_date['Month'] = pd.DatetimeIndex(df_date['Month']).month

In [30]:
#Convert to the day
df_date['Day'] = pd.Series(df_train['Date'])
df_date['Day'] = pd.DatetimeIndex(df_date['Day']).day

In [31]:
#Concat the train data frame with the categorical data frame 
df_train = pd.concat([df_date, df_train], axis = 1)

In [32]:
#Drop the date column in order to avoid redundancy
df_train.drop('Date', axis = 1, inplace = True)

In [33]:
#See the final data frame
df_train.head()

Unnamed: 0,Year,Month,Day,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Close,IsHoliday_False,...,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99,Type_A,Type_B,Type_C
0,2010,2,5,24924.5,-0.973901,-1.724658,1.017898,0.080282,-0.578806,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2010,2,5,50605.27,-0.973901,-1.724658,1.017898,0.080282,-0.578806,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2010,2,5,13740.12,-0.973901,-1.724658,1.017898,0.080282,-0.578806,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2010,2,5,39954.04,-0.973901,-1.724658,1.017898,0.080282,-0.578806,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2010,2,5,32229.38,-0.973901,-1.724658,1.017898,0.080282,-0.578806,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [34]:
# Convert train to csv
df_train.to_csv('WalmartSalesTrainDataset.csv')

In [35]:
# Creation of the test data frame 
df_test1 = pd.DataFrame(X_test, columns = column_names)
df_test2 = pd.DataFrame(y_test, columns = y_columns)
df_test = pd.concat([df_test2, df_test1], axis = 1)

In [36]:
#See first 5 rows
df_test.head()

Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Close,IsHoliday_False,IsHoliday_True,Store_1,...,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99,Type_A,Type_B,Type_C
0,2011-01-14,10705.5,-0.361915,-0.103728,-1.121517,0.533893,-0.391527,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2011-01-14,9349.8,-0.361915,-0.103728,-1.121517,0.533893,-0.391527,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2011-01-14,9161.96,-0.361915,-0.103728,-1.121517,0.533893,-0.391527,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2011-01-14,3155.9,-0.361915,-0.103728,-1.121517,0.533893,-0.391527,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2011-01-14,2444.72,-0.361915,-0.103728,-1.121517,0.533893,-0.391527,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [37]:
#Repetition of the process that is above but instead of training, we would be using the test data
#Convert date column into Date time and weekly sales into float
df_test['Date'] = pd.to_datetime(df_test['Date'])
df_test['Weekly_Sales'] = df_test['Weekly_Sales'].astype(float)

In [38]:
#Creation of the new data frame
df_date = pd.DataFrame()

In [39]:
#Year column
df_date['Year'] = pd.Series(df_test['Date'])
df_date['Year'] = pd.DatetimeIndex(df_date['Year']).year

In [40]:
#Month column
df_date['Month'] = pd.Series(df_test['Date'])
df_date['Month'] = pd.DatetimeIndex(df_date['Month']).month

In [41]:
#Day column
df_date['Day'] = pd.Series(df_test['Date'])
df_date['Day'] = pd.DatetimeIndex(df_date['Day']).day

In [42]:
#Concat the date data frame and the test data
df_test = pd.concat([df_date, df_test], axis = 1)

In [43]:
#Drop test to avoid redundancy
df_test.drop('Date', axis = 1, inplace = True)

In [44]:
#See how the test data looks like
df_test.head()

Unnamed: 0,Year,Month,Day,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Close,IsHoliday_False,...,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99,Type_A,Type_B,Type_C
0,2011,1,14,10705.49,-0.361915,-0.103728,-1.121517,0.533893,-0.391527,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2011,1,14,9349.8,-0.361915,-0.103728,-1.121517,0.533893,-0.391527,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2011,1,14,9161.96,-0.361915,-0.103728,-1.121517,0.533893,-0.391527,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2011,1,14,3155.9,-0.361915,-0.103728,-1.121517,0.533893,-0.391527,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2011,1,14,2444.72,-0.361915,-0.103728,-1.121517,0.533893,-0.391527,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [45]:
# Convert test to csv
df_test.to_csv('WalmartSalesTestDataset.csv')

### Questions

1. Does my data set have any categorical data, such as Gender or day of the week? 

The dataset I am going to work with does have many categorical features. Department's number, type of store, if it is whether holiday or not, and store's number

2. Do my features have data values that range from 0 - 100 or 0-1 or both and more? 

The dataset I'm working has different ranges. Before scaling the data, the values were between 0 and 100, and after scaling the data, we are having negative and positive values.