## Data Preprocessing

#### Import Libraries

In [1]:
import pandas as pd
import numpy as np

#### Import Dataset

In [2]:
# Read at most the first 10,000 rows of the training CSV into a DataFrame.
path = "../Dataset/train/train.csv"
raw_data = pd.read_csv(path, nrows=10_000)
raw_data

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
0,0,2013-01-01,25,103665,7.0,
1,1,2013-01-01,25,105574,1.0,
2,2,2013-01-01,25,105575,2.0,
3,3,2013-01-01,25,108079,1.0,
4,4,2013-01-01,25,108701,1.0,
...,...,...,...,...,...,...
9995,9995,2013-01-02,9,698643,20.0,
9996,9996,2013-01-02,9,716241,5.0,
9997,9997,2013-01-02,9,716242,12.0,
9998,9998,2013-01-02,9,716245,7.0,


In [3]:
# Preview the first five rows of the loaded data.
raw_data.head()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
0,0,2013-01-01,25,103665,7.0,
1,1,2013-01-01,25,105574,1.0,
2,2,2013-01-01,25,105575,2.0,
3,3,2013-01-01,25,108079,1.0,
4,4,2013-01-01,25,108701,1.0,


In [4]:
# Work on a copy so that `raw_data` itself is left unmodified by the steps below.
df = raw_data.copy()

In [5]:
# Remove the `onpromotion` and `id` columns; they are not used further in this notebook.
df = df.drop(columns=["onpromotion", "id"])

#### Date to Weekday

In [6]:
# The raw dates are dash-separated strings such as "2013-01-01", so the format
# must use dashes as well; pandas versions that match formats strictly reject
# a slash-based format for this data.
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")


def weekday(date):
    """Return the day of the week of ``date`` as an integer (Monday = 0 ... Sunday = 6)."""
    return date.weekday()


# Scalar helper above is retained for any cell that calls it on a single timestamp;
# the column itself is computed with the vectorised datetime accessor.
df["Weekday"] = df["date"].dt.weekday

In [7]:
# Preview the frame to check the new `Weekday` column.
df.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,Weekday
0,2013-01-01,25,103665,7.0,1
1,2013-01-01,25,105574,1.0,1
2,2013-01-01,25,105575,2.0,1
3,2013-01-01,25,108079,1.0,1
4,2013-01-01,25,108701,1.0,1


In [8]:
# The weekday has been extracted, so the original `date` column is removed.
df = df.drop(columns="date")

In [9]:
# Load the store table; it is used below to look up the city of each store number.
store_df = pd.read_csv("../Dataset/stores.csv")

In [10]:
# Build a store_nbr -> city lookup keyed on the store number itself, so the
# result does not depend on the row order or index labels of stores.csv and
# the store table is not re-scanned for every sales row.
city_by_store = store_df.set_index("store_nbr")["city"]

# Fail loudly on an unknown store instead of silently storing NaN.
unknown_stores = set(df["store_nbr"]) - set(city_by_store.index)
if unknown_stores:
    raise KeyError(f"store_nbr values missing from stores.csv: {sorted(unknown_stores)}")

# `city` stays a plain list with one entry per sales row.
city = df["store_nbr"].map(city_by_store).tolist()
df["city"] = city

In [11]:
# The store number has been replaced by its city, so the column is removed.
df = df.drop(columns="store_nbr")

In [12]:
# Load the item table; it is used below to look up the product family of each item number.
class_data = pd.read_csv("../Dataset/items.csv")

In [13]:
# Remove the `class` and `perishable` columns from the item table.
class_data = class_data.drop(columns=["class", "perishable"])

In [14]:
# Build an item_nbr -> family lookup once instead of filtering the item table
# for every sales row. Keeping only the first row per item_nbr mirrors taking
# the first match for an item.
family_by_item = class_data.drop_duplicates("item_nbr").set_index("item_nbr")["family"]

# Fail loudly on an unknown item instead of silently producing NaN.
unknown_items = set(df["item_nbr"]) - set(family_by_item.index)
if unknown_items:
    raise KeyError(f"item_nbr values missing from items.csv: {sorted(unknown_items)}")

# `type_val` stays a plain list with one family per sales row, in row order.
type_val = df["item_nbr"].map(family_by_item).tolist()

In [15]:
# Attach the looked-up product family to each sales row.
df["family"] = type_val

In [16]:
# Preview the frame to check the new `city` and `family` columns.
df.head()

Unnamed: 0,item_nbr,unit_sales,Weekday,city,family
0,103665,7.0,1,Salinas,BREAD/BAKERY
1,105574,1.0,1,Salinas,GROCERY I
2,105575,2.0,1,Salinas,GROCERY I
3,108079,1.0,1,Salinas,GROCERY I
4,108701,1.0,1,Salinas,DELI


In [17]:
# The item number has been replaced by its product family, so the column is removed.
df = df.drop(columns="item_nbr")

In [18]:
# Preview the frame after removing `item_nbr`.
df.head()

Unnamed: 0,unit_sales,Weekday,city,family
0,7.0,1,Salinas,BREAD/BAKERY
1,1.0,1,Salinas,GROCERY I
2,2.0,1,Salinas,GROCERY I
3,1.0,1,Salinas,GROCERY I
4,1.0,1,Salinas,DELI


In [19]:
# One-hot encode the product family. The dtype is pinned because the default
# changed from uint8 to bool in pandas 2.0; uint8 keeps the 0/1 indicator
# columns regardless of the installed pandas version.
family_dummies = pd.get_dummies(df["family"], dtype="uint8")

In [20]:
# Append the family indicator columns to the right of the existing columns.
df = pd.concat((df, family_dummies), axis="columns")

In [21]:
# Remove the original `family` column and the AUTOMOTIVE indicator, which leaves
# AUTOMOTIVE as the implicit baseline category of the encoding.
df = df.drop(columns=["AUTOMOTIVE", "family"])

In [22]:
# One-hot encode the city. The dtype is pinned because the default changed
# from uint8 to bool in pandas 2.0; uint8 keeps the 0/1 indicator columns
# regardless of the installed pandas version.
city_dummies = pd.get_dummies(df["city"], dtype="uint8")

In [23]:
# Append the city indicator columns to the right of the existing columns.
df = pd.concat((df, city_dummies), axis="columns")

In [24]:
# Remove the original `city` column and the Quito indicator, which leaves
# Quito as the implicit baseline category of the encoding.
df = df.drop(columns=["Quito", "city"])

In [25]:
# Inspect the column names of the encoded frame.
df.columns.values

array(['unit_sales', 'Weekday', 'BEAUTY', 'BEVERAGES', 'BREAD/BAKERY',
       'CLEANING', 'DAIRY', 'DELI', 'EGGS', 'FROZEN FOODS', 'GROCERY I',
       'GROCERY II', 'HARDWARE', 'HOME APPLIANCES', 'LAWN AND GARDEN',
       'LINGERIE', 'LIQUOR,WINE,BEER', 'MEATS', 'PERSONAL CARE',
       'POULTRY', 'PREPARED FOODS', 'SEAFOOD', 'Salinas', 'Santo Domingo'],
      dtype=object)

In [26]:
# Move the target column to the end while keeping every feature column in its
# current order. Deriving the list from df.columns (rather than hard-coding the
# names) keeps this cell correct when a different sample of the data produces a
# different set of family/city indicator columns.
target_col = "unit_sales"
reorder_col = [col for col in df.columns if col != target_col] + [target_col]
df = df[reorder_col]

In [27]:
df.head()

Unnamed: 0,Weekday,BEAUTY,BEVERAGES,BREAD/BAKERY,CLEANING,DAIRY,DELI,EGGS,FROZEN FOODS,GROCERY I,...,LINGERIE,"LIQUOR,WINE,BEER",MEATS,PERSONAL CARE,POULTRY,PREPARED FOODS,SEAFOOD,Salinas,Santo Domingo,unit_sales
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,7.0
1,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1.0
2,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,2.0
3,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1.0
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1.0


#### Split Features and Target

In [28]:
# Take a copy of the preprocessed frame for modelling.
dataset = df.copy()

# The last column is the target; every column before it is a feature.
n_features = dataset.shape[1] - 1
X = dataset.iloc[:, :n_features]
y = dataset.iloc[:, n_features]

In [29]:
# Display the feature matrix (all columns except the last one of `dataset`).
X

Unnamed: 0,Weekday,BEAUTY,BEVERAGES,BREAD/BAKERY,CLEANING,DAIRY,DELI,EGGS,FROZEN FOODS,GROCERY I,...,LAWN AND GARDEN,LINGERIE,"LIQUOR,WINE,BEER",MEATS,PERSONAL CARE,POULTRY,PREPARED FOODS,SEAFOOD,Salinas,Santo Domingo
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9996,2,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,2,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,2,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Display the target vector (the last column of `dataset`).
y

0        7.0
1        1.0
2        2.0
3        1.0
4        1.0
        ... 
9995    20.0
9996     5.0
9997    12.0
9998     7.0
9999    10.0
Name: unit_sales, Length: 10000, dtype: float64

#### Regression

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [32]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split; `fit` returns the estimator itself.
regressor = LinearRegression().fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so its repr is displayed.
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [33]:
# Coefficient of determination (R^2) of the fitted model on the training split.
regressor.score(X_train, y_train)

0.07243393797913789

In [34]:
# Coefficient of determination (R^2) of the fitted model on the held-out test split.
regressor.score(X_test, y_test)

0.07441074994143404