# Calculamos 3 variables refinadas a partir de las anteriores:
- store_weekly_sales
- date_independent_store_weekly_sales
- efficiency_index

In [1]:
import os
import pandas as pd

# Ruta al fichero de datos.
path_to_file = os.path.join('.','src','train.csv')
data = pd.read_csv(path_to_file)

In [2]:
selected_data = data[['Date','Store','Weekly_Sales', 'Size', 'CPI']]
selected_data.head()

Unnamed: 0,Date,Store,Weekly_Sales,Size,CPI
0,2011-08-26,26,87235.57,152513,136.213613
1,2011-03-25,34,5945.97,158114,128.616064
2,2010-12-03,21,1219.89,140167,211.265543
3,2010-09-17,8,11972.71,155078,214.878556
4,2012-05-18,19,8271.82,203819,138.106581


In [3]:
# Base Df at Store-Date level:
store_sales_per_date = selected_data.groupby(['Date','Store'])['Weekly_Sales'].sum().reset_index()
store_obj = selected_data[['Store', 'Size']].drop_duplicates()
date_obj = selected_data[['Date', 'CPI']].drop_duplicates().groupby(['Date'])['CPI'].mean()

date_store_obj = pd.merge(store_sales_per_date, date_obj, on='Date', how='inner')
base = pd.merge(date_store_obj, store_obj, on='Store', how='inner')
base = base.sort_values(['Date', 'Store']).reset_index()

# store_weekly_sales
# Sales per week per store
final_data = base.rename(columns={'Weekly_Sales':'store_weekly_sales'}, inplace=False)
final_data.head()

Unnamed: 0,index,Date,Store,store_weekly_sales,CPI,Size
0,0,2010-02-05,1,1112466.82,191.427789,151315
1,143,2010-02-05,2,1506524.45,191.427789,202307
2,286,2010-02-05,3,358646.22,191.427789,37392
3,429,2010-02-05,4,1090558.09,191.427789,205863
4,572,2010-02-05,5,187551.77,191.427789,34875


In [4]:
# Date independent Store weekly sales.
# Measures the weekly sales of a store taking into account the increase in price. This is a better measurement of successful sales than the normal one.
# it is independent of the date.
final_data['date_independent_store_weekly_sales'] = final_data['store_weekly_sales'] / final_data['CPI']
final_data[final_data['Store'] == 2].head()

Unnamed: 0,index,Date,Store,store_weekly_sales,CPI,Size,date_independent_store_weekly_sales
1,143,2010-02-05,2,1506524.45,191.427789,202307,7869.936029
46,144,2010-02-12,2,1352217.18,191.549145,202307,7059.374665
91,145,2010-02-19,2,1521667.62,191.59718,202307,7942.01468
136,146,2010-02-26,2,1142608.45,191.632996,202307,5962.48284
181,147,2010-03-05,2,1245452.05,191.668811,202307,6497.937981


In [5]:
# Efficiency Index:
# Measure of how well a store is performing based on it's size. The greatter the size, the greater the potential of selling.
# This measurement tries to scale the date_independent_store_weekly_sales to all of the stores independently to their size.
final_data['efficiency_index'] = final_data['date_independent_store_weekly_sales'] / final_data['Size'] * 100
final_data.describe()

Unnamed: 0,index,Store,store_weekly_sales,CPI,Size,date_independent_store_weekly_sales,efficiency_index
count,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0
mean,3217.0,23.0,701559.5,195.897005,130287.6,3582.395988,2.905107
std,1857.768823,12.988182,391594.2,3.765544,63117.022465,2001.355189,1.070277
min,0.0,1.0,68982.11,190.988178,34875.0,344.177716,0.77857
25%,1608.5,12.0,375613.9,192.023334,70713.0,1915.333454,2.129661
50%,3217.0,23.0,639652.4,195.483004,126512.0,3265.261889,2.690708
75%,4825.5,34.0,958807.4,199.875,202307.0,4905.295148,3.482841
max,6434.0,45.0,2773216.0,202.273668,219622.0,14444.002116,10.807775


In [9]:
final_data = final_data[['Date','Store', 'store_weekly_sales', 'date_independent_store_weekly_sales', 'efficiency_index', 'CPI', 'Size']]
final_data.head()

Unnamed: 0,Date,Store,store_weekly_sales,date_independent_store_weekly_sales,efficiency_index,CPI,Size
0,2010-02-05,1,1112466.82,5811.417603,3.840609,191.427789,151315
1,2010-02-05,2,1506524.45,7869.936029,3.890096,191.427789,202307
2,2010-02-05,3,358646.22,1873.532692,5.010517,191.427789,37392
3,2010-02-05,4,1090558.09,5696.968545,2.767359,191.427789,205863
4,2010-02-05,5,187551.77,979.752059,2.809325,191.427789,34875


In [7]:
# save final data to file so that we can export it to Tableau:
path_to_output_file = os.path.join('.','src','pre_processed_output.csv')
final_data.to_csv(path_to_output_file, index=False)