In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import matplotlib.ticker as ticker
import matplotlib.dates as mdates

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn import preprocessing

pd.set_option("display.max_columns", 101)

In [2]:
# Credit: The reduce_mem_usage function has been taken from 
# https://gist.github.com/tkazusa/4d9e26d403c73755edc6b77b5b053a43
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object and str(col_type)!= 'category':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [3]:
sales_train_validation_data = pd.read_csv('data/sales_train_validation.csv')
sales_train_validation_data = reduce_mem_usage(sales_train_validation_data)
calendar_data = pd.read_csv('data/calendar.csv')
calendar_data = reduce_mem_usage(calendar_data)
prices_data = pd.read_csv('data/sell_prices.csv')
prices_data = reduce_mem_usage(prices_data)

Memory usage of dataframe is 446.40 MB
Memory usage after optimization is: 95.42 MB
Decreased by 78.6%
Memory usage of dataframe is 0.21 MB
Memory usage after optimization is: 0.22 MB
Decreased by -5.7%
Memory usage of dataframe is 208.77 MB
Memory usage after optimization is: 45.77 MB
Decreased by 78.1%


In [4]:
d_colnames = []
for i in range(1, 1914):
    d_colnames.append('d_'+str(i))
sales_train_validation_data_new = pd.melt(sales_train_validation_data, \
        id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
        value_vars=d_colnames, var_name='day_number', value_name='units_sold')
sales_train_validation_data_new['day_number'] = \
    sales_train_validation_data_new['day_number'].apply(lambda x: int(x[2:]))
sales_train_validation_data_new = reduce_mem_usage(sales_train_validation_data_new)
sales_train_validation_data_new

Memory usage of dataframe is 1002.84 MB
Memory usage after optimization is: 669.09 MB
Decreased by 33.3%


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day_number,units_sold
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1,0
5,HOBBIES_1_006_CA_1_validation,HOBBIES_1_006,HOBBIES_1,HOBBIES,CA_1,CA,1,0
6,HOBBIES_1_007_CA_1_validation,HOBBIES_1_007,HOBBIES_1,HOBBIES,CA_1,CA,1,0
7,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,1,12
8,HOBBIES_1_009_CA_1_validation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,1,2
9,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,1,0


In [5]:
calendar_data['d'] = \
    calendar_data['d'].apply(lambda x: int(x[2:]))
merged_df = sales_train_validation_data_new.merge(calendar_data, left_on='day_number', right_on='d', how='left')
merged_df = reduce_mem_usage(merged_df)
del calendar_data
del sales_train_validation_data
del sales_train_validation_data_new
gc.collect() # Garbage collector

Memory usage of dataframe is 2449.29 MB
Memory usage after optimization is: 2115.63 MB
Decreased by 13.6%


62

In [6]:
df = merged_df.merge(
    prices_data, left_on=['store_id', 'item_id', 'wm_yr_wk'],
    right_on=['store_id', 'item_id', 'wm_yr_wk'],
    how='left')
df = reduce_mem_usage(df)

Memory usage of dataframe is 2226.88 MB
Memory usage after optimization is: 2226.88 MB
Decreased by 0.0%


In [7]:
del merged_df
del prices_data
gc.collect()

89

In [8]:
df.drop(['wday', 'd', 'wm_yr_wk'], inplace=True, axis=1)
df['date'] = pd.to_datetime(df['date'])
df['week'] = df['date'].dt.week
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day_number,units_sold,date,weekday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,week
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4


In [9]:
df.to_pickle("./data.pkl")

In [3]:
# Testing if pickle (serializing) the dataframe to file has worked
unpickled_df = pd.read_pickle("./data.pkl")
unpickled_df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day_number,units_sold,date,weekday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,week
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
5,HOBBIES_1_006_CA_1_validation,HOBBIES_1_006,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
6,HOBBIES_1_007_CA_1_validation,HOBBIES_1_007,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
7,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,1,12,2011-01-29,Saturday,1,2011,,,,,0,0,0,0.459961,4
8,HOBBIES_1_009_CA_1_validation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,1,2,2011-01-29,Saturday,1,2011,,,,,0,0,0,1.559570,4
9,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,3.169922,4


In [5]:
first_product = unpickled_df.loc[unpickled_df.item_id == 'HOBBIES_1_001']
first_product

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day_number,units_sold,date,weekday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,week
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
3049,HOBBIES_1_001_CA_2_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_2,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
6098,HOBBIES_1_001_CA_3_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_3,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
9147,HOBBIES_1_001_CA_4_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_4,CA,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
12196,HOBBIES_1_001_TX_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,TX_1,TX,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
15245,HOBBIES_1_001_TX_2_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,TX_2,TX,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
18294,HOBBIES_1_001_TX_3_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,TX_3,TX,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
21343,HOBBIES_1_001_WI_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,WI_1,WI,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
24392,HOBBIES_1_001_WI_2_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,WI_2,WI,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4
27441,HOBBIES_1_001_WI_3_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,WI_3,WI,1,0,2011-01-29,Saturday,1,2011,,,,,0,0,0,,4


In [6]:
# No of valdation points in first product
print(len(first_product[first_product.id.str.contains('validation')]))
# No of valdation points for the whole df
print(len(unpickled_df[unpickled_df.id.str.contains('validation')]))

19130
58327370


In [7]:
unpickled_df[unpickled_df.id.str.contains('training')]

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day_number,units_sold,date,weekday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,week


In [15]:
def snap(row):
    if row['snap_'+ row.state_id] == 1:
        return 1
    return 0

first_product['snap'] = first_product.apply(snap, axis=1)
first_product.drop(['snap_CA', 'snap_WI', 'snap_TX'], axis=1, inplace=True)
first_product

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day_number,units_sold,date,weekday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,sell_price,week,snap
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,1,0,2011-01-29,Saturday,1,2011,,,,,,4,0
3049,HOBBIES_1_001_CA_2_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_2,CA,1,0,2011-01-29,Saturday,1,2011,,,,,,4,0
6098,HOBBIES_1_001_CA_3_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_3,CA,1,0,2011-01-29,Saturday,1,2011,,,,,,4,0
9147,HOBBIES_1_001_CA_4_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_4,CA,1,0,2011-01-29,Saturday,1,2011,,,,,,4,0
12196,HOBBIES_1_001_TX_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,TX_1,TX,1,0,2011-01-29,Saturday,1,2011,,,,,,4,0
15245,HOBBIES_1_001_TX_2_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,TX_2,TX,1,0,2011-01-29,Saturday,1,2011,,,,,,4,0
18294,HOBBIES_1_001_TX_3_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,TX_3,TX,1,0,2011-01-29,Saturday,1,2011,,,,,,4,0
21343,HOBBIES_1_001_WI_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,WI_1,WI,1,0,2011-01-29,Saturday,1,2011,,,,,,4,0
24392,HOBBIES_1_001_WI_2_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,WI_2,WI,1,0,2011-01-29,Saturday,1,2011,,,,,,4,0
27441,HOBBIES_1_001_WI_3_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,WI_3,WI,1,0,2011-01-29,Saturday,1,2011,,,,,,4,0
