In [2]:
import numpy as np
import pandas as pd
import feather
from tqdm import tqdm
from joblib import Parallel, delayed

In [3]:
df_test = pd.read_csv('../data/test.csv')

In [4]:
df_test.shape

(3370464, 5)

In [5]:
df_test.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion
0,125497040,2017-08-16,1,96995,False
1,125497041,2017-08-16,1,99197,False
2,125497042,2017-08-16,1,103501,False
3,125497043,2017-08-16,1,103520,False
4,125497044,2017-08-16,1,103665,False


In [6]:
len(np.unique(df_test.store_nbr))

54

In [7]:
len(np.unique(df_test.item_nbr))

3901

In [8]:
np.min(df_test.date)

'2017-08-16'

In [9]:
np.max(df_test.date)

'2017-08-31'

In [10]:
df_items = pd.read_csv('../data/items.csv')

In [11]:
df_items.shape

(4100, 4)

In [12]:
df_items.head()

Unnamed: 0,item_nbr,family,class,perishable
0,96995,GROCERY I,1093,0
1,99197,GROCERY I,1067,0
2,103501,CLEANING,3008,0
3,103520,GROCERY I,1028,0
4,103665,BREAD/BAKERY,2712,1


In [13]:
len(np.unique(df_items.item_nbr))

4100

In [14]:
pd.value_counts(df_items.family)

GROCERY I                     1334
BEVERAGES                      613
CLEANING                       446
PRODUCE                        306
DAIRY                          242
PERSONAL CARE                  153
BREAD/BAKERY                   134
HOME CARE                      108
DELI                            91
MEATS                           84
HOME AND KITCHEN I              77
LIQUOR,WINE,BEER                73
FROZEN FOODS                    55
POULTRY                         54
HOME AND KITCHEN II             45
EGGS                            41
CELEBRATION                     31
PREPARED FOODS                  26
LAWN AND GARDEN                 26
LADIESWEAR                      21
LINGERIE                        20
AUTOMOTIVE                      20
BEAUTY                          19
PLAYERS AND ELECTRONICS         17
SCHOOL AND OFFICE SUPPLIES      15
GROCERY II                      14
PET SUPPLIES                    14
SEAFOOD                          8
MAGAZINES           

In [15]:
len(np.unique(df_items['class']))

337

In [16]:
pd.value_counts(df_items.perishable)

0    3114
1     986
Name: perishable, dtype: int64

In [17]:
len(set(df_test.item_nbr) - set(df_items.item_nbr))

0

In [18]:
len(set(df_items.item_nbr) - set(df_test.item_nbr))

199

In [19]:
list(set(df_items.item_nbr) - set(df_test.item_nbr))

[1463810,
 1463814,
 1422347,
 1449487,
 1463823,
 1084436,
 1935388,
 1036317,
 766493,
 1328672,
 864290,
 1920035,
 1397797,
 1911336,
 2134058,
 1397802,
 1456171,
 1158705,
 958514,
 265266,
 122418,
 1924661,
 1352758,
 1950263,
 1924662,
 1948220,
 1929795,
 1695813,
 1929797,
 1997895,
 1964111,
 1463896,
 1418842,
 871514,
 1463900,
 1921118,
 1229919,
 414305,
 639586,
 874593,
 856687,
 1990768,
 1148017,
 1148018,
 856688,
 1980532,
 603255,
 1974921,
 2048139,
 1360013,
 1359503,
 1988754,
 730259,
 354964,
 1074327,
 2015898,
 1458842,
 1458844,
 1918621,
 716958,
 1243803,
 1047709,
 376483,
 1243817,
 426155,
 1455790,
 1949872,
 679604,
 1210038,
 2011329,
 675524,
 1950405,
 1939144,
 813769,
 1467082,
 813770,
 1166036,
 847575,
 673496,
 1464027,
 819933,
 819934,
 1173213,
 1915102,
 1990881,
 269029,
 1239789,
 1239790,
 1239791,
 1239792,
 2075374,
 410866,
 1239794,
 1418484,
 1239796,
 1239798,
 1239800,
 2048248,
 1239806,
 1998078,
 1320708,
 1239813,
 123982

In [20]:
not_in_test_item_ids = list(set(df_items.item_nbr) - set(df_test.item_nbr))

In [21]:
df_items1 = df_items[~df_items.item_nbr.isin(not_in_test_item_ids)]

In [22]:
df_items1.shape

(3901, 4)

In [23]:
df_test.isnull().sum()

id             0
date           0
store_nbr      0
item_nbr       0
onpromotion    0
dtype: int64

In [24]:
pd.value_counts(df_test.onpromotion)

False    3171867
True      198597
Name: onpromotion, dtype: int64

In [25]:
df_items.isnull().sum()

item_nbr      0
family        0
class         0
perishable    0
dtype: int64

In [26]:
df_test1 = pd.merge(df_test, df_items1, how='left', on='item_nbr')

In [27]:
df_test1.shape

(3370464, 8)

In [28]:
df_test1.isnull().sum()

id             0
date           0
store_nbr      0
item_nbr       0
onpromotion    0
family         0
class          0
perishable     0
dtype: int64

In [29]:
df_test1.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,family,class,perishable
0,125497040,2017-08-16,1,96995,False,GROCERY I,1093,0
1,125497041,2017-08-16,1,99197,False,GROCERY I,1067,0
2,125497042,2017-08-16,1,103501,False,CLEANING,3008,0
3,125497043,2017-08-16,1,103520,False,GROCERY I,1028,0
4,125497044,2017-08-16,1,103665,False,BREAD/BAKERY,2712,1


In [30]:
df_stores = pd.read_csv('../data/stores.csv')

In [31]:
df_stores.shape

(54, 5)

In [32]:
df_stores.head()

Unnamed: 0,store_nbr,city,state,store_type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [33]:
len(set(df_test1.store_nbr))

54

In [34]:
pd.value_counts(df_stores.city)

Quito            18
Guayaquil         8
Santo Domingo     3
Cuenca            3
Latacunga         2
Manta             2
Machala           2
Ambato            2
Esmeraldas        1
Daule             1
Libertad          1
Guaranda          1
Ibarra            1
Loja              1
Babahoyo          1
Quevedo           1
Salinas           1
Puyo              1
El Carmen         1
Cayambe           1
Playas            1
Riobamba          1
Name: city, dtype: int64

In [35]:
pd.value_counts(df_stores.state)

Pichincha                         19
Guayas                            11
Santo Domingo de los Tsachilas     3
Manabi                             3
Azuay                              3
El Oro                             2
Cotopaxi                           2
Tungurahua                         2
Los Rios                           2
Loja                               1
Imbabura                           1
Chimborazo                         1
Santa Elena                        1
Esmeraldas                         1
Pastaza                            1
Bolivar                            1
Name: state, dtype: int64

In [36]:
pd.value_counts(df_stores.store_type)

D    18
C    15
A     9
B     8
E     4
Name: store_type, dtype: int64

In [37]:
pd.value_counts(df_stores.cluster)

3     7
6     6
10    6
15    5
14    4
13    4
1     3
11    3
8     3
4     3
2     2
9     2
7     2
5     1
16    1
12    1
17    1
Name: cluster, dtype: int64

In [38]:
df_stores.isnull().sum()

store_nbr     0
city          0
state         0
store_type    0
cluster       0
dtype: int64

In [39]:
df_test2 = pd.merge(df_test1, df_stores, how='left', on='store_nbr')

In [41]:
df_test2.shape

(3370464, 12)

In [42]:
df_test2.isnull().sum()

id             0
date           0
store_nbr      0
item_nbr       0
onpromotion    0
family         0
class          0
perishable     0
city           0
state          0
store_type     0
cluster        0
dtype: int64

In [43]:
df_test2.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,family,class,perishable,city,state,store_type,cluster
0,125497040,2017-08-16,1,96995,False,GROCERY I,1093,0,Quito,Pichincha,D,13
1,125497041,2017-08-16,1,99197,False,GROCERY I,1067,0,Quito,Pichincha,D,13
2,125497042,2017-08-16,1,103501,False,CLEANING,3008,0,Quito,Pichincha,D,13
3,125497043,2017-08-16,1,103520,False,GROCERY I,1028,0,Quito,Pichincha,D,13
4,125497044,2017-08-16,1,103665,False,BREAD/BAKERY,2712,1,Quito,Pichincha,D,13


In [44]:
df_oil = pd.read_csv('../data/oil.csv')

In [45]:
df_oil.shape

(1218, 2)

In [46]:
df_oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [47]:
df_oil.isnull().sum()

date           0
dcoilwtico    43
dtype: int64

In [48]:
np.min(df_oil.dcoilwtico)

26.190000000000001

In [49]:
np.max(df_oil.dcoilwtico)

110.62

In [50]:
np.median(df_oil.dcoilwtico)

  r = func(a, **kwargs)


nan

In [51]:
df_oil_nan = df_oil[df_oil.dcoilwtico.isnull()]

In [52]:
df_oil_nan.shape

(43, 2)

In [53]:
df_oil_nan

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
14,2013-01-21,
34,2013-02-18,
63,2013-03-29,
104,2013-05-27,
132,2013-07-04,
174,2013-09-02,
237,2013-11-28,
256,2013-12-25,
261,2014-01-01,


In [54]:
dfg = df_oil.groupby('dcoilwtico')

In [55]:
dfg.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.20
5,2013-01-08,93.21
6,2013-01-09,93.08
7,2013-01-10,93.81
8,2013-01-11,93.60
9,2013-01-14,94.27


In [56]:
df_test2.dtypes

id              int64
date           object
store_nbr       int64
item_nbr        int64
onpromotion      bool
family         object
class           int64
perishable      int64
city           object
state          object
store_type     object
cluster         int64
dtype: object

In [57]:
df_oil.shape

(1218, 2)

In [59]:
np.datetime64(np.max(df_test2.date)) - np.datetime64(np.min(df_test2.date))

numpy.timedelta64(15,'D')

In [60]:
np.datetime64(np.max(df_oil.date)) - np.datetime64(np.min(df_oil.date))

numpy.timedelta64(1703,'D')

In [61]:
df_oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [62]:
df_oil.iloc[0]['dcoilwtico'] = 93.14

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [63]:
df_oil.head(10)

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2
5,2013-01-08,93.21
6,2013-01-09,93.08
7,2013-01-10,93.81
8,2013-01-11,93.6
9,2013-01-14,94.27


In [64]:
df_oil.loc[df_oil['date'] == '2013-01-01', 'dcoilwtico'] = 93.14

In [65]:
df_oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,93.14
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [66]:
df_oil[df_oil['date'] == '2013-01-01']['dcoilwtico']

0    93.14
Name: dcoilwtico, dtype: float64

In [67]:
np.isnan(df_oil[df_oil['date'] == '2013-01-21']['dcoilwtico'])

14    True
Name: dcoilwtico, dtype: bool

In [68]:
print(df_oil[df_oil['date'] == '2017-06-30']['dcoilwtico'])
print(df_oil[df_oil['date'] == '2017-07-01']['dcoilwtico'])
print(df_oil[df_oil['date'] == '2017-07-02']['dcoilwtico'])
print(df_oil[df_oil['date'] == '2017-07-03']['dcoilwtico'])
print(df_oil[df_oil['date'] == '2017-07-04']['dcoilwtico'])
print(df_oil[df_oil['date'] == '2017-07-05']['dcoilwtico'])

1173    46.02
Name: dcoilwtico, dtype: float64
Series([], Name: dcoilwtico, dtype: float64)
Series([], Name: dcoilwtico, dtype: float64)
1174   NaN
Name: dcoilwtico, dtype: float64
1175   NaN
Name: dcoilwtico, dtype: float64
1176    45.11
Name: dcoilwtico, dtype: float64


In [69]:
(46.02+45.11)/2.

45.565

In [70]:
df_oil.loc[df_oil['date'] == '2017-07-03', 'dcoilwtico'] = 45.565
df_oil.loc[df_oil['date'] == '2017-07-04', 'dcoilwtico'] = 45.565

In [71]:
print(df_oil[df_oil['date'] == '2017-06-30']['dcoilwtico'])
print(df_oil[df_oil['date'] == '2017-07-01']['dcoilwtico'])
print(df_oil[df_oil['date'] == '2017-07-02']['dcoilwtico'])
print(df_oil[df_oil['date'] == '2017-07-03']['dcoilwtico'])
print(df_oil[df_oil['date'] == '2017-07-04']['dcoilwtico'])
print(df_oil[df_oil['date'] == '2017-07-05']['dcoilwtico'])

1173    46.02
Name: dcoilwtico, dtype: float64
Series([], Name: dcoilwtico, dtype: float64)
Series([], Name: dcoilwtico, dtype: float64)
1174    45.565
Name: dcoilwtico, dtype: float64
1175    45.565
Name: dcoilwtico, dtype: float64
1176    45.11
Name: dcoilwtico, dtype: float64


In [72]:
df_oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,93.14
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [73]:
pd.isnull(df_oil[df_oil['date'] == '2013-01-21']['dcoilwtico'])

14    True
Name: dcoilwtico, dtype: bool

In [74]:
pd.isnull(df_oil[df_oil['date'] == '2013-01-01']['dcoilwtico'])

0    False
Name: dcoilwtico, dtype: bool

In [75]:
pd.isnull(df_oil[df_oil['date'] == '2016-12-26']['dcoilwtico'])

1039    True
Name: dcoilwtico, dtype: bool

In [76]:
df_oil[df_oil['date'] == '2016-11-24']

Unnamed: 0,date,dcoilwtico
1017,2016-11-24,


In [77]:
df_oil1 = df_oil.set_index('date')

In [78]:
df_oil1.head()

Unnamed: 0_level_0,dcoilwtico
date,Unnamed: 1_level_1
2013-01-01,93.14
2013-01-02,93.14
2013-01-03,92.97
2013-01-04,93.12
2013-01-07,93.2


In [79]:
df_oil1.loc['2013-01-01', 'dcoilwtico']

93.140000000000001

In [80]:
df_oil1.loc['2013-01-21', 'dcoilwtico']

nan

In [81]:
df_oil.iloc[0]['dcoilwtico']

93.140000000000001

In [82]:
df_oil.shape

(1218, 2)

In [83]:
np.isnan(df_oil1.loc['2013-01-21', 'dcoilwtico'])

True

In [84]:
# just set the oil price to prev ay price
# earlier thought of setting it to average of prev day and next day price
for i, date in enumerate(df_oil.date.values):
    next_day_price=0
    prev_day_price=0
    if i==0:
        continue
    prev_day_price = df_oil.iloc[i-1]['dcoilwtico']
    if(i < df_oil.shape[0]-1):
        next_day_price = df_oil.iloc[i-1]['dcoilwtico']
    else:
        continue
    
    curr_day_price = (prev_day_price + next_day_price)/2.0
    if np.isnan(df_oil1.loc[date, 'dcoilwtico']):
        df_oil.loc[df_oil['date'] == date, 'dcoilwtico'] = curr_day_price
    

In [85]:
df_oil.isnull().sum()

date          0
dcoilwtico    0
dtype: int64

In [87]:
df_oil.head()

Unnamed: 0,date,dcoilwtico
0,2013-01-01,93.14
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [89]:
set(df_oil.date) - set(df_test2.date) # dates in df_oil not in df_test2

{'2015-03-04',
 '2015-04-22',
 '2013-03-11',
 '2013-04-25',
 '2013-08-27',
 '2016-04-06',
 '2016-08-25',
 '2016-10-27',
 '2016-06-28',
 '2014-09-01',
 '2016-04-07',
 '2014-04-18',
 '2014-04-22',
 '2015-12-10',
 '2013-05-28',
 '2013-06-07',
 '2017-06-29',
 '2015-11-04',
 '2015-07-03',
 '2013-04-05',
 '2016-12-15',
 '2013-11-07',
 '2014-05-29',
 '2016-01-01',
 '2015-05-26',
 '2013-01-07',
 '2013-09-02',
 '2014-01-20',
 '2014-09-15',
 '2017-03-23',
 '2014-02-21',
 '2017-04-07',
 '2015-06-18',
 '2013-07-17',
 '2015-07-28',
 '2013-01-28',
 '2015-09-18',
 '2013-09-24',
 '2016-11-10',
 '2017-04-04',
 '2015-06-22',
 '2015-08-28',
 '2017-01-13',
 '2016-07-20',
 '2015-10-07',
 '2017-02-21',
 '2015-05-22',
 '2013-12-23',
 '2015-10-27',
 '2016-10-19',
 '2013-05-27',
 '2014-10-03',
 '2016-02-02',
 '2016-03-30',
 '2014-04-16',
 '2013-03-15',
 '2015-07-15',
 '2017-05-19',
 '2017-07-31',
 '2014-07-03',
 '2013-04-03',
 '2017-08-14',
 '2013-06-24',
 '2013-10-30',
 '2017-07-07',
 '2017-02-20',
 '2014-07-

In [120]:
l = list(set(df_oil.date) - set(df_test2.date))

In [121]:
df_oil.shape

(1218, 2)

In [122]:
df_oil2 = df_oil[~df_oil.date.isin(l)] # dates in df_test and df_oil2 now match

In [93]:
df_oil2.shape

(12, 2)

In [94]:
len(np.unique(df_test2.date))

16

In [95]:
len(set(df_test2.date))

16

In [96]:
set(df_test2.date) - set(df_oil.date)

{'2017-08-19', '2017-08-20', '2017-08-26', '2017-08-27'}

In [97]:
df = pd.DataFrame(list(set(df_test2.date) - set(df_oil.date)))

In [98]:
df.columns = ['date']

In [99]:
df['dcoilwtico'] = 0.0

In [100]:
df.head()

Unnamed: 0,date,dcoilwtico
0,2017-08-19,0.0
1,2017-08-26,0.0
2,2017-08-20,0.0
3,2017-08-27,0.0


In [101]:
df.shape

(4, 2)

In [102]:
def nearest(items, pivot):
    return min(items, key=lambda x: abs(x - pivot))

In [103]:
import datetime

In [104]:
df_oil2_dates = df_oil2.date.values

In [105]:
print(df_oil2_dates)

['2017-08-16' '2017-08-17' '2017-08-18' '2017-08-21' '2017-08-22'
 '2017-08-23' '2017-08-24' '2017-08-25' '2017-08-28' '2017-08-29'
 '2017-08-30' '2017-08-31']


In [114]:
df_oil2.loc[df_oil2['date'] == '2017-08-16', 'dcoilwtico']

1206    46.8
Name: dcoilwtico, dtype: float64

In [116]:
df_oil2.shape

(12, 2)

In [117]:
df_oil2 = df_oil2.reset_index(drop=True)

In [118]:
df_oil2.head()

Unnamed: 0,date,dcoilwtico
0,2017-08-16,46.8
1,2017-08-17,47.07
2,2017-08-18,48.59
3,2017-08-21,47.39
4,2017-08-22,47.65


In [119]:
df_oil2.isnull().sum()

date          0
dcoilwtico    0
dtype: int64

In [123]:
for i, date in enumerate(df.date.values):
    print(date)
    for j in range(1,5): # pick the last price upto 5 days back
        new_date = str(np.datetime64(date) - np.timedelta64(j, 'D'))
        if new_date in df_oil.date.values:
            print(new_date)
            last_price = df_oil2.loc[df_oil2['date'] == new_date, 'dcoilwtico']
            print(last_price.item())
            df.loc[df['date'] == date, 'dcoilwtico'] = last_price.item()
            break
        else:
            continue
        print('\n')

2017-08-19
2017-08-18
48.59
2017-08-26
2017-08-25
47.65
2017-08-20
2017-08-18
48.59
2017-08-27
2017-08-25
47.65


In [124]:
df.head()

Unnamed: 0,date,dcoilwtico
0,2017-08-19,48.59
1,2017-08-26,47.65
2,2017-08-20,48.59
3,2017-08-27,47.65


In [125]:
df.shape

(4, 2)

In [126]:
df_oil2.shape

(12, 2)

In [127]:
len(set(df_test2.date))

16

In [128]:
df_oil3 = pd.concat([df_oil2, df], axis=0)

In [129]:
df_oil3.isnull().sum()

date          0
dcoilwtico    0
dtype: int64

In [130]:
set(df_test2.date) - set(df_oil3.date)

set()

In [131]:
df_oil3.shape

(16, 2)

In [132]:
df_test3 = pd.merge(df_test2, df_oil3, how='left', on='date')

In [133]:
df_test3.shape

(3370464, 13)

In [134]:
df_test3.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,family,class,perishable,city,state,store_type,cluster,dcoilwtico
0,125497040,2017-08-16,1,96995,False,GROCERY I,1093,0,Quito,Pichincha,D,13,46.8
1,125497041,2017-08-16,1,99197,False,GROCERY I,1067,0,Quito,Pichincha,D,13,46.8
2,125497042,2017-08-16,1,103501,False,CLEANING,3008,0,Quito,Pichincha,D,13,46.8
3,125497043,2017-08-16,1,103520,False,GROCERY I,1028,0,Quito,Pichincha,D,13,46.8
4,125497044,2017-08-16,1,103665,False,BREAD/BAKERY,2712,1,Quito,Pichincha,D,13,46.8


In [135]:
df_test3.isnull().sum()

id             0
date           0
store_nbr      0
item_nbr       0
onpromotion    0
family         0
class          0
perishable     0
city           0
state          0
store_type     0
cluster        0
dcoilwtico     0
dtype: int64

In [136]:
df_test3['onpromotion'] = df_test3['onpromotion'].astype('str')

In [137]:
# save the file as each step
feather.write_dataframe(df_test3, '../cache/test3_t.feather')

In [141]:
df_t = pd.read_csv('../cache/test_transactions.csv')

In [142]:
df_t.head()

Unnamed: 0,date,transactions,store_nbr
0,2017-08-16,1538,1
1,2017-08-17,1745,1
2,2017-08-18,1653,1
3,2017-08-19,1687,1
4,2017-08-20,1397,1


In [143]:
df_t.shape

(864, 3)

In [144]:
len(set(df_t.store_nbr))

54

In [145]:
set(df_t.store_nbr) - set(df_test3.store_nbr)

set()

In [146]:
set(df_test3.store_nbr) - set(df_t.store_nbr)

set()

In [147]:
set(df_test3.date) - set(df_t.date)

set()

In [148]:
df_t.isnull().sum()

date            0
transactions    0
store_nbr       0
dtype: int64

In [149]:
df_test3['ds'] = df_test3['date'].astype('str') + '_' + df_test3['store_nbr'].astype('str')

In [150]:
df_t['ds'] = df_t['date'].astype('str') + '_' + df_t['store_nbr'].astype('str')

In [151]:
df_t1 = df_t

In [152]:
df_t1 = df_t1.drop('date', axis=1)
df_t1 = df_t1.drop('store_nbr', axis=1)

In [153]:
df_t1.head()

Unnamed: 0,transactions,ds
0,1538,2017-08-16_1
1,1745,2017-08-17_1
2,1653,2017-08-18_1
3,1687,2017-08-19_1
4,1397,2017-08-20_1


In [154]:
df_t1.shape

(864, 2)

In [156]:
df_t1.loc[863]

transactions              763
ds              2017-08-31_54
Name: 863, dtype: object

In [157]:
df_t1.tail()

Unnamed: 0,transactions,ds
859,870,2017-08-27_54
860,1012,2017-08-28_54
861,858,2017-08-29_54
862,788,2017-08-30_54
863,763,2017-08-31_54


In [158]:
df_t1.shape

(864, 2)

In [159]:
df_test3.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,family,class,perishable,city,state,store_type,cluster,dcoilwtico,ds
0,125497040,2017-08-16,1,96995,False,GROCERY I,1093,0,Quito,Pichincha,D,13,46.8,2017-08-16_1
1,125497041,2017-08-16,1,99197,False,GROCERY I,1067,0,Quito,Pichincha,D,13,46.8,2017-08-16_1
2,125497042,2017-08-16,1,103501,False,CLEANING,3008,0,Quito,Pichincha,D,13,46.8,2017-08-16_1
3,125497043,2017-08-16,1,103520,False,GROCERY I,1028,0,Quito,Pichincha,D,13,46.8,2017-08-16_1
4,125497044,2017-08-16,1,103665,False,BREAD/BAKERY,2712,1,Quito,Pichincha,D,13,46.8,2017-08-16_1


In [160]:
df_test3.isnull().sum()

id             0
date           0
store_nbr      0
item_nbr       0
onpromotion    0
family         0
class          0
perishable     0
city           0
state          0
store_type     0
cluster        0
dcoilwtico     0
ds             0
dtype: int64

In [161]:
df_test4 = pd.merge(df_test3, df_t1, how='left', on='ds')

In [162]:
df_test4.isnull().sum()

id              0
date            0
store_nbr       0
item_nbr        0
onpromotion     0
family          0
class           0
perishable      0
city            0
state           0
store_type      0
cluster         0
dcoilwtico      0
ds              0
transactions    0
dtype: int64

In [163]:
df_test4['transactions'] = df_test4['transactions'].astype('int')

In [164]:
df_test4 = df_test4.drop('ds', axis=1)

In [165]:
df_test4.shape

(3370464, 14)

In [166]:
# save the file as each step
feather.write_dataframe(df_test4, '../cache/test4_t.feather')

In [167]:
df_test4 = feather.read_dataframe('../cache/test4_t.feather')

In [168]:
df_test4.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,family,class,perishable,city,state,store_type,cluster,dcoilwtico,transactions
0,125497040,2017-08-16,1,96995,False,GROCERY I,1093,0,Quito,Pichincha,D,13,46.8,1538
1,125497041,2017-08-16,1,99197,False,GROCERY I,1067,0,Quito,Pichincha,D,13,46.8,1538
2,125497042,2017-08-16,1,103501,False,CLEANING,3008,0,Quito,Pichincha,D,13,46.8,1538
3,125497043,2017-08-16,1,103520,False,GROCERY I,1028,0,Quito,Pichincha,D,13,46.8,1538
4,125497044,2017-08-16,1,103665,False,BREAD/BAKERY,2712,1,Quito,Pichincha,D,13,46.8,1538


In [169]:
df_test4.shape

(3370464, 14)

In [170]:
df_h = pd.read_csv('../data/holidays_events.csv')

In [171]:
df_h.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [172]:
df_h.shape

(350, 6)

In [173]:
len(set(df_h.date))

312

In [174]:
len(set(df_test4.date))

16

In [175]:
pd.value_counts(df_h.type)

Holiday       221
Event          56
Additional     51
Transfer       12
Work Day        5
Bridge          5
Name: type, dtype: int64

In [176]:
pd.value_counts(df_h.locale)

National    174
Local       152
Regional     24
Name: locale, dtype: int64

In [177]:
pd.value_counts(df_h.transferred)

False    338
True      12
Name: transferred, dtype: int64

In [178]:
df_h[df_h.type == 'Work Day']

Unnamed: 0,date,type,locale,locale_name,description,transferred
42,2013-01-05,Work Day,National,Ecuador,Recupero puente Navidad,False
43,2013-01-12,Work Day,National,Ecuador,Recupero puente primer dia del ano,False
149,2014-12-20,Work Day,National,Ecuador,Recupero Puente Navidad,False
161,2015-01-10,Work Day,National,Ecuador,Recupero Puente Primer dia del ano,False
283,2016-11-12,Work Day,National,Ecuador,Recupero Puente Dia de Difuntos,False


In [179]:
df_h[(df_h.type == 'Holiday') & (df_h.transferred == True)]

Unnamed: 0,date,type,locale,locale_name,description,transferred
19,2012-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,True
72,2013-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,True
135,2014-10-09,Holiday,National,Ecuador,Independencia de Guayaquil,True
255,2016-05-24,Holiday,National,Ecuador,Batalla de Pichincha,True
266,2016-07-25,Holiday,Local,Guayaquil,Fundacion de Guayaquil,True
268,2016-08-10,Holiday,National,Ecuador,Primer Grito de Independencia,True
297,2017-01-01,Holiday,National,Ecuador,Primer dia del ano,True
303,2017-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,True
312,2017-05-24,Holiday,National,Ecuador,Batalla de Pichincha,True
324,2017-08-10,Holiday,National,Ecuador,Primer Grito de Independencia,True


In [180]:
df_h.loc[0]

date                   2012-03-02
type                      Holiday
locale                      Local
locale_name                 Manta
description    Fundacion de Manta
transferred                 False
Name: 0, dtype: object

In [181]:
df_h.type[(df_h.type == 'Holiday') & (df_h.transferred == True)] = 'Work Day'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [182]:
df_h[(df_h.type == 'Holiday') & (df_h.transferred == True)]

Unnamed: 0,date,type,locale,locale_name,description,transferred


In [183]:
df_h_national = df_h[df_h.locale == 'National']

In [184]:
df_h_national.shape

(174, 6)

In [185]:
df_h_national = df_h_national.drop_duplicates(['date'], keep='first')

In [186]:
df_h_national.shape

(168, 6)

In [187]:
pd.value_counts(df_h_national.type)

Event         53
Holiday       52
Additional    37
Work Day      13
Transfer       8
Bridge         5
Name: type, dtype: int64

In [188]:
df_h_national.type[(df_h_national.type != 'Event') & (df_h_national.type != 'Work Day')] = 'Holiday'

In [189]:
pd.value_counts(df_h_national.type)

Holiday     102
Event        53
Work Day     13
Name: type, dtype: int64

In [190]:
df_h_national.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred
14,2012-08-10,Holiday,National,Ecuador,Primer Grito de Independencia,False
19,2012-10-09,Work Day,National,Ecuador,Independencia de Guayaquil,True
20,2012-10-12,Holiday,National,Ecuador,Traslado Independencia de Guayaquil,False
21,2012-11-02,Holiday,National,Ecuador,Dia de Difuntos,False
22,2012-11-03,Holiday,National,Ecuador,Independencia de Cuenca,False


In [191]:
df_h_national = df_h_national.drop(['transferred', 'locale', 'description', 'locale_name'], axis=1)

In [192]:
df_h_national.head()

Unnamed: 0,date,type
14,2012-08-10,Holiday
19,2012-10-09,Work Day
20,2012-10-12,Holiday
21,2012-11-02,Holiday
22,2012-11-03,Holiday


In [193]:
df_test4['d1'] = df_test4['date'].values

In [194]:
df_test4["d1"] = (pd.to_datetime(df_test4["d1"]))

In [195]:
df_test4.tail()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,family,class,perishable,city,state,store_type,cluster,dcoilwtico,transactions,d1
3370459,128867499,2017-08-31,54,2132163,False,GROCERY I,1040,0,El Carmen,Manabi,C,3,47.26,763,2017-08-31
3370460,128867500,2017-08-31,54,2132318,False,GROCERY I,1002,0,El Carmen,Manabi,C,3,47.26,763,2017-08-31
3370461,128867501,2017-08-31,54,2132945,False,GROCERY I,1026,0,El Carmen,Manabi,C,3,47.26,763,2017-08-31
3370462,128867502,2017-08-31,54,2132957,False,GROCERY I,1068,0,El Carmen,Manabi,C,3,47.26,763,2017-08-31
3370463,128867503,2017-08-31,54,2134244,False,"LIQUOR,WINE,BEER",1364,0,El Carmen,Manabi,C,3,47.26,763,2017-08-31


In [196]:
df_test4["dom"] = df_test4["d1"].apply(lambda x: x.day)
df_test4["mon"] = df_test4["d1"].apply(lambda x: x.month)
df_test4["dow"] = df_test4["d1"].apply(lambda x: x.weekday())
df_test4["doy"] = df_test4["d1"].apply(lambda x: x.timetuple().tm_yday)
df_test4["dcount"] = df_test4["d1"].apply(lambda x: x.toordinal())

In [197]:
df_test4 = df_test4.drop('d1', axis=1)

In [198]:
df_test4.tail()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,family,class,perishable,city,state,store_type,cluster,dcoilwtico,transactions,dom,mon,dow,doy,dcount
3370459,128867499,2017-08-31,54,2132163,False,GROCERY I,1040,0,El Carmen,Manabi,C,3,47.26,763,31,8,3,243,736572
3370460,128867500,2017-08-31,54,2132318,False,GROCERY I,1002,0,El Carmen,Manabi,C,3,47.26,763,31,8,3,243,736572
3370461,128867501,2017-08-31,54,2132945,False,GROCERY I,1026,0,El Carmen,Manabi,C,3,47.26,763,31,8,3,243,736572
3370462,128867502,2017-08-31,54,2132957,False,GROCERY I,1068,0,El Carmen,Manabi,C,3,47.26,763,31,8,3,243,736572
3370463,128867503,2017-08-31,54,2134244,False,"LIQUOR,WINE,BEER",1364,0,El Carmen,Manabi,C,3,47.26,763,31,8,3,243,736572


In [199]:
np.min(df_test4['dow'])

0

In [200]:
np.max(df_test4['dow'])

6

In [201]:
df_h_national.head()

Unnamed: 0,date,type
14,2012-08-10,Holiday
19,2012-10-09,Work Day
20,2012-10-12,Holiday
21,2012-11-02,Holiday
22,2012-11-03,Holiday


In [209]:
df_h_national.tail(100)

Unnamed: 0,date,type
155,2014-12-25,Holiday
156,2014-12-26,Holiday
158,2014-12-31,Holiday
159,2015-01-01,Holiday
160,2015-01-02,Holiday
161,2015-01-10,Work Day
162,2015-02-16,Holiday
163,2015-02-17,Holiday
166,2015-04-03,Holiday
170,2015-05-01,Holiday


In [202]:
df_test5 = pd.merge(df_test4, df_h_national, how='left', on='date')

In [203]:
df_test5.shape

(3370464, 20)

In [204]:
df_test4.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,family,class,perishable,city,state,store_type,cluster,dcoilwtico,transactions,dom,mon,dow,doy,dcount
0,125497040,2017-08-16,1,96995,False,GROCERY I,1093,0,Quito,Pichincha,D,13,46.8,1538,16,8,2,228,736557
1,125497041,2017-08-16,1,99197,False,GROCERY I,1067,0,Quito,Pichincha,D,13,46.8,1538,16,8,2,228,736557
2,125497042,2017-08-16,1,103501,False,CLEANING,3008,0,Quito,Pichincha,D,13,46.8,1538,16,8,2,228,736557
3,125497043,2017-08-16,1,103520,False,GROCERY I,1028,0,Quito,Pichincha,D,13,46.8,1538,16,8,2,228,736557
4,125497044,2017-08-16,1,103665,False,BREAD/BAKERY,2712,1,Quito,Pichincha,D,13,46.8,1538,16,8,2,228,736557


In [205]:
df_test4.shape

(3370464, 19)

In [206]:
df_test5.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,family,class,perishable,city,state,store_type,cluster,dcoilwtico,transactions,dom,mon,dow,doy,dcount,type
0,125497040,2017-08-16,1,96995,False,GROCERY I,1093,0,Quito,Pichincha,D,13,46.8,1538,16,8,2,228,736557,
1,125497041,2017-08-16,1,99197,False,GROCERY I,1067,0,Quito,Pichincha,D,13,46.8,1538,16,8,2,228,736557,
2,125497042,2017-08-16,1,103501,False,CLEANING,3008,0,Quito,Pichincha,D,13,46.8,1538,16,8,2,228,736557,
3,125497043,2017-08-16,1,103520,False,GROCERY I,1028,0,Quito,Pichincha,D,13,46.8,1538,16,8,2,228,736557,
4,125497044,2017-08-16,1,103665,False,BREAD/BAKERY,2712,1,Quito,Pichincha,D,13,46.8,1538,16,8,2,228,736557,


In [207]:
df_test5.isnull().sum()
# type            114199365

id                    0
date                  0
store_nbr             0
item_nbr              0
onpromotion           0
family                0
class                 0
perishable            0
city                  0
state                 0
store_type            0
cluster               0
dcoilwtico            0
transactions          0
dom                   0
mon                   0
dow                   0
doy                   0
dcount                0
type            3370464
dtype: int64

In [210]:
df_test5[df_test5.type.isnull()].shape

(3370464, 20)

In [211]:
df_test5.type[(df_test5.type.isnull()) & (df_test5.dow == 5)] = 'Holiday'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [212]:
df_test5.type[(df_test5.type.isnull()) & (df_test5.dow == 6)] = 'Holiday'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [214]:
df_test5.isnull().sum()

id                    0
date                  0
store_nbr             0
item_nbr              0
onpromotion           0
family                0
class                 0
perishable            0
city                  0
state                 0
store_type            0
cluster               0
dcoilwtico            0
transactions          0
dom                   0
mon                   0
dow                   0
doy                   0
dcount                0
type            2527848
dtype: int64

In [215]:
df_test5.fillna('Work Day', inplace=True)

In [53]:
df_train7.isnull().sum()

id              0
date            0
store_nbr       0
item_nbr        0
unit_sales      0
onpromotion     0
family          0
class           0
perishable      0
city            0
state           0
store_type      0
cluster         0
dcoilwtico      0
transactions    0
dom             0
mon             0
dow             0
doy             0
dcount          0
type            0
dtype: int64

In [216]:
df_h_national = df_h[df_h.locale == 'National']

In [217]:
df_h_national.shape

(174, 6)

In [218]:
df_h_national.type[(df_h_national.type != 'Event') & (df_h_national.type != 'Work Day')] = 'Holiday'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [219]:
df_h_national.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred
14,2012-08-10,Holiday,National,Ecuador,Primer Grito de Independencia,False
19,2012-10-09,Work Day,National,Ecuador,Independencia de Guayaquil,True
20,2012-10-12,Holiday,National,Ecuador,Traslado Independencia de Guayaquil,False
21,2012-11-02,Holiday,National,Ecuador,Dia de Difuntos,False
22,2012-11-03,Holiday,National,Ecuador,Independencia de Cuenca,False


In [220]:
df_h_national = df_h_national.drop(['transferred', 'locale', 'type', 'locale_name'], axis=1)

In [221]:
df_h_national.head()

Unnamed: 0,date,description
14,2012-08-10,Primer Grito de Independencia
19,2012-10-09,Independencia de Guayaquil
20,2012-10-12,Traslado Independencia de Guayaquil
21,2012-11-02,Dia de Difuntos
22,2012-11-03,Independencia de Cuenca


In [222]:
df_test6 = pd.merge(df_test5, df_h_national, how='left', on='date')

In [223]:
df_test6.isnull().sum()

id                    0
date                  0
store_nbr             0
item_nbr              0
onpromotion           0
family                0
class                 0
perishable            0
city                  0
state                 0
store_type            0
cluster               0
dcoilwtico            0
transactions          0
dom                   0
mon                   0
dow                   0
doy                   0
dcount                0
type                  0
description     3370464
dtype: int64

In [224]:
df_test6.type[(df_test6.description.isnull()) & (df_test6.dow == 5)] = 'Weekend'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [226]:
df_test6.type[(df_test6.description.isnull()) & (df_test6.dow == 6)] = 'Weekend'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [227]:
df_test6.fillna('Normal', inplace=True)

In [228]:
df_test6.isnull().sum()

id              0
date            0
store_nbr       0
item_nbr        0
onpromotion     0
family          0
class           0
perishable      0
city            0
state           0
store_type      0
cluster         0
dcoilwtico      0
transactions    0
dom             0
mon             0
dow             0
doy             0
dcount          0
type            0
description     0
dtype: int64

In [229]:
df_test6.shape

(3370464, 21)

In [230]:
df_test6.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,family,class,perishable,city,state,...,cluster,dcoilwtico,transactions,dom,mon,dow,doy,dcount,type,description
0,125497040,2017-08-16,1,96995,False,GROCERY I,1093,0,Quito,Pichincha,...,13,46.8,1538,16,8,2,228,736557,Work Day,Normal
1,125497041,2017-08-16,1,99197,False,GROCERY I,1067,0,Quito,Pichincha,...,13,46.8,1538,16,8,2,228,736557,Work Day,Normal
2,125497042,2017-08-16,1,103501,False,CLEANING,3008,0,Quito,Pichincha,...,13,46.8,1538,16,8,2,228,736557,Work Day,Normal
3,125497043,2017-08-16,1,103520,False,GROCERY I,1028,0,Quito,Pichincha,...,13,46.8,1538,16,8,2,228,736557,Work Day,Normal
4,125497044,2017-08-16,1,103665,False,BREAD/BAKERY,2712,1,Quito,Pichincha,...,13,46.8,1538,16,8,2,228,736557,Work Day,Normal


In [231]:
feather.write_dataframe(df_test6, '../cache/test6_t.feather')

In [3]:
df_test6 = feather.read_dataframe('../cache/test6_t.feather')

In [4]:
df_h = pd.read_csv('../data/holidays_events.csv')

In [5]:
df_h_local = df_h[df_h.locale == 'Local']

In [6]:
df_h_local.shape

(152, 6)

In [7]:
len(set(df_h_local.locale_name))

19

In [8]:
len(set(df_test6.city))

22

In [9]:
set(df_test6.city) - set(df_h_local.locale_name)

{'Babahoyo', 'Daule', 'Playas'}

In [10]:
df_h_local = df_h_local.drop_duplicates(['date'], keep='first')

In [11]:
df_h_local.shape

(138, 6)

In [12]:
df_h_local.tail(100)

Unnamed: 0,date,type,locale,locale_name,description,transferred
79,2013-11-11,Holiday,Local,Latacunga,Independencia de Latacunga,False
80,2013-11-12,Holiday,Local,Ambato,Independencia de Ambato,False
81,2013-12-05,Additional,Local,Quito,Fundacion de Quito-1,False
82,2013-12-06,Holiday,Local,Quito,Fundacion de Quito,False
83,2013-12-08,Holiday,Local,Loja,Fundacion de Loja,False
86,2013-12-22,Holiday,Local,Salinas,Cantonizacion de Salinas,False
93,2014-03-02,Holiday,Local,Manta,Fundacion de Manta,False
97,2014-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
98,2014-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
100,2014-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [13]:
df_h_l1 = df_h_local.set_index('date')

In [15]:
df_h_l1.loc['2017-08-24']

type                       Holiday
locale                       Local
locale_name                 Ambato
description    Fundacion de Ambato
transferred                  False
Name: 2017-08-24, dtype: object

In [16]:
import os
x = {i for i in range(20)}
os.sched_setaffinity(0, x)

In [17]:
def set_train_type_desc_local(date):
    t = df_h_l1.loc[date].type
    d = df_h_l1.loc[date].description
    c = df_h_l1.loc[date].locale_name

    df_test6.type[(df_test6.date == date) & (df_test6.city == c)] = t
    df_test6.description[(df_test6.date == date) & (df_test6.city == c)] = d

In [18]:
for date in tqdm(df_h_local.date):
    print(date)
    set_train_type_desc_local(date) 

  0%|          | 0/138 [00:00<?, ?it/s]

2012-03-02


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
  1%|          | 1/138 [00:02<04:37,  2.02s/it]

2012-04-12


  1%|▏         | 2/138 [00:03<03:54,  1.73s/it]

2012-04-14


  2%|▏         | 3/138 [00:05<03:45,  1.67s/it]

2012-04-21


  3%|▎         | 4/138 [00:06<03:44,  1.68s/it]

2012-05-12


  4%|▎         | 5/138 [00:08<03:43,  1.68s/it]

2012-06-23


  4%|▍         | 6/138 [00:09<03:34,  1.63s/it]

2012-06-25


  5%|▌         | 7/138 [00:11<03:33,  1.63s/it]

2012-07-03


  6%|▌         | 8/138 [00:13<03:33,  1.64s/it]

2012-07-23


  7%|▋         | 9/138 [00:14<03:30,  1.64s/it]

2012-08-05


  7%|▋         | 10/138 [00:16<03:33,  1.67s/it]

2012-08-15


  8%|▊         | 11/138 [00:18<03:30,  1.65s/it]

2012-08-24


  9%|▊         | 12/138 [00:20<03:30,  1.67s/it]

2012-09-28


  9%|▉         | 13/138 [00:21<03:30,  1.69s/it]

2012-10-07


 10%|█         | 14/138 [00:23<03:29,  1.69s/it]

2012-11-10


 11%|█         | 15/138 [00:25<03:27,  1.69s/it]

2012-11-11


 12%|█▏        | 16/138 [00:26<03:25,  1.69s/it]

2012-11-12


 12%|█▏        | 17/138 [00:28<03:24,  1.69s/it]

2012-12-05


 13%|█▎        | 18/138 [00:30<03:21,  1.68s/it]

2012-12-06


 14%|█▍        | 19/138 [00:31<03:18,  1.67s/it]

2012-12-08


 14%|█▍        | 20/138 [00:33<03:15,  1.66s/it]

2012-12-22


 15%|█▌        | 21/138 [00:34<03:12,  1.65s/it]

2013-03-02


 16%|█▌        | 22/138 [00:35<03:09,  1.63s/it]

2013-04-12


 17%|█▋        | 23/138 [00:37<03:07,  1.63s/it]

2013-04-14


 17%|█▋        | 24/138 [00:39<03:06,  1.64s/it]

2013-04-21


 18%|█▊        | 25/138 [00:41<03:06,  1.65s/it]

2013-05-12


 19%|█▉        | 26/138 [00:42<03:05,  1.65s/it]

2013-06-23


 20%|█▉        | 27/138 [00:44<03:03,  1.66s/it]

2013-06-25


 20%|██        | 28/138 [00:46<03:02,  1.66s/it]

2013-07-03


 21%|██        | 29/138 [00:48<03:00,  1.66s/it]

2013-07-23


 22%|██▏       | 30/138 [00:49<02:57,  1.65s/it]

2013-07-24


 22%|██▏       | 31/138 [00:50<02:55,  1.64s/it]

2013-07-25


 23%|██▎       | 32/138 [00:52<02:52,  1.63s/it]

2013-08-05


 24%|██▍       | 33/138 [00:53<02:50,  1.62s/it]

2013-08-15


 25%|██▍       | 34/138 [00:55<02:48,  1.62s/it]

2013-08-24


 25%|██▌       | 35/138 [00:56<02:47,  1.63s/it]

2013-09-28


 26%|██▌       | 36/138 [00:58<02:46,  1.63s/it]

2013-10-07


 27%|██▋       | 37/138 [01:00<02:44,  1.63s/it]

2013-11-10


 28%|██▊       | 38/138 [01:02<02:43,  1.64s/it]

2013-11-11


 28%|██▊       | 39/138 [01:03<02:42,  1.64s/it]

2013-11-12


 29%|██▉       | 40/138 [01:05<02:40,  1.64s/it]

2013-12-05


 30%|██▉       | 41/138 [01:06<02:38,  1.63s/it]

2013-12-06


 30%|███       | 42/138 [01:08<02:36,  1.63s/it]

2013-12-08


 31%|███       | 43/138 [01:10<02:35,  1.64s/it]

2013-12-22


 32%|███▏      | 44/138 [01:12<02:34,  1.65s/it]

2014-03-02


 33%|███▎      | 45/138 [01:14<02:33,  1.65s/it]

2014-04-12


 33%|███▎      | 46/138 [01:15<02:31,  1.65s/it]

2014-04-14


 34%|███▍      | 47/138 [01:17<02:30,  1.66s/it]

2014-04-21


 35%|███▍      | 48/138 [01:19<02:29,  1.66s/it]

2014-05-12


 36%|███▌      | 49/138 [01:21<02:27,  1.66s/it]

2014-06-23


 36%|███▌      | 50/138 [01:22<02:25,  1.65s/it]

2014-06-25


 37%|███▋      | 51/138 [01:23<02:23,  1.65s/it]

2014-07-03


 38%|███▊      | 52/138 [01:25<02:21,  1.64s/it]

2014-07-23


 38%|███▊      | 53/138 [01:26<02:19,  1.64s/it]

2014-07-24


 39%|███▉      | 54/138 [01:28<02:17,  1.63s/it]

2014-07-25


 40%|███▉      | 55/138 [01:29<02:15,  1.63s/it]

2014-08-05


 41%|████      | 56/138 [01:30<02:13,  1.62s/it]

2014-08-15


 41%|████▏     | 57/138 [01:32<02:11,  1.62s/it]

2014-08-24


 42%|████▏     | 58/138 [01:33<02:09,  1.62s/it]

2014-09-28


 43%|████▎     | 59/138 [01:35<02:07,  1.61s/it]

2014-10-07


 43%|████▎     | 60/138 [01:36<02:05,  1.61s/it]

2014-11-10


 44%|████▍     | 61/138 [01:38<02:03,  1.61s/it]

2014-11-11


 45%|████▍     | 62/138 [01:39<02:01,  1.60s/it]

2014-11-12


 46%|████▌     | 63/138 [01:40<02:00,  1.60s/it]

2014-12-05


 46%|████▋     | 64/138 [01:42<01:58,  1.60s/it]

2014-12-06


 47%|████▋     | 65/138 [01:43<01:56,  1.60s/it]

2014-12-08


 48%|████▊     | 66/138 [01:45<01:54,  1.59s/it]

2014-12-22


 49%|████▊     | 67/138 [01:46<01:52,  1.59s/it]

2015-03-02


 49%|████▉     | 68/138 [01:47<01:51,  1.59s/it]

2015-04-12


 50%|█████     | 69/138 [01:49<01:49,  1.59s/it]

2015-04-14


 51%|█████     | 70/138 [01:50<01:47,  1.58s/it]

2015-04-21


 51%|█████▏    | 71/138 [01:52<01:45,  1.58s/it]

2015-05-12


 52%|█████▏    | 72/138 [01:53<01:44,  1.58s/it]

2015-06-23


 53%|█████▎    | 73/138 [01:54<01:42,  1.57s/it]

2015-06-25


 54%|█████▎    | 74/138 [01:56<01:40,  1.57s/it]

2015-07-03


 54%|█████▍    | 75/138 [01:57<01:38,  1.57s/it]

2015-07-23


 55%|█████▌    | 76/138 [01:59<01:37,  1.57s/it]

2015-07-24


 56%|█████▌    | 77/138 [02:00<01:35,  1.57s/it]

2015-07-25


 57%|█████▋    | 78/138 [02:01<01:33,  1.56s/it]

2015-08-05


 57%|█████▋    | 79/138 [02:03<01:32,  1.56s/it]

2015-08-15


 58%|█████▊    | 80/138 [02:04<01:30,  1.56s/it]

2015-08-24


 59%|█████▊    | 81/138 [02:06<01:28,  1.56s/it]

2015-09-28


 59%|█████▉    | 82/138 [02:07<01:27,  1.56s/it]

2015-10-07


 60%|██████    | 83/138 [02:08<01:25,  1.55s/it]

2015-11-10


 61%|██████    | 84/138 [02:10<01:23,  1.55s/it]

2015-11-11


 62%|██████▏   | 85/138 [02:12<01:22,  1.56s/it]

2015-11-12


 62%|██████▏   | 86/138 [02:13<01:20,  1.56s/it]

2015-12-05


 63%|██████▎   | 87/138 [02:15<01:19,  1.56s/it]

2015-12-06


 64%|██████▍   | 88/138 [02:17<01:18,  1.56s/it]

2015-12-08


 64%|██████▍   | 89/138 [02:19<01:16,  1.56s/it]

2015-12-22


 65%|██████▌   | 90/138 [02:20<01:14,  1.56s/it]

2016-03-02


 66%|██████▌   | 91/138 [02:22<01:13,  1.56s/it]

2016-04-12


 67%|██████▋   | 92/138 [02:23<01:11,  1.56s/it]

2016-04-14


 67%|██████▋   | 93/138 [02:25<01:10,  1.56s/it]

2016-04-21


 68%|██████▊   | 94/138 [02:27<01:08,  1.56s/it]

2016-05-12


 69%|██████▉   | 95/138 [02:28<01:07,  1.57s/it]

2016-06-23


 70%|██████▉   | 96/138 [02:30<01:05,  1.57s/it]

2016-06-25


 70%|███████   | 97/138 [02:32<01:04,  1.57s/it]

2016-07-03


 71%|███████   | 98/138 [02:34<01:02,  1.57s/it]

2016-07-23


 72%|███████▏  | 99/138 [02:35<01:01,  1.57s/it]

2016-07-24


 72%|███████▏  | 100/138 [02:37<00:59,  1.57s/it]

2016-07-25


 73%|███████▎  | 101/138 [02:38<00:58,  1.57s/it]

2016-08-05


 74%|███████▍  | 102/138 [02:40<00:56,  1.57s/it]

2016-08-15


 75%|███████▍  | 103/138 [02:41<00:54,  1.57s/it]

2016-08-24


 75%|███████▌  | 104/138 [02:43<00:53,  1.57s/it]

2016-09-28


 76%|███████▌  | 105/138 [02:44<00:51,  1.57s/it]

2016-10-07


 77%|███████▋  | 106/138 [02:46<00:50,  1.57s/it]

2016-11-10


 78%|███████▊  | 107/138 [02:48<00:48,  1.57s/it]

2016-11-11


 78%|███████▊  | 108/138 [02:49<00:47,  1.57s/it]

2016-11-12


 79%|███████▉  | 109/138 [02:51<00:45,  1.57s/it]

2016-12-05


 80%|███████▉  | 110/138 [02:53<00:44,  1.57s/it]

2016-12-06


 80%|████████  | 111/138 [02:54<00:42,  1.57s/it]

2016-12-08


 81%|████████  | 112/138 [02:56<00:40,  1.57s/it]

2016-12-22


 82%|████████▏ | 113/138 [02:58<00:39,  1.58s/it]

2017-03-02


 83%|████████▎ | 114/138 [02:59<00:37,  1.58s/it]

2017-04-12


 83%|████████▎ | 115/138 [03:01<00:36,  1.57s/it]

2017-04-13


 84%|████████▍ | 116/138 [03:02<00:34,  1.57s/it]

2017-04-14


 85%|████████▍ | 117/138 [03:04<00:33,  1.58s/it]

2017-04-21


 86%|████████▌ | 118/138 [03:05<00:31,  1.58s/it]

2017-05-12


 86%|████████▌ | 119/138 [03:07<00:29,  1.58s/it]

2017-06-23


 87%|████████▋ | 120/138 [03:08<00:28,  1.57s/it]

2017-06-25


 88%|████████▊ | 121/138 [03:10<00:26,  1.57s/it]

2017-07-03


 88%|████████▊ | 122/138 [03:11<00:25,  1.57s/it]

2017-07-23


 89%|████████▉ | 123/138 [03:13<00:23,  1.57s/it]

2017-07-24


 90%|████████▉ | 124/138 [03:14<00:21,  1.57s/it]

2017-07-25


 91%|█████████ | 125/138 [03:16<00:20,  1.57s/it]

2017-08-05


 91%|█████████▏| 126/138 [03:17<00:18,  1.57s/it]

2017-08-15


 92%|█████████▏| 127/138 [03:18<00:17,  1.57s/it]

2017-08-24


 93%|█████████▎| 128/138 [03:20<00:15,  1.57s/it]

2017-09-28


 93%|█████████▎| 129/138 [03:21<00:14,  1.57s/it]

2017-09-29


 94%|█████████▍| 130/138 [03:23<00:12,  1.57s/it]

2017-10-07


 95%|█████████▍| 131/138 [03:25<00:10,  1.57s/it]

2017-11-10


 96%|█████████▌| 132/138 [03:26<00:09,  1.56s/it]

2017-11-11


 96%|█████████▋| 133/138 [03:27<00:07,  1.56s/it]

2017-11-12


 97%|█████████▋| 134/138 [03:29<00:06,  1.56s/it]

2017-12-05


 98%|█████████▊| 135/138 [03:30<00:04,  1.56s/it]

2017-12-06


 99%|█████████▊| 136/138 [03:31<00:03,  1.56s/it]

2017-12-08


 99%|█████████▉| 137/138 [03:33<00:01,  1.56s/it]

2017-12-22


100%|██████████| 138/138 [03:35<00:00,  1.56s/it]


In [246]:
df_h_regional = df_h[df_h.locale == 'Regional']

In [247]:
df_h_regional.shape

(24, 6)

In [248]:
len(set(df_h_regional.locale_name))

4

In [249]:
len(set(df_test6.state))

16

In [250]:
set(df_test6.state) - set(df_h_regional.locale_name)

{'Azuay',
 'Bolivar',
 'Chimborazo',
 'El Oro',
 'Esmeraldas',
 'Guayas',
 'Loja',
 'Los Rios',
 'Manabi',
 'Pastaza',
 'Pichincha',
 'Tungurahua'}

In [251]:
len(set(df_test6.state) - set(df_h_regional.locale_name))

12

In [252]:
df_h_regional = df_h_regional.drop_duplicates(['date'], keep='first')

In [253]:
df_h_regional.shape

(24, 6)

In [254]:
df_h_regional.head()

Unnamed: 0,date,type,locale,locale_name,description,transferred
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
7,2012-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura,False
23,2012-11-06,Holiday,Regional,Santo Domingo de los Tsachilas,Provincializacion de Santo Domingo,False
24,2012-11-07,Holiday,Regional,Santa Elena,Provincializacion Santa Elena,False
47,2013-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False


In [255]:
df_h_r1 = df_h_regional.set_index('date')

In [257]:
def set_test_type_desc_regional(date):
    t = df_h_r1.loc[date].type
    d = df_h_r1.loc[date].description
    s = df_h_r1.loc[date].locale_name

    df_test6.type[(df_test6.date == date) & (df_test6.state == s)] = t
    df_test6.description[(df_test6.date == date) & (df_test6.state == s)] = d

In [258]:
for date in tqdm(df_h_regional.date):
    set_test_type_desc_regional(date) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
100%|██████████| 24/24 [00:37<00:00,  1.57s/it]


In [259]:
df_test6.tail()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,family,class,perishable,city,state,...,cluster,dcoilwtico,transactions,dom,mon,dow,doy,dcount,type,description
3370459,128867499,2017-08-31,54,2132163,False,GROCERY I,1040,0,El Carmen,Manabi,...,3,47.26,763,31,8,3,243,736572,Work Day,Normal
3370460,128867500,2017-08-31,54,2132318,False,GROCERY I,1002,0,El Carmen,Manabi,...,3,47.26,763,31,8,3,243,736572,Work Day,Normal
3370461,128867501,2017-08-31,54,2132945,False,GROCERY I,1026,0,El Carmen,Manabi,...,3,47.26,763,31,8,3,243,736572,Work Day,Normal
3370462,128867502,2017-08-31,54,2132957,False,GROCERY I,1068,0,El Carmen,Manabi,...,3,47.26,763,31,8,3,243,736572,Work Day,Normal
3370463,128867503,2017-08-31,54,2134244,False,"LIQUOR,WINE,BEER",1364,0,El Carmen,Manabi,...,3,47.26,763,31,8,3,243,736572,Work Day,Normal


In [260]:
df_test6.shape

(3370464, 21)

In [262]:
feather.write_dataframe(df_test6, '../cache/test6_t.feather')

In [263]:
df_test6['pd'] = 0 # payday

In [264]:
df_test6.pd[(df_test6.dom == 15)]= 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [265]:
df_test6.pd[(df_test6.dom == 31) & (df_test6.mon == 1)]= 1
df_test6.pd[(df_test6.dom == 31) & (df_test6.mon == 3)]= 1
df_test6.pd[(df_test6.dom == 30) & (df_test6.mon == 4)]= 1
df_test6.pd[(df_test6.dom == 31) & (df_test6.mon == 5)]= 1
df_test6.pd[(df_test6.dom == 30) & (df_test6.mon == 6)]= 1
df_test6.pd[(df_test6.dom == 31) & (df_test6.mon == 7)]= 1
df_test6.pd[(df_test6.dom == 31) & (df_test6.mon == 8)]= 1
df_test6.pd[(df_test6.dom == 30) & (df_test6.mon == 9)]= 1
df_test6.pd[(df_test6.dom == 31) & (df_test6.mon == 10)]= 1
df_test6.pd[(df_test6.dom == 30) & (df_test6.mon == 11)]= 1
df_test6.pd[(df_test6.dom == 31) & (df_test6.mon == 12)]= 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [38]:
df_test6.pd[df_test6.date == '2013-02-28']= 1
df_test6.pd[df_test6.date == '2014-02-28']= 1
df_test6.pd[df_test6.date == '2015-02-28']= 1
df_test6.pd[df_test6.date == '2016-02-29']= 1
df_test6.pd[df_test6.date == '2017-02-28']= 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [266]:
df_test6.shape

(3370464, 22)

In [267]:
df_test6.isnull().sum()

id              0
date            0
store_nbr       0
item_nbr        0
onpromotion     0
family          0
class           0
perishable      0
city            0
state           0
store_type      0
cluster         0
dcoilwtico      0
transactions    0
dom             0
mon             0
dow             0
doy             0
dcount          0
type            0
description     0
pd              0
dtype: int64

In [269]:
def weeks_before_earthquake(date):
    if np.datetime64('2016-04-16') <= np.datetime64(date):
        return 0
    return int(np.ceil((np.datetime64('2016-04-16') - np.datetime64(date))/np.timedelta64(7, 'D')))

In [270]:
def weeks_after_earthquake(date):
    if np.datetime64('2016-04-16') >= np.datetime64(date):
        return 0
    return int(np.ceil((np.datetime64(date) - np.datetime64('2016-04-16'))/np.timedelta64(7, 'D')))

In [271]:
df_test6['wbe'] = 0
df_test6['wae'] = 0

In [272]:
df_test6['wbe'] = df_test6['date'].map(lambda d: weeks_before_earthquake(d))

In [273]:
df_test6['wae'] = df_test6['date'].map(lambda d: weeks_after_earthquake(d))

In [274]:
df_test6.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,family,class,perishable,city,state,...,dom,mon,dow,doy,dcount,type,description,pd,wbe,wae
0,125497040,2017-08-16,1,96995,False,GROCERY I,1093,0,Quito,Pichincha,...,16,8,2,228,736557,Work Day,Normal,0,0,70
1,125497041,2017-08-16,1,99197,False,GROCERY I,1067,0,Quito,Pichincha,...,16,8,2,228,736557,Work Day,Normal,0,0,70
2,125497042,2017-08-16,1,103501,False,CLEANING,3008,0,Quito,Pichincha,...,16,8,2,228,736557,Work Day,Normal,0,0,70
3,125497043,2017-08-16,1,103520,False,GROCERY I,1028,0,Quito,Pichincha,...,16,8,2,228,736557,Work Day,Normal,0,0,70
4,125497044,2017-08-16,1,103665,False,BREAD/BAKERY,2712,1,Quito,Pichincha,...,16,8,2,228,736557,Work Day,Normal,0,0,70


In [275]:
df_test6[df_test6.date == '2016-04-16'].shape

(0, 24)

In [276]:
df_test6[df_test6.date == '2017-08-17'].head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,family,class,perishable,city,state,...,dom,mon,dow,doy,dcount,type,description,pd,wbe,wae
210654,125707694,2017-08-17,1,96995,False,GROCERY I,1093,0,Quito,Pichincha,...,17,8,3,229,736558,Work Day,Normal,0,0,70
210655,125707695,2017-08-17,1,99197,False,GROCERY I,1067,0,Quito,Pichincha,...,17,8,3,229,736558,Work Day,Normal,0,0,70
210656,125707696,2017-08-17,1,103501,False,CLEANING,3008,0,Quito,Pichincha,...,17,8,3,229,736558,Work Day,Normal,0,0,70
210657,125707697,2017-08-17,1,103520,False,GROCERY I,1028,0,Quito,Pichincha,...,17,8,3,229,736558,Work Day,Normal,0,0,70
210658,125707698,2017-08-17,1,103665,False,BREAD/BAKERY,2712,1,Quito,Pichincha,...,17,8,3,229,736558,Work Day,Normal,0,0,70


In [277]:
df_test6['wfe'] = df_test6['wbe'].astype(int) + df_test6['wae'].astype(int)

In [278]:
df_test6.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,family,class,perishable,city,state,...,mon,dow,doy,dcount,type,description,pd,wbe,wae,wfe
0,125497040,2017-08-16,1,96995,False,GROCERY I,1093,0,Quito,Pichincha,...,8,2,228,736557,Work Day,Normal,0,0,70,70
1,125497041,2017-08-16,1,99197,False,GROCERY I,1067,0,Quito,Pichincha,...,8,2,228,736557,Work Day,Normal,0,0,70,70
2,125497042,2017-08-16,1,103501,False,CLEANING,3008,0,Quito,Pichincha,...,8,2,228,736557,Work Day,Normal,0,0,70,70
3,125497043,2017-08-16,1,103520,False,GROCERY I,1028,0,Quito,Pichincha,...,8,2,228,736557,Work Day,Normal,0,0,70,70
4,125497044,2017-08-16,1,103665,False,BREAD/BAKERY,2712,1,Quito,Pichincha,...,8,2,228,736557,Work Day,Normal,0,0,70,70


In [279]:
feather.write_dataframe(df_test6, '../cache/test7_t.feather')

In [280]:
df_test7 = feather.read_dataframe('../cache/test7_t.feather')

In [281]:
df_test7.shape

(3370464, 25)

In [282]:
df_test7 = df_test7.drop_duplicates(['id'], keep='first')

In [283]:
df_test7.shape

(3370464, 25)

In [284]:
# feather.write_dataframe(df_train10, '../cache/train10.feather')

In [285]:
# create a sample data frame

In [286]:
df_10 = df_test7.sample(frac=0.1, replace=False)

In [287]:
df_10.shape

(337046, 25)

In [288]:
feather.write_dataframe(df_10, '../cache/test7_sample_10_t.feather')

In [289]:
df_1 = df_test7.sample(frac=0.01, replace=False)

In [290]:
df_1.shape

(33705, 25)

In [291]:
feather.write_dataframe(df_1, '../cache/test7_sample_1_t.feather')