In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

sns.set(rc={'figure.figsize':(14.7,8.27)})

In [2]:
store = pd.read_csv(
    "data/store.csv"
)
new_train = pd.read_csv("data/new_train.csv",index_col=0, dtype={"StateHoliday": object})
data = store.merge(new_train,on="Store")
data["Date"] = pd.to_datetime(data["Date"])
data["CompetitionDistance"].fillna(-1, inplace=True)

data["HasCompetition"] = np.where(data["CompetitionOpenSinceMonth"] >= 0, 1, 0)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 864627 entries, 0 to 864626
Data columns (total 19 columns):
Store                        864627 non-null int64
StoreType                    864627 non-null object
Assortment                   864627 non-null object
CompetitionDistance          864627 non-null float64
CompetitionOpenSinceMonth    589729 non-null float64
CompetitionOpenSinceYear     589729 non-null float64
Promo2                       864627 non-null int64
Promo2SinceWeek              431040 non-null float64
Promo2SinceYear              431040 non-null float64
PromoInterval                431040 non-null object
DayOfWeek                    864627 non-null int64
Date                         864627 non-null datetime64[ns]
Sales                        864627 non-null int64
Customers                    864627 non-null int64
Open                         864627 non-null int64
Promo                        864627 non-null int64
StateHoliday                 864627 non-null object

In [4]:
int_cols = ["CompetitionOpenSinceYear", "CompetitionOpenSinceMonth", "Promo2SinceYear",
            "Promo2SinceWeek", 
           ]
data[int_cols] = data[int_cols].fillna(-1)
data[int_cols] = data[int_cols].astype(int)

In [5]:
data.groupby("Store").mean().sort_values(by="Sales", ascending=False).head()

# Open/Closed -> high
# DayOfWeek -> medium
# Holiday -> medium
# Store -> high

Unnamed: 0_level_0,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,DayOfWeek,Sales,Customers,Open,Promo,SchoolHoliday,HasCompetition
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
262,1180.0,5.0,2013.0,0.0,-1.0,-1.0,4.0,20771.417391,3407.146584,1.0,0.378882,0.182609,1.0
562,1210.0,-1.0,-1.0,0.0,-1.0,-1.0,4.0,18006.562733,3104.18882,1.0,0.378882,0.23354,0.0
817,140.0,3.0,2006.0,0.0,-1.0,-1.0,4.0,17789.629814,2557.583851,0.83354,0.378882,0.190062,1.0
1114,870.0,-1.0,-1.0,0.0,-1.0,-1.0,4.0,17394.627329,2695.827329,0.83354,0.378882,0.182609,0.0
251,340.0,-1.0,-1.0,0.0,-1.0,-1.0,4.0,15797.045963,2031.773913,0.827329,0.378882,0.193789,0.0


In [6]:
from sklearn.model_selection import train_test_split

data = pd.get_dummies(data)

X=data.drop(["Sales","Date"], axis=1)
y=data["Sales"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 864627 entries, 0 to 864626
Data columns (total 29 columns):
Store                             864627 non-null int64
CompetitionDistance               864627 non-null float64
CompetitionOpenSinceMonth         864627 non-null int64
CompetitionOpenSinceYear          864627 non-null int64
Promo2                            864627 non-null int64
Promo2SinceWeek                   864627 non-null int64
Promo2SinceYear                   864627 non-null int64
DayOfWeek                         864627 non-null int64
Date                              864627 non-null datetime64[ns]
Sales                             864627 non-null int64
Customers                         864627 non-null int64
Open                              864627 non-null int64
Promo                             864627 non-null int64
SchoolHoliday                     864627 non-null int64
HasCompetition                    864627 non-null int64
StoreType_a                       8646

In [7]:
from sklearn.tree import DecisionTreeClassifier

In [8]:
dtree = DecisionTreeClassifier()

In [9]:
dtree.fit(X_train, y_train)

MemoryError: could not allocate 20989345792 bytes

In [10]:
data.isna().any()

Store                             False
CompetitionDistance               False
CompetitionOpenSinceMonth         False
CompetitionOpenSinceYear          False
Promo2                            False
Promo2SinceWeek                   False
Promo2SinceYear                   False
DayOfWeek                         False
Date                              False
Sales                             False
Customers                         False
Open                              False
Promo                             False
SchoolHoliday                     False
HasCompetition                    False
StoreType_a                       False
StoreType_b                       False
StoreType_c                       False
StoreType_d                       False
Assortment_a                      False
Assortment_b                      False
Assortment_c                      False
PromoInterval_Feb,May,Aug,Nov     False
PromoInterval_Jan,Apr,Jul,Oct     False
PromoInterval_Mar,Jun,Sept,Dec    False


In [11]:
data.shape

(864627, 29)

In [12]:
data_samp = data.sample(frac=0.01)

In [15]:
data_samp.shape

(8646, 29)

In [20]:
print(data_samp["Date"].min())
print(data_samp["Date"].max())
print(data["Date"].min())
print(data["Date"].max())


2013-05-17 00:00:00
2015-07-31 00:00:00
2013-05-17 00:00:00
2015-07-31 00:00:00


In [22]:
data["Date"]

0        2015-07-31
1        2015-07-30
2        2015-07-29
3        2015-07-28
4        2015-07-27
5        2015-07-26
6        2015-07-25
7        2015-07-24
8        2015-07-23
9        2015-07-22
10       2015-07-21
11       2015-07-20
12       2015-07-19
13       2015-07-18
14       2015-07-17
15       2015-07-16
16       2015-07-15
17       2015-07-14
18       2015-07-13
19       2015-07-12
20       2015-07-11
21       2015-07-10
22       2015-07-09
23       2015-07-08
24       2015-07-07
25       2015-07-06
26       2015-07-05
27       2015-07-04
28       2015-07-03
29       2015-07-02
            ...    
864597   2013-06-16
864598   2013-06-15
864599   2013-06-14
864600   2013-06-13
864601   2013-06-12
864602   2013-06-11
864603   2013-06-10
864604   2013-06-09
864605   2013-06-08
864606   2013-06-07
864607   2013-06-06
864608   2013-06-05
864609   2013-06-04
864610   2013-06-03
864611   2013-06-02
864612   2013-06-01
864613   2013-05-31
864614   2013-05-30
864615   2013-05-29


In [43]:
WorkHol = data.loc[(data["Date"].dt.month == 5) & (data["Date"]\
        .dt.day == 1),:].drop_duplicates("Store").reset_index(drop=True)

In [49]:
MayDay = data.loc[(data["Date"].dt.month == 1) & (data["Date"].dt.day == 1),:]

NewYear = data.loc[(data["Date"].dt.month == 1) & (data["Date"].dt.day == 1),:]

Germany = data.loc[(data["Date"].dt.month == 10) & (data["Date"].dt.day == 3),:]

Bulgaria = data.loc[(data["Date"].dt.month == 9) & (data["Date"].dt.day == 23),:]

MD = data.loc[(data["Date"].dt.month == 1) & (data["Date"].dt.day == 8),:]

Romania = data.loc[(data["Date"].dt.month == 1) & (data["Date"].dt.day == 24),:]

Estland = data.loc[(data["Date"].dt.month == 2) & (data["Date"].dt.day == 24),:]

Finland = data.loc[(data["Date"].dt.month == 2) & (data["Date"].dt.day == 2),:]

SRB = data.loc[(data["Date"].dt.month == 2) & (data["Date"].dt.day == 15),:]

AND = data.loc[(data["Date"].dt.month == 3) & (data["Date"].dt.day == 14),:]

BIH = data.loc[(data["Date"].dt.month == 3) & (data["Date"].dt.day == 1),:]

GB = data.loc[(data["Date"].dt.month == 3) & (data["Date"].dt.day == 18),:]

GR = data.loc[(data["Date"].dt.month == 25) & (data["Date"].dt.day == 2),:]

Berlin = data.loc[(data["Date"].dt.month == 3) & (data["Date"].dt.day == 8),:]

Thueringen = data.loc[(data["Date"].dt.month == 9) & (data["Date"].dt.day == 20),:]

SouthGermany = data.loc[(data["Date"].dt.month == 11) & (data["Date"].dt.day == 1),:]

NorthGermany = data.loc[(data["Date"].dt.month == 10) & (data["Date"].dt.day == 31),:]

1+2

3

ValueError: Cannot index with multidimensional key

0         1
1         1
2         1
3         1
4         1
5         1
6         1
7         1
8         1
9         1
10        1
11        1
12        1
13        1
14        1
15        1
16        1
17        1
18        1
19        1
20        1
21        1
22        1
23        1
24        1
25        1
26        1
27        1
28        1
29        1
         ..
864597    1
864598    1
864599    1
864600    1
864601    1
864602    1
864603    1
864604    1
864605    1
864606    1
864607    1
864608    1
864609    1
864610    1
864611    1
864612    1
864613    1
864614    0
864615    1
864616    1
864617    1
864618    1
864619    1
864620    1
864621    1
864622    1
864623    1
864624    0
864625    1
864626    1
Name: StateHoliday_0, Length: 864627, dtype: uint8