In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [3]:
# Read the three raw CSV inputs (store metadata, training rows, test rows),
# in that order, from the current working directory.
dflojas, dftreino, dfteste = [
    pd.read_csv(csv_name)
    for csv_name in ("lojas.csv", "dataset_treino.csv", "dataset_teste.csv")
]

In [4]:
# Attach the per-store columns to the train and test rows. An inner join on
# 'Store' keeps only rows whose store id appears in both frames.
dftreinamento = dftreino.merge(dflojas, how='inner', on='Store')
dfcompeticao = dfteste.merge(dflojas, how='inner', on='Store')

In [5]:
def strToInt(x):
    """Map a categorical marker to a small integer code.

    Returns 0 for ``0`` or ``'0'``, 1 for ``'a'``, 2 for ``'b'``, 3 for ``'c'``,
    and 4 for any other value (including missing values).
    """
    known_codes = ((0, 0), ('0', 0), ('a', 1), ('b', 2), ('c', 3))
    # Compare in a fixed order and stop at the first match.
    for marker, code in known_codes:
        if x == marker:
            return code
    # Everything unrecognised shares the catch-all code.
    return 4

# Replace the letter/zero markers in the categorical columns with the
# integer codes produced by strToInt (applied element by element).
for column in ('StateHoliday', 'StoreType', 'Assortment'):
    dftreinamento[column] = dftreinamento[column].apply(strToInt)

In [6]:
# Convert 'Date' to int64 nanoseconds since the Unix epoch so it can be fed
# to the model as a plain number (values end up around 1.4e18).
parsed_dates = pd.to_datetime(dftreinamento['Date'])
dftreinamento['Date'] = pd.DatetimeIndex(parsed_dates, dtype='datetime64[ns]').astype(np.int64)

In [7]:
# Show, for each distinct PromoInterval value, how many non-null entries every
# other column has. Rows whose PromoInterval is missing form no group here.
dftreinamento.groupby('PromoInterval').count()

Unnamed: 0_level_0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear
PromoInterval,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
"Feb,May,Aug,Nov",118596,118596,118596,118596,118596,118596,118596,118596,118596,118596,118596,117838,70488,70488,118596,118596,118596
"Jan,Apr,Jul,Oct",293122,293122,293122,293122,293122,293122,293122,293122,293122,293122,293122,293122,197664,197664,293122,293122,293122
"Mar,Jun,Sept,Dec",97460,97460,97460,97460,97460,97460,97460,97460,97460,97460,97460,97460,56174,56174,97460,97460,97460


In [8]:
def interval(x):
    """Encode a PromoInterval month-list string as an integer code.

    'Feb,May,Aug,Nov' -> 100, 'Jan,Apr,Jul,Oct' -> 101, 'Mar,Jun,Sept,Dec' -> 102.
    Any other value (for example a missing value) is returned unchanged.
    """
    schedules = ('Feb,May,Aug,Nov', 'Jan,Apr,Jul,Oct', 'Mar,Jun,Sept,Dec')
    # Codes are consecutive, starting at 100, in the order listed above.
    for code, months in enumerate(schedules, start=100):
        if x == months:
            return code
    return x

# Swap the month-list strings for their integer codes; values that interval()
# does not recognise (such as missing entries) pass through untouched.
dftreinamento['PromoInterval'] = dftreinamento['PromoInterval'].apply(interval)

In [9]:
# Fill the remaining gaps in each listed column with that column's own mean.
# NOTE(review): 'PromoInterval' holds the category codes 100/101/102, so its mean
# is not itself a valid category; a dedicated "none" code may be more
# appropriate. TODO confirm.
columns_to_impute = (
    'Promo2SinceWeek',
    'Promo2SinceYear',
    'PromoInterval',
    'CompetitionOpenSinceYear',
    'CompetitionOpenSinceMonth',
    'CompetitionDistance',
)
for column in columns_to_impute:
    column_mean = dftreinamento[column].mean()
    dftreinamento[column] = dftreinamento[column].fillna(column_mean)

In [10]:
# Preview the first rows of the training frame after encoding and imputation.
dftreinamento.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,1438300800000000000,5263,555,1,1,0,1,3,1,1270.0,9.0,2008.0,0,23.269093,2011.752774,100.95849
1,1,4,1438214400000000000,5020,546,1,1,0,1,3,1,1270.0,9.0,2008.0,0,23.269093,2011.752774,100.95849
2,1,3,1438128000000000000,4782,523,1,1,0,1,3,1,1270.0,9.0,2008.0,0,23.269093,2011.752774,100.95849
3,1,2,1438041600000000000,5011,560,1,1,0,1,3,1,1270.0,9.0,2008.0,0,23.269093,2011.752774,100.95849
4,1,1,1437955200000000000,6102,612,1,1,0,1,3,1,1270.0,9.0,2008.0,0,23.269093,2011.752774,100.95849


In [11]:
# List the available column names before picking features and target.
dftreinamento.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
       'CompetitionDistance', 'CompetitionOpenSinceMonth',
       'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
       'Promo2SinceYear', 'PromoInterval'],
      dtype='object')

In [12]:
# Feature matrix: every prepared column except the target. Target: 'Sales'.
# NOTE(review): 'Customers' is used as a feature; verify that it is actually
# available for the rows to be predicted.
feature_columns = [
    'Store', 'DayOfWeek', 'Date', 'Customers', 'Open', 'Promo',
    'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
    'CompetitionDistance', 'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek',
    'Promo2SinceYear', 'PromoInterval',
]
X = dftreinamento.loc[:, feature_columns]
y = dftreinamento.loc[:, 'Sales']

In [13]:
# Hold out 25% of the rows for evaluation. The shuffle is seeded so that a
# "Restart & Run All" reproduces exactly the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Stand-alone SVC with default hyper-parameters. In the cells visible here it is
# only referenced by the commented-out fit/predict cells further down.
model = SVC()

In [None]:
# Exhaustive search over C and gamma for an SVC (default RBF kernel).
#
# The features are standardised inside the pipeline because SVMs are not scale
# invariant: 'Date' is stored as nanoseconds since the epoch (~1.4e18) while
# flags such as 'Open' and 'Promo' are 0/1, so without scaling the kernel
# distance is dominated by 'Date' and the gamma grid is effectively meaningless.
# Keeping the scaler in the pipeline means it is re-fitted on the training part
# of every CV fold, so no statistics leak from the validation part.
#
# NOTE(review): 'Sales' looks like a numeric amount. SVC treats every distinct
# value as its own class, and kernel-SVM training cost grows much faster than
# linearly with the number of rows. Consider a regressor (e.g. SVR) and/or a
# subsample. TODO confirm intent.
param_grid = {
    'svc__C': [0.1, 1, 10, 100, 1000],
    'svc__gamma': [1, 0.1, 0.01, 0.001, 0.0001],
}
# cv=3 is spelled out so the fold count matches the recorded run regardless of
# the default of the installed scikit-learn version.
grid = GridSearchCV(make_pipeline(StandardScaler(), SVC()), param_grid, cv=3, verbose=3)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] C=0.1, gamma=1 ..................................................


In [None]:
#model.fit(X_train, y_train)

In [None]:
#predictionCustomer = model.predict(X_test)

In [None]:
#print(confusion_matrix(y_test, predictionCustomer))
#print('\n')
#print(classification_report(y_test, predictionCustomer))