In [21]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import datasets, linear_model,preprocessing
from datetime import datetime
import gc
%matplotlib inline
from IPython.display import display, HTML
from pprint import pprint
import time
import nltk.corpus
from nltk.stem.snowball import SnowballStemmer
from ml_metrics import rmse,rmsle

In [6]:
#take 1 CSV, then split it to 3..
class FeatureEngineering:
    '''Loads the train / validation frames and derives grouped demand features.

    Up to three DataFrames are kept as attributes once a Read*/Split* method has run:
    ``train``, ``test1`` (rows whose ``Semana`` equals ``ValidationStart``) and
    ``test2`` (rows whose ``Semana`` equals ``ValidationEnd``).

    Methods taking ``trainOrTestOrBoth`` expect 'train', 'test' or 'both'; the value
    selects which of the frames are touched.
    '''

    def __init__(self, ValidationStart, ValidationEnd, trainHdfPath, trainHdfFile, testHdfPath1, testHdfPath2, testHdfFile,
                 testTypes, trainTypes, trainCsvPath, testCsvPath, maxLag=0):
        '''Stores the configuration only; no data is read here.

        ValidationStart, ValidationEnd -- Semana values that select test1 / test2.
        trainHdfPath, trainHdfFile     -- HDF5 file path and key of the train frame.
        testHdfPath1, testHdfPath2     -- HDF5 file paths of test1 and test2.
        testHdfFile                    -- HDF5 key used in both test files.
        testTypes, trainTypes          -- {column: dtype} dicts; their keys also select the CSV columns read.
        trainCsvPath, testCsvPath      -- CSV sources.
        maxLag                         -- largest lag seen so far; raised by AddConfigurableFeaturesToTrain.
        '''
        self.ValidationStart = ValidationStart
        self.ValidationEnd = ValidationEnd
        self.maxLag = maxLag
        self.trainHdfPath = trainHdfPath
        self.trainHdfFile = trainHdfFile
        self.testHdfPath1 = testHdfPath1
        self.testHdfPath2 = testHdfPath2
        self.testHdfFile = testHdfFile
        self.testTypes = testTypes
        self.trainTypes = trainTypes
        self.trainCsvPath = trainCsvPath
        self.testCsvPath = testCsvPath

    @staticmethod
    def _wantsTrain(trainOrTestOrBoth):
        '''True when the selector asks for the train frame.'''
        return trainOrTestOrBoth in ('train', 'both')

    @staticmethod
    def _wantsTest(trainOrTestOrBoth):
        '''True when the selector asks for the test1/test2 frames.'''
        return trainOrTestOrBoth in ('test', 'both')

    @staticmethod
    def __printDataFrameBasics__(data):
        '''Displays the first two rows of ``data`` and prints its ``info()`` summary.'''
        display(data.head(2))
        gc.collect()
        # DataFrame.info() prints by itself and returns None, so it must not be wrapped in print().
        data.info(memory_usage=True)

    @staticmethod
    def changeIndexTypeToLowerMemory(data):
        '''Currently only triggers a garbage collection.

        The index downcast this method is named after is disabled; the original
        (commented-out) steps are kept below for reference.
        '''
        #data.reset_index(inplace=True)
        #data.drop("index",axis=1, inplace=True)
        #data.index = data.index.astype('uint32')
        gc.collect()

    def ReadHdf(self, trainOrTestOrBoth):
        '''Reads the selected frames from their HDF5 files into ``train`` / ``test1`` / ``test2``.'''
        if self._wantsTrain(trainOrTestOrBoth):
            self.train = pd.read_hdf(self.trainHdfPath, self.trainHdfFile)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.train)
            FeatureEngineering.__printDataFrameBasics__(self.train)

        if self._wantsTest(trainOrTestOrBoth):
            self.test1 = pd.read_hdf(self.testHdfPath1, self.testHdfFile)
            self.test2 = pd.read_hdf(self.testHdfPath2, self.testHdfFile)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test1)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test2)
            FeatureEngineering.__printDataFrameBasics__(self.test1)
            FeatureEngineering.__printDataFrameBasics__(self.test2)

    def _readCsv(self, trainOrTestOrBoth, nrows=None):
        '''Shared CSV loader behind ReadCsv and ReadFirstNRowsOfACsv.

        ``nrows=None`` reads whole files. Otherwise ``nrows`` rows are read from the
        train CSV and ``2 * nrows`` from the test CSV, which is then split by Semana
        into test1 (== ValidationStart) and test2 (== ValidationEnd).
        '''
        if self._wantsTrain(trainOrTestOrBoth):
            # list(...) so that a Python 3 dict view is also accepted as usecols.
            self.train = pd.read_csv(self.trainCsvPath, usecols=list(self.trainTypes.keys()),
                                     dtype=self.trainTypes, nrows=nrows)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.train)
            FeatureEngineering.__printDataFrameBasics__(self.train)

        if self._wantsTest(trainOrTestOrBoth):
            testRows = None if nrows is None else nrows * 2
            tempTest = pd.read_csv(self.testCsvPath, usecols=list(self.testTypes.keys()),
                                   dtype=self.testTypes, nrows=testRows)
            weeks = tempTest.Semana.values
            self.test1 = tempTest.loc[weeks == self.ValidationStart]
            self.test2 = tempTest.loc[weeks == self.ValidationEnd]
            del tempTest
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test1)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test2)
            FeatureEngineering.__printDataFrameBasics__(self.test1)
            FeatureEngineering.__printDataFrameBasics__(self.test2)

    def ReadCsv(self, trainOrTestOrBoth):
        '''Reads the selected frames from the CSV files and holds them on the object.'''
        self._readCsv(trainOrTestOrBoth)

    @staticmethod
    def ConvertCsvToHdf(csvPath, HdfPath, HdfName, ColumnTypeDict ):
        '''Reads the columns named in ``ColumnTypeDict`` from ``csvPath`` and writes them to
        ``HdfPath`` under key ``HdfName`` in table format.'''
        # `index` is not a pd.read_csv keyword (passing it raises TypeError); it is a to_hdf
        # option, so it is passed there, the same way SaveDataFrameToHdf does.
        tempDf = pd.read_csv(csvPath, usecols=list(ColumnTypeDict.keys()), dtype=ColumnTypeDict)
        tempDf.to_hdf(HdfPath, HdfName, format='t', index=False)
        del tempDf
        gc.collect()
        print("ConvertCsvToHdf is done..")

    def Preprocess(self, trainOrTestOrBoth, columnFunctionTypeList):
        '''Applies a function to whole columns and casts the result.

        columnFunctionTypeList = [ ['C1',Func1,Type], ['C2',Func2,Type],..    ]
        Each ``Func`` receives the column's values as one 1-D array and its result is
        cast with ``astype(Type)`` before it replaces the column.
        '''
        for column, func, localType in columnFunctionTypeList:
            # Operate on this instance (not on a notebook global) and replace the column
            # outright so that the requested dtype is the dtype that ends up in the frame.
            if self._wantsTrain(trainOrTestOrBoth):
                self.train[column] = np.apply_along_axis(func, 0, self.train[column].values).astype(localType)
            if self._wantsTest(trainOrTestOrBoth):
                self.test1[column] = np.apply_along_axis(func, 0, self.test1[column].values).astype(localType)
                self.test2[column] = np.apply_along_axis(func, 0, self.test2[column].values).astype(localType)
        gc.collect()

    def SaveDataFrameToHdf(self,trainOrTestOrBoth):
        '''Writes the selected frames to their configured HDF5 paths in table format.'''
        # index is given as the boolean False, not as the (truthy) string "False".
        if self._wantsTrain(trainOrTestOrBoth):
            self.train.to_hdf(self.trainHdfPath, self.trainHdfFile, format='t', index=False)
        if self._wantsTest(trainOrTestOrBoth):
            self.test1.to_hdf(self.testHdfPath1, self.testHdfFile, format='t', index=False)
            self.test2.to_hdf(self.testHdfPath2, self.testHdfFile, format='t', index=False)

    def AddDemandaGeneralMean(self,trainOrTestOrBoth):
        '''Adds a constant ``DemandaGeneralMean`` column to the selected frames.

        The value is always the float32 mean of ``train["Demanda_uni_equil"]``,
        including when only the test frames are selected.
        '''
        meanOfDemanda = self.train["Demanda_uni_equil"].values.mean().astype("float32")

        if self._wantsTrain(trainOrTestOrBoth):
            self.train.loc[:,"DemandaGeneralMean"] = meanOfDemanda
        if self._wantsTest(trainOrTestOrBoth):
            self.test1.loc[:,"DemandaGeneralMean"] = meanOfDemanda
            self.test2.loc[:,"DemandaGeneralMean"] = meanOfDemanda
        gc.collect()

    @staticmethod
    def _fillMissingTarget(frame, targetVariable, name):
        '''Copies ``frame[name]`` into ``frame[targetVariable]`` on rows where the target is null.'''
        missing = pd.isnull(frame[targetVariable].values)
        frame.loc[missing, targetVariable] = frame.loc[missing, name].values

    def AddConfigurableFeaturesToTrain(self, config):
        '''Builds the grouped features described by ``config`` and merges them into the frames.

        ``config`` must expose ``lag``, ``nameAndGroups``, ``targetVariable`` and
        ``deleteColumns``, e.g. ConfigElements(0,[ ("A",["Semana","Agencia_ID"],["count","count"]), ...]).

        For every (name, groups, aggregate) entry whose ``name`` is not yet a train column:
          * ``Demanda_uni_equil`` of the train rows with Semana <= ValidationEnd - lag
            (Semana shifted forward by ``lag``) is grouped by ``groups`` and aggregated
            with ``aggregate[0]``;
          * if ``aggregate`` has a second entry, the "Semana" column is dropped, the
            result is re-grouped by ``groups[1:]`` and aggregated with ``aggregate[1]``;
          * the result is left-merged into train and test1, and into test2 unless lag == 1.
        When ``targetVariable`` is non-empty, that column is created as NaN if missing and
        its still-null rows are filled from each feature in turn; with ``deleteColumns``
        the feature column is dropped afterwards (also when it already existed).
        '''
        if config.lag > self.maxLag:
            self.maxLag = config.lag

        tempData = self.train[self.train['Semana'].values <= (self.ValidationEnd - config.lag)]
        if config.lag != 0:
            tempData.loc[:,'Semana'] = tempData['Semana'].values + config.lag

        # NOTE(review): test2 is skipped whenever lag == 1; the reason is not visible here - confirm.
        usesTest2 = config.lag != 1
        fillsTarget = config.targetVariable != ""

        # Create the target column once; later entries keep filling whatever is still NaN.
        if fillsTarget and config.targetVariable not in self.train.columns:
            self.train.loc[:,config.targetVariable] = np.nan
            self.test1.loc[:,config.targetVariable] = np.nan
            if usesTest2:
                self.test2.loc[:,config.targetVariable] = np.nan

        for name,groups,aggregate in config.nameAndGroups:
            if name not in self.train.columns:
                print("{} is not in columns..".format(name))

                groupedDataframe = tempData[groups+['Demanda_uni_equil']].copy().groupby(groups).agg(aggregate[0])
                gc.collect()
                groupedDataframe.columns = [name]

                # Second aggregation step: collapse the per-week values over the remaining keys.
                if len(aggregate) > 1:
                    groupedDataframe.reset_index(inplace=True)
                    groupedDataframe.drop("Semana",axis=1, inplace=True)
                    groups = groups[1:]
                    groupedDataframe = groupedDataframe.groupby(groups).agg(aggregate[1])
                    groupedDataframe.columns = [name]
                    gc.collect()

                display(groupedDataframe.head(2))
                self.train = self.train.merge( groupedDataframe, left_on=groups,
                    right_index=True, how='left', sort=False,copy=False)
                gc.collect()
                self.test1 = self.test1.merge( groupedDataframe, left_on=groups,
                    right_index=True, how='left', sort=False,copy=False)
                gc.collect()
                if usesTest2:
                    self.test2 = self.test2.merge( groupedDataframe, left_on=groups,
                        right_index=True, how='left', sort=False,copy=False)

                del groupedDataframe
                gc.collect()
            else:
                print("{} is in columns..".format(name))

            display(self.train.head(2))
            display(self.test1.head(2))
            display(self.test2.head(2))

            if fillsTarget:
                FeatureEngineering._fillMissingTarget(self.train, config.targetVariable, name)
                FeatureEngineering._fillMissingTarget(self.test1, config.targetVariable, name)
                if usesTest2:
                    FeatureEngineering._fillMissingTarget(self.test2, config.targetVariable, name)

                count = self.test1[config.targetVariable].isnull().sum()
                print("Count of missing numbers after {} in validation part 1 in column {} is {}".format(name,
                    config.targetVariable,str(count)))
                if usesTest2:
                    count = self.test2.loc[:,config.targetVariable].isnull().sum()
                    print("Count of missing numbers after {} in validation part 2 in column {} is {}".format(name,
                        config.targetVariable,str(count)))

                # If the column was already in the frame and we only wanted to fill the target,
                # this still deletes that column.
                if config.deleteColumns:
                    self.train.drop(name, axis=1, inplace=True)
                    self.test1.drop(name, axis=1, inplace=True)
                    if usesTest2:
                        self.test2.drop(name, axis=1, inplace=True)
                gc.collect()
        del tempData
        display(self.train.head(2))
        display(self.test1.head(2))
        display(self.test2.head(2))
        gc.collect()
        return

    def DeleteLaggedWeeksFromTrain(self, firstWeek=3):
        '''Keeps only the train rows with Semana >= firstWeek + maxLag.

        firstWeek -- presumably the first Semana present in the data (3 was hard-coded
                     before) - confirm against the dataset.
        '''
        self.train = self.train[self.train['Semana'].values >= (firstWeek + self.maxLag)]
        gc.collect()
        display(self.train.head(2))

    def ReadFirstNRowsOfACsv(self, nrows, trainOrTestOrBoth) :
        '''Like ReadCsv, but reads only ``nrows`` train rows and ``2 * nrows`` test rows.'''
        self._readCsv(trainOrTestOrBoth, nrows=nrows)

    #Use when concatanating train and validation before predict test for example..
    def AppendTestToTrain(self,deleteTest = True):
        '''Appends test1, and test2 when it exists, to train with a fresh 0..n-1 index.

        deleteTest -- when true, each test frame is deleted from the object once appended.
        A missing ``test2`` is skipped; a missing ``test1`` raises AttributeError.
        '''
        self.train = pd.concat([self.train, self.test1], ignore_index=True)
        gc.collect()
        if deleteTest:
            del self.test1
            gc.collect()
        # Only the "no test2 on this object" case is tolerated; real errors are not swallowed.
        if hasattr(self, 'test2'):
            self.train = pd.concat([self.train, self.test2], ignore_index=True)
            gc.collect()
            if deleteTest:
                del self.test2
                gc.collect()

    #Reaches 3x memory from train, because of test1, test2 and train itself at the end.. GC fixed in the end..
    def SplitTrainToTestUsingValidationStart(self):
        '''Splits ``train`` by Semana: == ValidationStart goes to test1, == ValidationEnd
        goes to test2, and train keeps only the rows with Semana < ValidationStart.'''
        weeks = self.train.Semana.values
        self.test1 = self.train[weeks == self.ValidationStart]
        self.test2 = self.train[weeks == self.ValidationEnd]
        # Use this instance's own frame; the notebook-level variable FE must not be required.
        self.train = self.train[weeks < self.ValidationStart]
        del weeks
        gc.collect()

In [7]:
# Constructor arguments for FeatureEngineering: validation weeks, HDF5/CSV locations
# and the per-column dtypes used when reading the CSV files.
parameterDict = {
    "ValidationStart": 8,
    "ValidationEnd": 9,
    "maxLag": 3,
    "trainHdfPath": '../../input/train.h5',
    "trainHdfFile": "train",
    "testHdfPath1": "../../input/test1.h5",
    "testHdfPath2": "../../input/test2.h5",
    "testHdfFile": "test",
    "trainTypes": {'Semana': np.uint8, 'Agencia_ID': np.uint16, 'Canal_ID': np.uint8, 'Ruta_SAK': np.uint16,
                   'Cliente_ID': np.uint32, 'Producto_ID': np.uint16, 'Venta_uni_hoy': np.uint16,
                   'Venta_hoy': np.float32, 'Dev_uni_proxima': np.uint32, 'Dev_proxima': np.float32,
                   'Demanda_uni_equil': np.uint32},
    "testTypes": {'id': np.uint32, 'Semana': np.uint8, 'Agencia_ID': np.uint16, 'Canal_ID': np.uint8,
                  'Ruta_SAK': np.uint16, 'Cliente_ID': np.uint32, 'Producto_ID': np.uint16},
    "trainCsvPath": '../../input/train.csv',
    "testCsvPath": '../../input/test.csv',
}

FE = FeatureEngineering(**parameterDict)
# Function-call form works as a Python 2 statement and as the Python 3 function.
print(FE.__dict__)

{'trainCsvPath': '../../input/train.csv', 'maxLag': 3, 'testTypes': {'Cliente_ID': <type 'numpy.uint32'>, 'Ruta_SAK': <type 'numpy.uint16'>, 'Canal_ID': <type 'numpy.uint8'>, 'Producto_ID': <type 'numpy.uint16'>, 'Agencia_ID': <type 'numpy.uint16'>, 'Semana': <type 'numpy.uint8'>, 'id': <type 'numpy.uint32'>}, 'testHdfFile': 'test', 'trainTypes': {'Dev_proxima': <type 'numpy.float32'>, 'Venta_uni_hoy': <type 'numpy.uint16'>, 'Cliente_ID': <type 'numpy.uint32'>, 'Demanda_uni_equil': <type 'numpy.uint32'>, 'Ruta_SAK': <type 'numpy.uint16'>, 'Canal_ID': <type 'numpy.uint8'>, 'Venta_hoy': <type 'numpy.float32'>, 'Producto_ID': <type 'numpy.uint16'>, 'Agencia_ID': <type 'numpy.uint16'>, 'Dev_uni_proxima': <type 'numpy.uint32'>, 'Semana': <type 'numpy.uint8'>}, 'testHdfPath1': '../../input/test1.h5', 'ValidationEnd': 9, 'testHdfPath2': '../../input/test2.h5', 'testCsvPath': '../../input/test.csv', 'ValidationStart': 8, 'trainHdfFile': 'train', 'trainHdfPath': '../../input/train.h5'}


In [8]:
# Load train, test1 and test2 from their HDF5 files; ReadHdf also displays
# the first two rows and the .info() summary of each frame.
FE.ReadHdf('both')

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,3,False,120,2,709,4,2008,16
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,4,False,135,2,712,4,2008,16


<class 'pandas.core.frame.DataFrame'>
Int64Index: 52449630 entries, 0 to 52449629
Data columns (total 18 columns):
Semana                                          uint8
Agencia_ID                                      uint16
Canal_ID                                        uint8
Ruta_SAK                                        uint16
Cliente_ID                                      uint32
Producto_ID                                     uint16
Venta_uni_hoy                                   uint16
Venta_hoy                                       float32
Dev_uni_proxima                                 uint32
Dev_proxima                                     float32
Demanda_uni_equil                               uint32
DemandaNotEqualTheDifferenceOfVentaUniAndDev    bool
weight                                          uint16
pieces                                          uint8
Prod_name_ID                                    uint16
Brand_ID                                        uint8
Town_ID  

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10193576 entries, 52449630 to 62643205
Data columns (total 18 columns):
Semana                                          uint8
Agencia_ID                                      uint16
Canal_ID                                        uint8
Ruta_SAK                                        uint16
Cliente_ID                                      uint32
Producto_ID                                     uint16
Venta_uni_hoy                                   uint16
Venta_hoy                                       float32
Dev_uni_proxima                                 uint32
Dev_proxima                                     float32
Demanda_uni_equil                               uint32
DemandaNotEqualTheDifferenceOfVentaUniAndDev    bool
weight                                          uint16
pieces                                          uint8
Prod_name_ID                                    uint16
Brand_ID                                        uint8
To

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10200437 entries, 62643206 to 72843642
Data columns (total 18 columns):
Semana                                          uint8
Agencia_ID                                      uint16
Canal_ID                                        uint8
Ruta_SAK                                        uint16
Cliente_ID                                      uint32
Producto_ID                                     uint16
Venta_uni_hoy                                   uint16
Venta_hoy                                       float32
Dev_uni_proxima                                 uint32
Dev_proxima                                     float32
Demanda_uni_equil                               uint32
DemandaNotEqualTheDifferenceOfVentaUniAndDev    bool
weight                                          uint16
pieces                                          uint8
Prod_name_ID                                    uint16
Brand_ID                                        uint8
To

## Convert the target to log scale (log1p)

In [9]:
# Overwrite the train target with log1p(target).
# NOTE(review): the column is replaced in place, so re-running this cell applies log1p a second time;
# only FE.train is transformed here - FE.test1 / FE.test2 keep the raw Demanda_uni_equil values.
FE.train.loc[:,"Demanda_uni_equil"] = np.log1p (FE.train["Demanda_uni_equil"].values)

In [10]:
# Show the first two train rows to check the transformed Demanda_uni_equil values.
FE.train.head(2)

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16


## Grouping for Lag0


In [12]:
class ConfigElements:
    '''Plain settings holder handed to FeatureEngineering.AddConfigurableFeaturesToTrain.

    lag            -- number of weeks the grouped data is shifted by.
    nameAndGroups  -- list of (feature name, group-by columns, aggregate names) entries.
    targetVariable -- column whose missing values are filled from each feature in turn;
                      "" means no filling, the features are simply kept in the frame.
    deleteColumns  -- whether a feature column is dropped after it filled the target.
    '''
    def __init__(self, lag, nameAndGroups, targetVariable="", deleteColumns = False):
        self.lag, self.nameAndGroups = lag, nameAndGroups
        # With a target variable the features are consumed one after another to fill its NaNs;
        # without one they all stay in the DataFrame.
        self.targetVariable, self.deleteColumns = targetVariable, deleteColumns

In [13]:
def _lag0Feature(name, *extraColumns):
    '''Builds one nameAndGroups entry: group by Semana, Producto_ID and ``extraColumns``,
    aggregate with "mean" first and then "sum".'''
    return (name, ["Semana", "Producto_ID"] + list(extraColumns), ["mean", "sum"])


# Features are listed from the most specific grouping to the most general one; with a
# target variable set, each one only fills the rows the previous ones left missing.
# NOTE(review): despite "DeleteColumnsFalse" in the name, deleteColumns is passed as True
# below; the name is kept because a later cell refers to it.
configLag0Target1DeleteColumnsFalse = ConfigElements(0, [
    _lag0Feature("SPClRACh0_mean_sum", "Cliente_ID", "Ruta_SAK", "Agencia_ID", "Canal_ID"),
    _lag0Feature("SPClRA0_mean_sum", "Cliente_ID", "Ruta_SAK", "Agencia_ID"),
    _lag0Feature("SPClRCh0_mean_sum", "Cliente_ID", "Ruta_SAK", "Canal_ID"),
    _lag0Feature("SPClACh0_mean_sum", "Cliente_ID", "Agencia_ID", "Canal_ID"),
    _lag0Feature("SPClR0_mean_sum", "Cliente_ID", "Ruta_SAK"),
    _lag0Feature("SPClA0_mean_sum", "Cliente_ID", "Agencia_ID"),
    _lag0Feature("SPClCh0_mean_sum", "Cliente_ID", "Canal_ID"),
    _lag0Feature("SPCl0_mean_sum", "Cliente_ID"),
    _lag0Feature("SPR0_mean_sum", "Ruta_SAK"),
    _lag0Feature("SPA0_mean_sum", "Agencia_ID"),
    _lag0Feature("SPCh0_mean_sum", "Canal_ID"),
    _lag0Feature("SPT0_mean_sum", "Town_ID"),
    _lag0Feature("SPSt0_mean_sum", "State_ID"),
    _lag0Feature("SP0_mean_sum"),
], "Lag0", True)
# Function-call form works as a Python 2 statement and as the Python 3 function.
print(configLag0Target1DeleteColumnsFalse.__dict__)

{'nameAndGroups': [('SPClRACh0_mean_sum', ['Semana', 'Producto_ID', 'Cliente_ID', 'Ruta_SAK', 'Agencia_ID', 'Canal_ID'], ['mean', 'sum']), ('SPClRA0_mean_sum', ['Semana', 'Producto_ID', 'Cliente_ID', 'Ruta_SAK', 'Agencia_ID'], ['mean', 'sum']), ('SPClRCh0_mean_sum', ['Semana', 'Producto_ID', 'Cliente_ID', 'Ruta_SAK', 'Canal_ID'], ['mean', 'sum']), ('SPClACh0_mean_sum', ['Semana', 'Producto_ID', 'Cliente_ID', 'Agencia_ID', 'Canal_ID'], ['mean', 'sum']), ('SPClR0_mean_sum', ['Semana', 'Producto_ID', 'Cliente_ID', 'Ruta_SAK'], ['mean', 'sum']), ('SPClA0_mean_sum', ['Semana', 'Producto_ID', 'Cliente_ID', 'Agencia_ID'], ['mean', 'sum']), ('SPClCh0_mean_sum', ['Semana', 'Producto_ID', 'Cliente_ID', 'Canal_ID'], ['mean', 'sum']), ('SPCl0_mean_sum', ['Semana', 'Producto_ID', 'Cliente_ID'], ['mean', 'sum']), ('SPR0_mean_sum', ['Semana', 'Producto_ID', 'Ruta_SAK'], ['mean', 'sum']), ('SPA0_mean_sum', ['Semana', 'Producto_ID', 'Agencia_ID'], ['mean', 'sum']), ('SPCh0_mean_sum', ['Semana', 'Produc

In [14]:
# Print the train frame's column/dtype summary before the grouped features are added.
FE.train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52449630 entries, 0 to 52449629
Data columns (total 18 columns):
Semana                                          uint8
Agencia_ID                                      uint16
Canal_ID                                        uint8
Ruta_SAK                                        uint16
Cliente_ID                                      uint32
Producto_ID                                     uint16
Venta_uni_hoy                                   uint16
Venta_hoy                                       float32
Dev_uni_proxima                                 uint32
Dev_proxima                                     float32
Demanda_uni_equil                               float64
DemandaNotEqualTheDifferenceOfVentaUniAndDev    bool
weight                                          uint16
pieces                                          uint8
Prod_name_ID                                    uint16
Brand_ID                                        uint8
Town_ID 

In [15]:
# Build every grouped feature of the lag-0 config, merge it into train/test1/test2 and use it
# to fill the still-missing values of the "Lag0" column; progress is displayed per feature.
FE.AddConfigurableFeaturesToTrain(configLag0Target1DeleteColumnsFalse)

SPClRACh0_mean_sum is not in columns..


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,SPClRACh0_mean_sum
Producto_ID,Cliente_ID,Ruta_SAK,Agencia_ID,Canal_ID,Unnamed: 5_level_1
41,146030,3303,2281,7,8.373554
41,681747,3306,2281,7,36.820888


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClRACh0_mean_sum
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,,4.787492
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClRACh0_mean_sum
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16,,4.787492
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16,,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClRACh0_mean_sum
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16,,4.787492
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16,,5.886104


Count of missing numbers after SPClRACh0_mean_sum in validation part 1 in column Lag0 is 2008765
Count of missing numbers after SPClRACh0_mean_sum in validation part 2 in column Lag0 is 2728164
SPClRA0_mean_sum is not in columns..


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,SPClRA0_mean_sum
Producto_ID,Cliente_ID,Ruta_SAK,Agencia_ID,Unnamed: 4_level_1
41,146030,3303,2281,8.373554
41,681747,3306,2281,36.820888


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClRA0_mean_sum
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,4.787492,4.787492
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,5.886104,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClRA0_mean_sum
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16,4.787492,4.787492
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16,5.886104,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClRA0_mean_sum
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16,4.787492,4.787492
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16,5.886104,5.886104


Count of missing numbers after SPClRA0_mean_sum in validation part 1 in column Lag0 is 2008400
Count of missing numbers after SPClRA0_mean_sum in validation part 2 in column Lag0 is 2727510
SPClRCh0_mean_sum is not in columns..


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,SPClRCh0_mean_sum
Producto_ID,Cliente_ID,Ruta_SAK,Canal_ID,Unnamed: 4_level_1
41,146030,3303,7,8.373554
41,681747,3306,7,36.820888


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClRCh0_mean_sum
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,4.787492,4.787492
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,5.886104,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClRCh0_mean_sum
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16,4.787492,4.787492
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16,5.886104,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClRCh0_mean_sum
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16,4.787492,4.787492
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16,5.886104,5.886104


Count of missing numbers after SPClRCh0_mean_sum in validation part 1 in column Lag0 is 2007825
Count of missing numbers after SPClRCh0_mean_sum in validation part 2 in column Lag0 is 2726789
SPClACh0_mean_sum is not in columns..


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,SPClACh0_mean_sum
Producto_ID,Cliente_ID,Agencia_ID,Canal_ID,Unnamed: 4_level_1
41,146030,2281,7,8.373554
41,681747,2281,7,36.820888


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClACh0_mean_sum
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,4.787492,4.787492
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,5.886104,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClACh0_mean_sum
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16,4.787492,4.787492
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16,5.886104,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClACh0_mean_sum
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16,4.787492,4.787492
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16,5.886104,5.886104


Count of missing numbers after SPClACh0_mean_sum in validation part 1 in column Lag0 is 1914973
Count of missing numbers after SPClACh0_mean_sum in validation part 2 in column Lag0 is 2576300
SPClR0_mean_sum is not in columns..


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SPClR0_mean_sum
Producto_ID,Cliente_ID,Ruta_SAK,Unnamed: 3_level_1
41,146030,3303,8.373554
41,681747,3306,36.820888


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClR0_mean_sum
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,4.787492,4.787492
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,5.886104,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClR0_mean_sum
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16,4.787492,4.787492
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16,5.886104,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClR0_mean_sum
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16,4.787492,4.787492
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16,5.886104,5.886104


Count of missing numbers after SPClR0_mean_sum in validation part 1 in column Lag0 is 1914973
Count of missing numbers after SPClR0_mean_sum in validation part 2 in column Lag0 is 2576300
SPClA0_mean_sum is not in columns..


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SPClA0_mean_sum
Producto_ID,Cliente_ID,Agencia_ID,Unnamed: 3_level_1
41,146030,2281,8.373554
41,681747,2281,36.820888


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClA0_mean_sum
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,4.787492,4.787492
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,5.886104,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClA0_mean_sum
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16,4.787492,4.787492
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16,5.886104,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClA0_mean_sum
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16,4.787492,4.787492
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16,5.886104,5.886104


Count of missing numbers after SPClA0_mean_sum in validation part 1 in column Lag0 is 1914699
Count of missing numbers after SPClA0_mean_sum in validation part 2 in column Lag0 is 2571322
SPClCh0_mean_sum is not in columns..


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SPClCh0_mean_sum
Producto_ID,Cliente_ID,Canal_ID,Unnamed: 3_level_1
41,146030,7,8.373554
41,681747,7,36.820888


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClCh0_mean_sum
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,4.787492,4.787492
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,5.886104,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClCh0_mean_sum
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16,4.787492,4.787492
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16,5.886104,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPClCh0_mean_sum
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16,4.787492,4.787492
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16,5.886104,5.886104


Count of missing numbers after SPClCh0_mean_sum in validation part 1 in column Lag0 is 1913356
Count of missing numbers after SPClCh0_mean_sum in validation part 2 in column Lag0 is 2569564
SPCl0_mean_sum is not in columns..


Unnamed: 0_level_0,Unnamed: 1_level_0,SPCl0_mean_sum
Producto_ID,Cliente_ID,Unnamed: 2_level_1
41,146030,8.373554
41,681747,36.820888


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPCl0_mean_sum
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,4.787492,4.787492
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,5.886104,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPCl0_mean_sum
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16,4.787492,4.787492
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16,5.886104,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPCl0_mean_sum
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16,4.787492,4.787492
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16,5.886104,5.886104


Count of missing numbers after SPCl0_mean_sum in validation part 1 in column Lag0 is 1912970
Count of missing numbers after SPCl0_mean_sum in validation part 2 in column Lag0 is 2568823
SPR0_mean_sum is not in columns..


Unnamed: 0_level_0,Unnamed: 1_level_0,SPR0_mean_sum
Producto_ID,Ruta_SAK,Unnamed: 2_level_1
41,3201,11.447204
41,3301,5.118889


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPR0_mean_sum
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,4.787492,7.892148
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,5.886104,8.386266


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPR0_mean_sum
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16,4.787492,7.892148
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16,5.886104,8.386266


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPR0_mean_sum
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16,4.787492,7.892148
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16,5.886104,7.789902


Count of missing numbers after SPR0_mean_sum in validation part 1 in column Lag0 is 37226
Count of missing numbers after SPR0_mean_sum in validation part 2 in column Lag0 is 98314
SPA0_mean_sum is not in columns..


Unnamed: 0_level_0,Unnamed: 1_level_0,SPA0_mean_sum
Producto_ID,Agencia_ID,Unnamed: 2_level_1
41,1958,10.344834
41,2278,5.118889


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPA0_mean_sum
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,4.787492,8.018035
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,5.886104,7.15314


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPA0_mean_sum
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16,4.787492,8.018035
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16,5.886104,7.15314


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPA0_mean_sum
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16,4.787492,8.018035
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16,5.886104,8.490976


Count of missing numbers after SPA0_mean_sum in validation part 1 in column Lag0 is 21985
Count of missing numbers after SPA0_mean_sum in validation part 2 in column Lag0 is 73361
SPCh0_mean_sum is not in columns..


Unnamed: 0_level_0,Unnamed: 1_level_0,SPCh0_mean_sum
Producto_ID,Canal_ID,Unnamed: 2_level_1
41,7,24.452719
53,4,27.663352


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPCh0_mean_sum
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,4.787492,7.671042
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,5.886104,7.748825


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPCh0_mean_sum
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16,4.787492,7.671042
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16,5.886104,7.748825


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPCh0_mean_sum
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16,4.787492,7.671042
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16,5.886104,8.1201


Count of missing numbers after SPCh0_mean_sum in validation part 1 in column Lag0 is 1088
Count of missing numbers after SPCh0_mean_sum in validation part 2 in column Lag0 is 8203
SPT0_mean_sum is not in columns..


Unnamed: 0_level_0,Unnamed: 1_level_0,SPT0_mean_sum
Producto_ID,Town_ID,Unnamed: 2_level_1
41,2288,10.344834
41,2381,28.224045


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPT0_mean_sum
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,4.787492,8.691791
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,5.886104,7.762298


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPT0_mean_sum
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16,4.787492,8.691791
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16,5.886104,7.762298


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPT0_mean_sum
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16,4.787492,8.691791
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16,5.886104,9.037291


Count of missing numbers after SPT0_mean_sum in validation part 1 in column Lag0 is 1046
Count of missing numbers after SPT0_mean_sum in validation part 2 in column Lag0 is 8077
SPSt0_mean_sum is not in columns..


Unnamed: 0_level_0,Unnamed: 1_level_0,SPSt0_mean_sum
Producto_ID,State_ID,Unnamed: 2_level_1
41,22,25.01424
41,25,10.344834


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPSt0_mean_sum
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,4.787492,7.079092
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,5.886104,6.439475


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPSt0_mean_sum
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16,4.787492,7.079092
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16,5.886104,6.439475


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SPSt0_mean_sum
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16,4.787492,7.079092
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16,5.886104,8.09421


Count of missing numbers after SPSt0_mean_sum in validation part 1 in column Lag0 is 1027
Count of missing numbers after SPSt0_mean_sum in validation part 2 in column Lag0 is 7869
SP0_mean_sum is not in columns..


Unnamed: 0_level_0,SP0_mean_sum
Producto_ID,Unnamed: 1_level_1
41,24.452719
53,27.663352


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SP0_mean_sum
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,4.787492,5.969932
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,5.886104,6.114813


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SP0_mean_sum
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16,4.787492,5.969932
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16,5.886104,6.114813


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,SP0_mean_sum
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16,4.787492,5.969932
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16,5.886104,6.33373


Count of missing numbers after SP0_mean_sum in validation part 1 in column Lag0 is 953
Count of missing numbers after SP0_mean_sum in validation part 2 in column Lag0 is 7584


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,4.787492
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,4,False,120,2,709,4,2008,16,4.787492
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,5,False,135,2,712,4,2008,16,5.886104


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,False,120,2,709,4,2008,16,4.787492
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,False,140,2,630,4,2008,16,5.886104


In [25]:
# Side-by-side preview of the target column and the Lag0 feature (first 40 rows).
FE.train.loc[:, ["Demanda_uni_equil", "Lag0"]].head(40)

Unnamed: 0,Demanda_uni_equil,Lag0
0,1.386294,4.787492
1,1.609438,5.886104
2,1.609438,5.886104
3,1.609438,5.886104
4,1.386294,6.356108
5,1.791759,8.776476
6,1.386294,8.679312
7,1.94591,9.441452
8,1.609438,3.218876
9,1.94591,9.149316


In [28]:
# Raw ndarray behind the Lag0 column (numpy abbreviates the repr for long arrays).
FE.train["Lag0"].values

array([ 4.78749174,  5.88610403,  5.88610403, ...,  4.39444915,
        4.39444915,  7.42714413])

In [26]:
# Derive Lag0Exp from Lag0: apply expm1, divide the result by 5, then apply log1p again.
# NOTE(review): the divisor 5 is hard-coded and its rationale is not shown in this notebook — confirm.
FE.train.loc[:, "Lag0Exp"] = np.log1p(
    np.expm1(FE.train["Lag0"].values) / 5
)

In [29]:
# Score the scaled lag feature against the target with ml_metrics.rmse.
# The label says RMSLE: the training target previews look log1p-transformed and Lag0Exp is
# built with log1p, so RMSE on these columns would be the RMSLE — TODO confirm for all rows.
# A single formatted string is printed so the output is "RMSLE Score: <value>" on both
# Python 2 and Python 3 (passing two arguments in parentheses prints a tuple on Python 2).
print('RMSLE Score: {}'.format(rmse(FE.train.Demanda_uni_equil, FE.train.Lag0Exp)))

('RMSLE Score:', 4.3788164672311503)


In [31]:
# Scratch experiment: overwrite Lag0 on the row labelled 0 with NaN.
# WARNING: this mutates FE.train in place, so every later cell sees the altered value;
# re-create FE.train before using it for real scoring or training.
FE.train.loc[0,"Lag0"] = np.nan

In [33]:
# Number of missing values currently in Lag0.
# Uses the print(...) call form (valid on Python 2 and 3) to match the rest of the notebook.
print(FE.train["Lag0"].isnull().sum())

1


In [34]:
# First five rows, to inspect the frame after the Lag0 edit.
FE.train.head(5)

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,Lag0Exp
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,1.386294,False,120,2,709,4,2008,16,,3.210844
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,1.609438,False,135,2,712,4,2008,16,5.886104,4.287716
2,3,1110,7,3301,15766,1238,4,39.32,0,0.0,1.609438,False,140,2,630,4,2008,16,5.886104,4.287716
3,3,1110,7,3301,15766,1240,4,33.52,0,0.0,1.609438,False,125,4,480,4,2008,16,5.886104,4.287716
4,3,1110,7,3301,15766,1242,3,22.92,0,0.0,1.386294,False,105,6,271,4,2008,16,6.356108,4.75359


In [42]:
# Lag0Divided = Lag0 / 5, with missing Lag0 values staying missing.
# NaN propagates through division, so no explicit not-null mask is needed; dividing the
# column directly avoids a boolean-masked copy and an index realignment on assignment.
# NOTE(review): the divisor 5 is hard-coded — rationale not shown in this notebook.
FE.train["Lag0Divided"] = FE.train["Lag0"] / 5

In [43]:
# First five rows by position, to check the newly added Lag0Divided column.
FE.train.iloc[0:5]

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,...,DemandaNotEqualTheDifferenceOfVentaUniAndDev,weight,pieces,Prod_name_ID,Brand_ID,Town_ID,State_ID,Lag0,Lag0Exp,Lag0Divided
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,...,False,120,2,709,4,2008,16,,3.210844,
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,...,False,135,2,712,4,2008,16,5.886104,4.287716,1.177221
2,3,1110,7,3301,15766,1238,4,39.32,0,0.0,...,False,140,2,630,4,2008,16,5.886104,4.287716,1.177221
3,3,1110,7,3301,15766,1240,4,33.52,0,0.0,...,False,125,4,480,4,2008,16,5.886104,4.287716,1.177221
4,3,1110,7,3301,15766,1242,3,22.92,0,0.0,...,False,105,6,271,4,2008,16,6.356108,4.75359,1.271222
