In [1]:
import pickle
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold, train_test_split
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_iris, load_digits, load_boston
%matplotlib inline
import seaborn as sns
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from collections import OrderedDict
import gc
from sklearn import datasets, linear_model,preprocessing
from IPython.display import display, HTML
import time
import nltk.corpus
from nltk.stem.snowball import SnowballStemmer

In [2]:
#take 1 CSV, then split it to 3..
class FeatureEngineering:

    def __init__(self, ValidationStart, ValidationEnd, trainHdfPath, trainHdfFile, testHdfPath1, testHdfPath2, testHdfFile, 
                 testTypes, trainTypes, trainCsvPath, testCsvPath, maxLag=0):
        self.ValidationStart = ValidationStart
        self.ValidationEnd = ValidationEnd
        self.maxLag = maxLag
        self.trainHdfPath = trainHdfPath
        self.trainHdfFile = trainHdfFile
        self.testHdfPath1 = testHdfPath1
        self.testHdfPath2 = testHdfPath2
        self.testHdfFile = testHdfFile
        self.testTypes = testTypes
        self.trainTypes = trainTypes
        self.trainCsvPath = trainCsvPath
        self.testCsvPath = testCsvPath
        
    @staticmethod
    def __printDataFrameBasics__(data):
        display(data.head(2))
        #print data.dtypes
        gc.collect()
        print(data.info(memory_usage=True))
        
    @staticmethod    
    def changeIndexTypeToLowerMemory(data):
        ##########
        #This is very critical, i accept max number is 2^32. Also, if don't do that, memory gets so much higher..
        ##########
        #data.reset_index(inplace=True)
        #data.drop("index",axis=1, inplace=True)
        #data.index = data.index.astype('uint32')
        gc.collect()
        
    def ReadHdf(self, trainOrTestOrBoth):
        '''Reads and holds Df in object memory'''            
        if trainOrTestOrBoth == 'train' or trainOrTestOrBoth=='both':
            self.train = pd.read_hdf(self.trainHdfPath,self.trainHdfFile)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.train)
            FeatureEngineering.__printDataFrameBasics__(self.train)
            
        if trainOrTestOrBoth == 'test' or trainOrTestOrBoth=='both':
            self.test1 = pd.read_hdf(self.testHdfPath1,self.testHdfFile)
            self.test2 = pd.read_hdf(self.testHdfPath2,self.testHdfFile)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test1)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test2)
            FeatureEngineering.__printDataFrameBasics__(self.test1)
            FeatureEngineering.__printDataFrameBasics__(self.test2)
        
    def ReadCsv(self, trainOrTestOrBoth):
        '''Reads and holds Df in memory'''
        if trainOrTestOrBoth == 'train' or trainOrTestOrBoth == 'both':
            self.train = pd.read_csv(self.trainCsvPath, usecols=self.trainTypes.keys(), dtype=self.trainTypes)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.train)
            FeatureEngineering.__printDataFrameBasics__(self.train)
        if trainOrTestOrBoth == 'test' or trainOrTestOrBoth=='both':
            tempTest = pd.read_csv(self.testCsvPath, usecols=self.testTypes.keys(), dtype=self.testTypes)
            self.test1 = tempTest.loc[tempTest.Semana.values == self.ValidationStart]
            self.test2 = tempTest.loc[tempTest.Semana.values == self.ValidationEnd]
            del tempTest
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test1)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test2)
            FeatureEngineering.__printDataFrameBasics__(self.test1)
            FeatureEngineering.__printDataFrameBasics__(self.test2)
            
    @staticmethod
    def ConvertCsvToHdf(csvPath, HdfPath, HdfName, ColumnTypeDict ):
        tempDf = pd.read_csv(csvPath, usecols=ColumnTypeDict.keys(), dtype=ColumnTypeDict,index=False)
        tempDf.to_hdf(HdfPath, HdfName, format='t')
        del tempDf
        gc.collect()
        print "ConvertCsvToHdf is done.."

    def Preprocess(self, trainOrTestOrBoth, columnFunctionTypeList):
        '''columnFunctionTypeList = [ ['C1',Func1,Type], ['C2',Func2,Type],..    ]'''
        for column, func, localType in columnFunctionTypeList:
            if trainOrTestOrBoth == 'train' or trainOrTestOrBoth =='both':
                self.train.loc[:,column] =  np.apply_along_axis(func,0,FE.train[column].values).astype(localType)
                #np.apply_along_axis(lambda x: x+1,0,FE.train["Semana"]).astype("int32")
            if trainOrTestOrBoth == 'test' or trainOrTestOrBoth == 'both':
                self.test1.loc[:,column] =  np.apply_along_axis(func,0,FE.test1[column].values).astype(localType)
                self.test2.loc[:,column] =  np.apply_along_axis(func,0,FE.test2[column].values).astype(localType)
        gc.collect()
        
    def SaveDataFrameToHdf(self,trainOrTestOrBoth):
        if trainOrTestOrBoth == 'train' or trainOrTestOrBoth=='both':
            self.train.to_hdf(self.trainHdfPath, self.trainHdfFile, format='t', index="False")
        if trainOrTestOrBoth == 'test' or trainOrTestOrBoth=='both':
            self.test1.to_hdf(self.testHdfPath1, self.testHdfFile, format='t', index="False")
            self.test2.to_hdf(self.testHdfPath2, self.testHdfFile, format='t', index="False")
        
    def AddDemandaGeneralMean(self,trainOrTestOrBoth): 
        #self.train.loc[:,"DemandaGeneralMean"] = self.train["Demanda_uni_equil"].loc[
         #   self.train.loc[:,'Semana'] < 10].mean().astype("float32")
            
        meanOfDemanda = self.train["Demanda_uni_equil"].values.mean().astype("float32")
        
        if trainOrTestOrBoth == 'train' or trainOrTestOrBoth=='both':
            self.train.loc[:,"DemandaGeneralMean"] = meanOfDemanda
        if trainOrTestOrBoth == 'test' or trainOrTestOrBoth=='both':
            self.test1.loc[:,"DemandaGeneralMean"] = meanOfDemanda
            self.test2.loc[:,"DemandaGeneralMean"] = meanOfDemanda
        
        #self.train.loc[:,"DemandaGeneralMean"] = self.train["Demanda_uni_equil"].values[
        #(self.train.loc[:,'Semana'].values < self.ValidationStart).values].mean().astype("float32")
        gc.collect()
        
    '''ConfigElements(0,[ ("A",["Semana","Agencia_ID"],["count","count"]),'''
    def AddConfigurableFeaturesToTrain(self, config):
        if config.lag > self.maxLag:
            self.maxLag = config.lag
        
        tempData = self.train[self.train['Semana'].values <= (self.ValidationEnd - config.lag)]
        #display(tempData)
        if(config.lag != 0):
            tempData.loc[:,'Semana'] = tempData['Semana'].values + config.lag
        #display(tempData)
        
        #Means iterative.. eliminate as long as np.nan exists..If there is already one, don't create, use the existing
        if config.targetVariable != "" and  config.targetVariable not in self.train.columns:
            self.train.loc[:,config.targetVariable] = np.nan
            self.test1.loc[:,config.targetVariable] = np.nan
            
            if config.lag != 1:
                self.test2.loc[:,config.targetVariable] = np.nan
        
        for name,groups,aggregate in config.nameAndGroups:
            if name not in self.train.columns:
                print "{} is not in columns..".format(name)            
                
                groupedDataframe = tempData[groups+['Demanda_uni_equil']].copy().groupby(groups).agg(aggregate[0])
                gc.collect()
                #groupedDataframe.columns = groupedDataframe.columns.droplevel(0)
                groupedDataframe.columns = [name]
                
                #This is means of the counts of the semana-columns tuples!..!!!
                #If no lag and mean, mean of the columns without semana!!..
                #If there is lag and count, count of the columns x weeks before
                #If there is lag and mean, mean of the columns x weeks before
                #if(config.lag == 0 and aggregate == "count"):
                if(len(aggregate)>1):
                    groupedDataframe.reset_index(inplace=True)
                    groupedDataframe.drop("Semana",axis=1, inplace=True)
                    groups = groups[1:]
                    groupedDataframe = groupedDataframe.groupby(groups).agg(aggregate[1])
                    groupedDataframe.columns = [name]
                    gc.collect()
                
                display(groupedDataframe)
                self.train = self.train.merge( groupedDataframe, left_on=groups,
                    right_index=True, how='left', sort=False,copy=False)
                gc.collect()
                self.test1 = self.test1.merge( groupedDataframe, left_on=groups,
                    right_index=True, how='left', sort=False,copy=False)
                gc.collect()
                if config.lag != 1:
                    self.test2 = self.test2.merge( groupedDataframe, left_on=groups,
                        right_index=True, how='left', sort=False,copy=False)
                
                del groupedDataframe
                gc.collect()
            else:
                print "{} is in columns..".format(name)
            
            display(self.train)
            display(self.test1)
            display(self.test2)
            
            #Means iterative..!!!!!
            if config.targetVariable != "":
                self.train.loc[pd.isnull(self.train[config.targetVariable].values), 
                    config.targetVariable] = self.train.loc[pd.isnull(self.train[config.targetVariable].values)
                    , name].values
                self.test1.loc[pd.isnull(self.test1[config.targetVariable].values), 
                    config.targetVariable] = self.test1.loc[pd.isnull(self.test1[config.targetVariable].values),
                    name].values
                if config.lag != 1:
                    self.test2.loc[pd.isnull(self.test2[config.targetVariable].values), 
                        config.targetVariable] = self.test2.loc[pd.isnull(self.test2[config.targetVariable].values)
                        , name].values
                    
                count = self.test1[config.targetVariable].isnull().sum()
                print "Count of missing numbers after {} in validation part 1 in column {} is {}".format(name, 
                    config.targetVariable,str(count))
                if config.lag != 1:
                    count = self.test2.loc[:,config.targetVariable].isnull().sum()
                    print "Count of missing numbers after {} in validation part 2 in column {} is {}".format(name, 
                        config.targetVariable,str(count))
                
                
                #display(self.train)
                #If column is already in Dataframe and we want to fill target variable, this deletes columns!!!
                if(config.deleteColumns):
                    self.train.drop(name, axis=1, inplace=True)
                    self.test1.drop(name, axis=1, inplace=True)
                    if config.lag != 1:
                        self.test2.drop(name, axis=1, inplace=True)
                gc.collect()
                #Only in tesst
                #if count == 0:
                 #   break
        del tempData
        display(self.train)   
        display(self.test1)   
        display(self.test2)
        gc.collect()
        return 
    
    def DeleteLaggedWeeksFromTrain(self):
        self.train = self.train[self.train['Semana'].values >= (3 + self.maxLag)]
        gc.collect()
        display(self.train.head(2))
        
    def ReadFirstNRowsOfACsv(self, nrows, trainOrTestOrBoth) :
        if trainOrTestOrBoth == 'train' or trainOrTestOrBoth=='both':
            self.train = pd.read_csv(self.trainCsvPath, usecols=self.trainTypes.keys(), dtype=self.trainTypes, nrows = nrows)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.train)
            FeatureEngineering.__printDataFrameBasics__(self.train)
        if trainOrTestOrBoth == 'test' or trainOrTestOrBoth=='both':
            tempTest = pd.read_csv(self.testCsvPath, usecols=self.testTypes.keys(), dtype=self.testTypes, nrows = nrows*2)
            self.test1 = tempTest.loc[tempTest.Semana == self.ValidationStart]
            self.test2 = tempTest.loc[tempTest.Semana == self.ValidationEnd]
            del tempTest
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test1)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test2)
            FeatureEngineering.__printDataFrameBasics__(self.test1)
            FeatureEngineering.__printDataFrameBasics__(self.test2)
    
    #Use when concatanating train and validation before predict test for example..
    def AppendTestToTrain(self,deleteTest = True):
        self.train = self.train.append(self.test1,ignore_index=True)
        gc.collect()
        if(deleteTest):
            del self.test1
            gc.collect()
        try:
            self.train = self.train.append(self.test2,ignore_index=True)
            gc.collect()
            if(deleteTest):
                del self.test2
                gc.collect()
        except:
            pass
        #BAD PERFORMANCE!!
    #Split train data to train and test1 and test2 (validation)
    #def SplitTrainToTestUsingValidationStart(self):
     #   boolCondition = self.train.Semana == self.ValidationStart
      #  self.test1 = self.train.loc[boolCondition]
       # self.train.drop((self.train.loc[boolCondition].index), axis=0,inplace=True)
        
       # boolCondition = self.train.Semana == self.ValidationEnd
       # self.test2 = self.train.loc[boolCondition]
       # self.train.drop((self.train.loc[boolCondition].index), axis=0,inplace=True)
      #  del boolCondition
      #  gc.collect()
    
    #Reaches 3x memory from train, because of test1, test2 and train itself at the end.. GC fixed in the end..
    def SplitTrainToTestUsingValidationStart(self):
        boolCondition = self.train.Semana.values == self.ValidationStart
        self.test1 = self.train[boolCondition]
        boolCondition = self.train.Semana.values == self.ValidationEnd
        self.test2 = self.train[boolCondition]
        FE.train = FE.train[ FE.train.Semana.values < FE.ValidationStart ]
        del boolCondition
        gc.collect()
        
    def XgboostPredictAndSee(self, Test1OrTest2):
        self.train_y = self.train["Demanda_uni_equil"].copy()
        self.train.drop("Demanda_uni_equil",axis=1, inplace=True)
        

In [3]:
parameterDict =       {"ValidationStart":8, 
 "ValidationEnd":9,
   "maxLag":3,
    "trainHdfPath":'../../input/train.h5',
    "trainHdfFile":"train",
    "testHdfPath1":"../../input/test1.h5",
    "testHdfPath2":"../../input/test2.h5",
    "testHdfFile":"test", 
    "trainTypes" : {'Semana':np.uint8, 'Agencia_ID':np.uint16, 'Canal_ID':np.uint8,'Ruta_SAK':np.uint16, 
        'Cliente_ID':np.uint32, 'Producto_ID':np.uint16,'Venta_uni_hoy':np.uint16, 'Venta_hoy':np.float32,
                    'Dev_uni_proxima': np.uint32, 'Dev_proxima':np.float32,'Demanda_uni_equil':np.uint32}, 
    "testTypes" : {'id':np.uint32,'Semana':np.uint8, 'Agencia_ID':np.uint16, 'Canal_ID':np.uint8,'Ruta_SAK':np.uint16,
        'Cliente_ID':np.uint32, 'Producto_ID':np.uint16},
    "trainCsvPath":'../../input/train.csv'   ,
    "testCsvPath":'../../input/test.csv'}

FE = FeatureEngineering(**parameterDict)
print FE.__dict__

{'trainCsvPath': '../../input/train.csv', 'maxLag': 3, 'testTypes': {'Cliente_ID': <type 'numpy.uint32'>, 'Ruta_SAK': <type 'numpy.uint16'>, 'Canal_ID': <type 'numpy.uint8'>, 'Producto_ID': <type 'numpy.uint16'>, 'Agencia_ID': <type 'numpy.uint16'>, 'Semana': <type 'numpy.uint8'>, 'id': <type 'numpy.uint32'>}, 'testHdfFile': 'test', 'trainTypes': {'Dev_proxima': <type 'numpy.float32'>, 'Venta_uni_hoy': <type 'numpy.uint16'>, 'Cliente_ID': <type 'numpy.uint32'>, 'Demanda_uni_equil': <type 'numpy.uint32'>, 'Ruta_SAK': <type 'numpy.uint16'>, 'Canal_ID': <type 'numpy.uint8'>, 'Venta_hoy': <type 'numpy.float32'>, 'Producto_ID': <type 'numpy.uint16'>, 'Agencia_ID': <type 'numpy.uint16'>, 'Dev_uni_proxima': <type 'numpy.uint32'>, 'Semana': <type 'numpy.uint8'>}, 'testHdfPath1': '../../input/test1.h5', 'ValidationEnd': 9, 'testHdfPath2': '../../input/test2.h5', 'testCsvPath': '../../input/test.csv', 'ValidationStart': 8, 'trainHdfFile': 'train', 'trainHdfPath': '../../input/train.h5'}


In [5]:
FE.ReadHdf('both')

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,...,Town_ID,State_ID,DemandaGeneralMean,Lag0,Lag0Averaged,Demanda_uni_equilLogged,Lag0AveragedLogged,Lag1,Lag2,Lag3
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,...,2008,16,7.346896,12.0,2.0,1.386294,1.098612,,,
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,...,2008,16,7.346896,12.0,2.0,1.609438,1.098612,,,


<class 'pandas.core.frame.DataFrame'>
Int64Index: 52449630 entries, 0 to 52449629
Data columns (total 26 columns):
Semana                                          uint8
Agencia_ID                                      uint16
Canal_ID                                        uint8
Ruta_SAK                                        uint16
Cliente_ID                                      uint32
Producto_ID                                     uint16
Venta_uni_hoy                                   uint16
Venta_hoy                                       float32
Dev_uni_proxima                                 uint32
Dev_proxima                                     float32
Demanda_uni_equil                               uint32
DemandaNotEqualTheDifferenceOfVentaUniAndDev    bool
weight                                          uint16
pieces                                          uint8
Prod_name_ID                                    uint16
Brand_ID                                        uint8
Town_ID  

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,...,Town_ID,State_ID,DemandaGeneralMean,Lag0,Lag0Averaged,Demanda_uni_equilLogged,Lag0AveragedLogged,Lag1,Lag2,Lag3
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,...,2008,16,7.346896,12.0,2.0,1.609438,1.098612,5.864407,5.931034,5.0
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,...,2008,16,7.346896,12.0,2.0,1.791759,1.098612,2.0,1.0,3.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10193576 entries, 52449630 to 62643205
Data columns (total 26 columns):
Semana                                          uint8
Agencia_ID                                      uint16
Canal_ID                                        uint8
Ruta_SAK                                        uint16
Cliente_ID                                      uint32
Producto_ID                                     uint16
Venta_uni_hoy                                   uint16
Venta_hoy                                       float32
Dev_uni_proxima                                 uint32
Dev_proxima                                     float32
Demanda_uni_equil                               uint32
DemandaNotEqualTheDifferenceOfVentaUniAndDev    bool
weight                                          uint16
pieces                                          uint8
Prod_name_ID                                    uint16
Brand_ID                                        uint8
To

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,...,Brand_ID,Town_ID,State_ID,DemandaGeneralMean,Lag0,Lag0Averaged,Demanda_uni_equilLogged,Lag0AveragedLogged,Lag2,Lag3
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,...,4,2008,16,7.346896,12.0,2.0,0.693147,1.098612,5.864407,5.931034
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,...,4,2008,16,7.346896,12.0,2.0,1.098612,1.098612,2.0,2.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10200437 entries, 62643206 to 72843642
Data columns (total 25 columns):
Semana                                          uint8
Agencia_ID                                      uint16
Canal_ID                                        uint8
Ruta_SAK                                        uint16
Cliente_ID                                      uint32
Producto_ID                                     uint16
Venta_uni_hoy                                   uint16
Venta_hoy                                       float32
Dev_uni_proxima                                 uint32
Dev_proxima                                     float32
Demanda_uni_equil                               uint32
DemandaNotEqualTheDifferenceOfVentaUniAndDev    bool
weight                                          uint16
pieces                                          uint8
Prod_name_ID                                    uint16
Brand_ID                                        uint8
To

In [5]:
FE.train = FE.test1
print FE.train.columns
FE.train.head(2)

Index([u'Semana', u'Agencia_ID', u'Canal_ID', u'Ruta_SAK', u'Cliente_ID',
       u'Producto_ID', u'Venta_uni_hoy', u'Venta_hoy', u'Dev_uni_proxima',
       u'Dev_proxima', u'Demanda_uni_equil',
       u'DemandaNotEqualTheDifferenceOfVentaUniAndDev', u'weight', u'pieces',
       u'Prod_name_ID', u'Brand_ID', u'Town_ID', u'State_ID',
       u'DemandaGeneralMean', u'Lag0', u'Lag0Averaged',
       u'Demanda_uni_equilLogged', u'Lag0AveragedLogged', u'Lag1', u'Lag2',
       u'Lag3'],
      dtype='object')


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,...,Town_ID,State_ID,DemandaGeneralMean,Lag0,Lag0Averaged,Demanda_uni_equilLogged,Lag0AveragedLogged,Lag1,Lag2,Lag3
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,...,2008,16,7.346896,12.0,2.0,1.609438,1.098612,5.864407,5.931034,5.0
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,...,2008,16,7.346896,12.0,2.0,1.791759,1.098612,2.0,1.0,3.0


## To make the data same with Base one..DON'T DO THIS ON PROD!!

In [6]:
FE.train[["Demanda_uni_equil","Demanda_uni_equilLogged"]].head(5)

Unnamed: 0,Demanda_uni_equil,Demanda_uni_equilLogged
52449630,4,1.609438
52449631,5,1.791759
52449632,1,0.693147
52449633,3,1.386294
52449634,2,1.098612


In [7]:
FE.train.loc[:,"Demanda_uni_equil"] =  FE.train["Demanda_uni_equilLogged"].values.copy()

## Producto_ID_sum_demanda_divide_sum_venta_uni

In [8]:
Feature1 = FE.train[["Producto_ID","Demanda_uni_equil","Venta_uni_hoy"]].copy()
gc.collect()
display(Feature1.head(2))
print "Venta_Uni_Hoy = 0: " ,((Feature1.Venta_uni_hoy == 0).sum())
print "Shape of New Dataframe..: ", str((Feature1.shape[0]))
gc.collect()
Feature1 = Feature1[Feature1.Venta_uni_hoy != 0]
print "Shape of New Dataframe after deleting Venta_Uni_Hoy = 0..: ",(Feature1.shape[0])
gc.collect()
Feature1.loc[:,"Demanda_uni_equil"] = np.round( np.expm1(Feature1["Demanda_uni_equil"].values) )
display(Feature1[Feature1.Demanda_uni_equil.values != Feature1.Venta_uni_hoy.values].head(10))
Feature1 = Feature1.groupby("Producto_ID").sum()
gc.collect()
Feature1.loc[:,"Producto_ID_sum_demanda_divide_sum_venta_uni"] = Feature1.Demanda_uni_equil.values / Feature1.Venta_uni_hoy.values
Feature1 = pd.DataFrame(Feature1["Producto_ID_sum_demanda_divide_sum_venta_uni"])
gc.collect()
print Feature1.mean()
display(Feature1.head(10))

Unnamed: 0,Producto_ID,Demanda_uni_equil,Venta_uni_hoy
52449630,1212,1.609438,4
52449631,1216,1.791759,5


Venta_Uni_Hoy = 0:  0
Shape of New Dataframe..:  10193576
Shape of New Dataframe after deleting Venta_Uni_Hoy = 0..:  10193576


Unnamed: 0,Producto_ID,Demanda_uni_equil,Venta_uni_hoy
52449747,1238,1.0,2
52449748,2233,18.0,19
52449758,32393,17.0,18
52449839,34053,4.0,9
52450060,34053,3.0,5
52450641,2233,10.0,11
52450870,1109,1.0,2
52450945,1150,10.0,22
52450946,1160,1.0,7
52450985,47336,11.0,12


Producto_ID_sum_demanda_divide_sum_venta_uni    0.982182
dtype: float64


Unnamed: 0_level_0,Producto_ID_sum_demanda_divide_sum_venta_uni
Producto_ID,Unnamed: 1_level_1
41,1.0
53,1.0
72,0.994464
73,0.993024
100,1.0
106,0.995536
108,1.0
123,0.997915
131,1.0
132,1.0


In [9]:
FE.train = FE.train.merge( Feature1, left_on="Producto_ID",
                    right_index=True, how='left', sort=False,copy=False)
FE.test1 = FE.test1.merge( Feature1, left_on="Producto_ID",
                    right_index=True, how='left', sort=False,copy=False)
FE.test2 = FE.test2.merge( Feature1, left_on="Producto_ID",
                    right_index=True, how='left', sort=False,copy=False)

In [10]:
display(FE.train.head(2))
display(FE.test1.head(2))
display(FE.test2.head(2))

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,...,State_ID,DemandaGeneralMean,Lag0,Lag0Averaged,Demanda_uni_equilLogged,Lag0AveragedLogged,Lag1,Lag2,Lag3,Producto_ID_sum_demanda_divide_sum_venta_uni
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,...,16,7.346896,12.0,2.0,1.609438,1.098612,5.864407,5.931034,5.0,0.98995
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,...,16,7.346896,12.0,2.0,1.791759,1.098612,2.0,1.0,3.0,0.991494


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,...,State_ID,DemandaGeneralMean,Lag0,Lag0Averaged,Demanda_uni_equilLogged,Lag0AveragedLogged,Lag1,Lag2,Lag3,Producto_ID_sum_demanda_divide_sum_venta_uni
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,...,16,7.346896,12.0,2.0,1.609438,1.098612,5.864407,5.931034,5.0,0.98995
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,...,16,7.346896,12.0,2.0,1.791759,1.098612,2.0,1.0,3.0,0.991494


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,...,Town_ID,State_ID,DemandaGeneralMean,Lag0,Lag0Averaged,Demanda_uni_equilLogged,Lag0AveragedLogged,Lag2,Lag3,Producto_ID_sum_demanda_divide_sum_venta_uni
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,...,2008,16,7.346896,12.0,2.0,0.693147,1.098612,5.864407,5.931034,0.98995
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,...,2008,16,7.346896,12.0,2.0,1.098612,1.098612,2.0,2.0,0.988582


## Prod_name_ID_sum_demanda_divide_sum_venta_uni

In [11]:
Feature1 = FE.train[["Prod_name_ID","Demanda_uni_equil","Venta_uni_hoy"]].copy()
gc.collect()
display(Feature1.head(2))
print "Venta_Uni_Hoy = 0: " ,((Feature1.Venta_uni_hoy == 0).sum())
print "Shape of New Dataframe..: ", str((Feature1.shape[0]))
gc.collect()
Feature1 = Feature1[Feature1.Venta_uni_hoy != 0]
print "Shape of New Dataframe after deleting Venta_Uni_Hoy = 0..: ",(Feature1.shape[0])
gc.collect()
Feature1.loc[:,"Demanda_uni_equil"] = np.round( np.expm1(Feature1["Demanda_uni_equil"].values) )
display(Feature1[Feature1.Demanda_uni_equil.values != Feature1.Venta_uni_hoy.values].head(10))
Feature1 = Feature1.groupby("Prod_name_ID").sum()
gc.collect()
Feature1.loc[:,"Prod_name_ID_sum_demanda_divide_sum_venta_uni"] = Feature1.Demanda_uni_equil.values / Feature1.Venta_uni_hoy.values
Feature1 = pd.DataFrame(Feature1["Prod_name_ID_sum_demanda_divide_sum_venta_uni"])
gc.collect()
print Feature1.mean()
display(Feature1.head(10))

Unnamed: 0,Prod_name_ID,Demanda_uni_equil,Venta_uni_hoy
52449630,709,1.609438,4
52449631,712,1.791759,5


Venta_Uni_Hoy = 0:  0
Shape of New Dataframe..:  10193576
Shape of New Dataframe after deleting Venta_Uni_Hoy = 0..:  10193576


Unnamed: 0,Prod_name_ID,Demanda_uni_equil,Venta_uni_hoy
52449747,630,1.0,2
52449748,567,18.0,19
52449758,699,17.0,18
52449839,544,4.0,9
52450060,544,3.0,5
52450641,567,10.0,11
52450870,569,1.0,2
52450945,492,10.0,22
52450946,73,1.0,7
52450985,497,11.0,12


Prod_name_ID_sum_demanda_divide_sum_venta_uni    0.983653
dtype: float64


Unnamed: 0_level_0,Prod_name_ID_sum_demanda_divide_sum_venta_uni
Prod_name_ID,Unnamed: 1_level_1
0,0.987893
1,0.946614
2,0.943114
3,0.971968
4,0.940131
5,0.943987
6,0.935897
9,0.970782
10,0.97209
11,0.986293


In [12]:
FE.train = FE.train.merge( Feature1, left_on="Prod_name_ID",
                    right_index=True, how='left', sort=False,copy=False)
FE.test1 = FE.test1.merge( Feature1, left_on="Prod_name_ID",
                    right_index=True, how='left', sort=False,copy=False)
FE.test2 = FE.test2.merge( Feature1, left_on="Prod_name_ID",
                    right_index=True, how='left', sort=False,copy=False)

In [13]:
display(FE.train.head(2))
display(FE.test1.head(2))
display(FE.test2.head(2))

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,...,DemandaGeneralMean,Lag0,Lag0Averaged,Demanda_uni_equilLogged,Lag0AveragedLogged,Lag1,Lag2,Lag3,Producto_ID_sum_demanda_divide_sum_venta_uni,Prod_name_ID_sum_demanda_divide_sum_venta_uni
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,...,7.346896,12.0,2.0,1.609438,1.098612,5.864407,5.931034,5.0,0.98995,0.988656
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,...,7.346896,12.0,2.0,1.791759,1.098612,2.0,1.0,3.0,0.991494,0.98998


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,...,DemandaGeneralMean,Lag0,Lag0Averaged,Demanda_uni_equilLogged,Lag0AveragedLogged,Lag1,Lag2,Lag3,Producto_ID_sum_demanda_divide_sum_venta_uni,Prod_name_ID_sum_demanda_divide_sum_venta_uni
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,...,7.346896,12.0,2.0,1.609438,1.098612,5.864407,5.931034,5.0,0.98995,0.988656
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,...,7.346896,12.0,2.0,1.791759,1.098612,2.0,1.0,3.0,0.991494,0.98998


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,...,State_ID,DemandaGeneralMean,Lag0,Lag0Averaged,Demanda_uni_equilLogged,Lag0AveragedLogged,Lag2,Lag3,Producto_ID_sum_demanda_divide_sum_venta_uni,Prod_name_ID_sum_demanda_divide_sum_venta_uni
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,...,16,7.346896,12.0,2.0,0.693147,1.098612,5.864407,5.931034,0.98995,0.988656
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,...,16,7.346896,12.0,2.0,1.098612,1.098612,2.0,2.0,0.988582,0.988461


## Cliente_ID_sum_demanda_divide_sum_venta_uni

In [14]:
Feature1 = FE.train[["Cliente_ID","Demanda_uni_equil","Venta_uni_hoy"]].copy()
gc.collect()
display(Feature1.head(2))
print "Venta_Uni_Hoy = 0: " ,((Feature1.Venta_uni_hoy == 0).sum())
print "Shape of New Dataframe..: ", str((Feature1.shape[0]))
gc.collect()
print "Shape of New Dataframe after deleting Venta_Uni_Hoy = 0..: ",(Feature1.shape[0])
print (Feature1.shape[0])
gc.collect()
Feature1.loc[:,"Demanda_uni_equil"] = np.round( np.expm1(Feature1["Demanda_uni_equil"].values) )
display(Feature1[Feature1.Demanda_uni_equil.values != Feature1.Venta_uni_hoy.values].head(10))
Feature1 = Feature1.groupby("Cliente_ID").sum()
gc.collect()
Feature1.loc[:,"Cliente_ID_sum_demanda_divide_sum_venta_uni"] = Feature1.Demanda_uni_equil.values / Feature1.Venta_uni_hoy.values
Feature1 = pd.DataFrame(Feature1["Cliente_ID_sum_demanda_divide_sum_venta_uni"])
gc.collect()
print Feature1.mean()
display(Feature1.head(10))

Unnamed: 0,Cliente_ID,Demanda_uni_equil,Venta_uni_hoy
52449630,15766,1.609438,4
52449631,15766,1.791759,5


Venta_Uni_Hoy = 0:  0
Shape of New Dataframe..:  10193576
Shape of New Dataframe after deleting Venta_Uni_Hoy = 0..:  10193576
10193576


Unnamed: 0,Cliente_ID,Demanda_uni_equil,Venta_uni_hoy
52449747,319641,1.0,2
52449748,319641,18.0,19
52449758,319641,17.0,18
52449839,1159580,4.0,9
52450060,4532486,3.0,5
52450641,1492120,10.0,11
52450870,4316732,1.0,2
52450945,817296,10.0,22
52450946,817296,1.0,7
52450985,1030969,11.0,12


Cliente_ID_sum_demanda_divide_sum_venta_uni    0.993787
dtype: float64


Unnamed: 0_level_0,Cliente_ID_sum_demanda_divide_sum_venta_uni
Cliente_ID,Unnamed: 1_level_1
26,0.995449
60,1.0
65,1.0
101,1.0
105,1.0
106,1.0
107,0.966245
465,1.0
772,1.0
786,1.0


In [15]:
FE.train = FE.train.merge( Feature1, left_on="Cliente_ID",
                    right_index=True, how='left', sort=False,copy=False)
FE.test1 = FE.test1.merge( Feature1, left_on="Cliente_ID",
                    right_index=True, how='left', sort=False,copy=False)
FE.test2 = FE.test2.merge( Feature1, left_on="Cliente_ID",
                    right_index=True, how='left', sort=False,copy=False)

In [16]:
display(FE.train.head(2))
display(FE.test1.head(2))
display(FE.test2.head(2))

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,...,Lag0,Lag0Averaged,Demanda_uni_equilLogged,Lag0AveragedLogged,Lag1,Lag2,Lag3,Producto_ID_sum_demanda_divide_sum_venta_uni,Prod_name_ID_sum_demanda_divide_sum_venta_uni,Cliente_ID_sum_demanda_divide_sum_venta_uni
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,...,12.0,2.0,1.609438,1.098612,5.864407,5.931034,5.0,0.98995,0.988656,1.0
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,...,12.0,2.0,1.791759,1.098612,2.0,1.0,3.0,0.991494,0.98998,1.0


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,...,Lag0,Lag0Averaged,Demanda_uni_equilLogged,Lag0AveragedLogged,Lag1,Lag2,Lag3,Producto_ID_sum_demanda_divide_sum_venta_uni,Prod_name_ID_sum_demanda_divide_sum_venta_uni,Cliente_ID_sum_demanda_divide_sum_venta_uni
52449630,8,1110,7,3301,15766,1212,4,33.52,0,0.0,...,12.0,2.0,1.609438,1.098612,5.864407,5.931034,5.0,0.98995,0.988656,1.0
52449631,8,1110,7,3301,15766,1216,5,41.900002,0,0.0,...,12.0,2.0,1.791759,1.098612,2.0,1.0,3.0,0.991494,0.98998,1.0


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,...,DemandaGeneralMean,Lag0,Lag0Averaged,Demanda_uni_equilLogged,Lag0AveragedLogged,Lag2,Lag3,Producto_ID_sum_demanda_divide_sum_venta_uni,Prod_name_ID_sum_demanda_divide_sum_venta_uni,Cliente_ID_sum_demanda_divide_sum_venta_uni
62643206,9,1110,7,3301,15766,1212,1,8.38,0,0.0,...,7.346896,12.0,2.0,0.693147,1.098612,5.864407,5.931034,0.98995,0.988656,1.0
62643207,9,1110,7,3301,15766,1238,2,19.66,0,0.0,...,7.346896,12.0,2.0,1.098612,1.098612,2.0,2.0,0.988582,0.988461,1.0


In [30]:
#pd.DataFrame({"a":FE.train[FE.train.Cliente_ID == 15766]["Venta_uni_hoy"], 
 #   "b":np.expm1(FE.train[FE.train.Cliente_ID == 15766]["Demanda_uni_equil"])})


## Prod_name_ID_returnUnsoldProducts_AfterOneWeek

In [4]:
FE.ReadCsv('both')

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,3
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,4


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74180464 entries, 0 to 74180463
Data columns (total 11 columns):
Semana               uint8
Agencia_ID           uint16
Canal_ID             uint8
Ruta_SAK             uint16
Cliente_ID           uint32
Producto_ID          uint16
Venta_uni_hoy        uint16
Venta_hoy            float32
Dev_uni_proxima      uint32
Dev_proxima          float32
Demanda_uni_equil    uint32
dtypes: float32(2), uint16(4), uint32(3), uint8(2)
memory usage: 2.1 GB
None


Unnamed: 0,id,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID


<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 7 columns):
id             0 non-null uint32
Semana         0 non-null uint8
Agencia_ID     0 non-null uint16
Canal_ID       0 non-null uint8
Ruta_SAK       0 non-null uint16
Cliente_ID     0 non-null uint32
Producto_ID    0 non-null uint16
dtypes: uint16(3), uint32(2), uint8(2)
memory usage: 0.0 bytes
None


Unnamed: 0,id,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID


<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 7 columns):
id             0 non-null uint32
Semana         0 non-null uint8
Agencia_ID     0 non-null uint16
Canal_ID       0 non-null uint8
Ruta_SAK       0 non-null uint16
Cliente_ID     0 non-null uint32
Producto_ID    0 non-null uint16
dtypes: uint16(3), uint32(2), uint8(2)
memory usage: 0.0 bytes
None


In [5]:
FE.SplitTrainToTestUsingValidationStart()

In [6]:
(FE.train.Demanda_uni_equil.values < (
    FE.train.Venta_uni_hoy.values - FE.train.Dev_uni_proxima.values)).sum()

416773

In [7]:
(FE.train.Demanda_uni_equil == 0 & (
    FE.train.Venta_uni_hoy == 0)).sum()

915253

In [18]:
(FE.train.Demanda_uni_equil != 0 & (
    FE.train.Venta_uni_hoy != 0)).sum()

52449630

In [15]:
FE.train.shape

(53364883, 11)

In [None]:
qq = pd.DataFrame({"a": [3,3,3,4,5], "b": [True,False,True,False,True]})

In [49]:
FE.train[FE.train.DemandaNotEqualTheDifferenceOfVentaUniAndDev]

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,...,Lag0,Lag0Averaged,Demanda_uni_equilLogged,Lag0AveragedLogged,Lag1,Lag2,Lag3,Producto_ID_sum_demanda_divide_sum_venta_uni,Prod_name_ID_sum_demanda_divide_sum_venta_uni,Cliente_ID_sum_demanda_divide_sum_venta_uni


In [42]:
FE.train.DemandaNotEqualTheDifferenceOfVentaUniAndDev.nunique()

1

In [56]:
(np.round( np.expm1(FE.test2.Demanda_uni_equil.values)) < (
    FE.test2.Venta_uni_hoy.values - FE.test2.Dev_uni_proxima.values)).sum()

0

In [6]:
train = FE.train[["Producto_ID","Prod_name_ID","Demanda_uni_equil","Venta_uni_hoy","Dev_uni_proxima",
                  "DemandaNotEqualTheDifferenceOfVentaUniAndDev"]]
del FE.train
gc.collect()
test1 = FE.test1[["Producto_ID","Prod_name_ID","Demanda_uni_equil","Venta_uni_hoy","Dev_uni_proxima",
                  "DemandaNotEqualTheDifferenceOfVentaUniAndDev"]]
del FE.test1
gc.collect()
test2 = FE.test2[["Producto_ID","Prod_name_ID","Demanda_uni_equil","Venta_uni_hoy","Dev_uni_proxima",
                  "DemandaNotEqualTheDifferenceOfVentaUniAndDev"]]
del FE.test2
gc.collect()

256

In [11]:
(train.Demanda_uni_equil.values < (
    train.Venta_uni_hoy.values - train.Dev_uni_proxima.values)).sum()

0

In [12]:
(train.Demanda_uni_equil == 0 & (
    train.Venta_uni_hoy == 0)).sum()

0