In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import datasets, linear_model, preprocessing
from datetime import datetime
import gc
%matplotlib inline
from IPython.display import display, HTML
from pprint import pprint
import nltk.corpus
from nltk.stem.snowball import SnowballStemmer

In [2]:
#take 1 CSV, then split it to 3..
class FeatureEngineering:

    def __init__(self, ValidationStart, ValidationEnd, trainHdfPath, trainHdfFile, testHdfPath1, testHdfPath2, testHdfFile, 
                 testTypes, trainTypes, trainCsvPath, testCsvPath, maxLag=0):
        self.ValidationStart = ValidationStart
        self.ValidationEnd = ValidationEnd
        self.maxLag = maxLag
        self.trainHdfPath = trainHdfPath
        self.trainHdfFile = trainHdfFile
        self.testHdfPath1 = testHdfPath1
        self.testHdfPath2 = testHdfPath2
        self.testHdfFile = testHdfFile
        self.testTypes = testTypes
        self.trainTypes = trainTypes
        self.trainCsvPath = trainCsvPath
        self.testCsvPath = testCsvPath
        
    @staticmethod
    def __printDataFrameBasics__(data):
        display(data.head(2))
        #print data.dtypes
        gc.collect()
        print(data.info(memory_usage=True))
        
    @staticmethod    
    def changeIndexTypeToLowerMemory(data):
        ##########
        #This is very critical, i accept max number is 2^32. Also, if don't do that, memory gets so much higher..
        ##########
        #data.reset_index(inplace=True)
        #data.drop("index",axis=1, inplace=True)
        data.index = data.index.astype('uint32')
        gc.collect()
        
    def ReadHdf(self, trainOrTestOrBoth):
        '''Reads and holds Df in object memory'''            
        if trainOrTestOrBoth == 'train' or trainOrTestOrBoth=='both':
            self.train = pd.read_hdf(self.trainHdfPath,self.trainHdfFile)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.train)
            FeatureEngineering.__printDataFrameBasics__(self.train)
            
        if trainOrTestOrBoth == 'test' or trainOrTestOrBoth=='both':
            self.test1 = pd.read_hdf(self.testHdfPath1,self.testHdfFile)
            self.test2 = pd.read_hdf(self.testHdfPath2,self.testHdfFile)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test1)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test2)
            FeatureEngineering.__printDataFrameBasics__(self.test1)
            FeatureEngineering.__printDataFrameBasics__(self.test2)
        
    def ReadCsv(self, trainOrTestOrBoth):
        '''Reads and holds Df in memory'''
        if trainOrTestOrBoth == 'train' or trainOrTestOrBoth == 'both':
            self.train = pd.read_csv(self.trainCsvPath, usecols=self.trainTypes.keys(), dtype=self.trainTypes)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.train)
            FeatureEngineering.__printDataFrameBasics__(self.train)
        if trainOrTestOrBoth == 'test' or trainOrTestOrBoth=='both':
            tempTest = pd.read_csv(self.testCsvPath, usecols=self.testTypes.keys(), dtype=self.testTypes)
            self.test1 = tempTest.loc[tempTest.Semana == self.ValidationStart]
            self.test2 = tempTest.loc[tempTest.Semana == self.ValidationEnd]
            del tempTest
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test1)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test2)
            FeatureEngineering.__printDataFrameBasics__(self.test1)
            FeatureEngineering.__printDataFrameBasics__(self.test2)
            
    @staticmethod
    def ConvertCsvToHdf(csvPath, HdfPath, HdfName, ColumnTypeDict ):
        tempDf = pd.read_csv(csvPath, usecols=ColumnTypeDict.keys(), dtype=ColumnTypeDict,index=False)
        tempDf.to_hdf(HdfPath, HdfName, format='t')
        del tempDf
        gc.collect()
        print "ConvertCsvToHdf is done.."

    def Preprocess(self, trainOrTestOrBoth, columnFunctionTypeList):
        '''columnFunctionTypeList = [ ['C1',Func1,Type], ['C2',Func2,Type],..    ]'''
        for column, func, localType in columnFunctionTypeList:
            if trainOrTestOrBoth == 'train' or trainOrTestOrBoth =='both':
                self.train.loc[:,column] =  self.train[column].apply(func).astype(localType)
            if trainOrTestOrBoth == 'test' or trainOrTestOrBoth == 'both':
                self.test1.loc[:,column] =  self.test1[column].apply(func).astype(localType)
                self.test2.loc[:,column] =  self.test2[column].apply(func).astype(localType)
        gc.collect()
        
    def SaveDataFrameToHdf(self,trainOrTestOrBoth):
        if trainOrTestOrBoth == 'train' or trainOrTestOrBoth=='both':
            self.train.to_hdf(self.trainHdfPath, self.trainHdfFile, format='t', index="False")
        if trainOrTestOrBoth == 'test' or trainOrTestOrBoth=='both':
            self.test1.to_hdf(self.testHdfPath1, self.testHdfFile, format='t', index="False")
            self.test2.to_hdf(self.testHdfPath2, self.testHdfFile, format='t', index="False")
        
    def AddDemandaGeneralMean(self): 
        self.train.loc[:,"DemandaGeneralMean"] = self.train["Demanda_uni_equil"].loc[
            self.train.loc[:,'Semana'] < 10].mean().astype("float32")
        #self.train.loc[:,"DemandaGeneralMean"] = DemandaMeanWithoutLag['Demanda_uni_equil'].mean()
        #self.train.loc[:,"DemandaGeneralMean"] = self.train.loc[:,"DemandaGeneralMean"].astype('float32')
        #display(self.train)
        #del DemandaMeanWithoutLag
        gc.collect()
        
    '''ConfigElements(0,[ ("A",["Semana","Agencia_ID"],["count","count"]),'''
    def AddConfigurableFeaturesToTrain(self, config):
        if config.lag > self.maxLag:
            self.maxLag = config.lag
        
        tempData = self.train.loc[self.train.loc[:,'Semana'] <= self.ValidationEnd - config.lag]
        #display(tempData)
        if(config.lag != 0):
            tempData.loc[:,'Semana'] = tempData['Semana'].apply(lambda x:x + config.lag)
        #display(tempData)
        
        #Means iterative.. eliminate as long as np.nan exists..If there is already one, don't create, use the existing
        if config.targetVariable != "" and  config.targetVariable not in self.train.columns:
            self.train.loc[:,config.targetVariable] = np.nan
            self.test1.loc[:,config.targetVariable] = np.nan
            
            if config.lag != 1:
                self.test2.loc[:,config.targetVariable] = np.nan
        
        for name,groups,aggregate in config.nameAndGroups:
            if name not in self.train.columns:
                print "{} is not in columns..".format(name)            
                
                groupedDataframe = tempData[groups+['Demanda_uni_equil']].groupby(groups).agg(aggregate[0])
                #groupedDataframe.columns = groupedDataframe.columns.droplevel(0)
                groupedDataframe.columns = [name]
                
                #This is means of the counts of the semana-columns tuples!..!!!
                #If no lag and mean, mean of the columns without semana!!..
                #If there is lag and count, count of the columns x weeks before
                #If there is lag and mean, mean of the columns x weeks before
                #if(config.lag == 0 and aggregate == "count"):
                if(len(aggregate)>1):
                    groupedDataframe.reset_index(inplace=True)
                    groupedDataframe.drop("Semana",axis=1, inplace=True)
                    groups = groups[1:]
                    groupedDataframe = groupedDataframe.groupby(groups).agg(aggregate[1])
                    groupedDataframe.columns = [name]
                    gc.collect()
                
                display(groupedDataframe)
                self.train = self.train.merge( groupedDataframe, left_on=groups,
                    right_index=True, how='left', sort=False,copy=False)
                self.test1 = self.test1.merge( groupedDataframe, left_on=groups,
                    right_index=True, how='left', sort=False,copy=False)
                if config.lag != 1:
                    self.test2 = self.test2.merge( groupedDataframe, left_on=groups,
                        right_index=True, how='left', sort=False,copy=False)
                
                del groupedDataframe
                gc.collect()
            else:
                print "{} is in columns..".format(name)
            
            display(self.train)
            display(self.test1)
            display(self.test2)
            
            #Means iterative..!!!!!
            if config.targetVariable != "":
                self.train.loc[pd.isnull(self.train[config.targetVariable]), 
                    config.targetVariable] = self.train.loc[pd.isnull(self.train[config.targetVariable]), name]
                self.test1.loc[pd.isnull(self.test1[config.targetVariable]), 
                    config.targetVariable] = self.test1.loc[pd.isnull(self.test1[config.targetVariable]), name]
                if config.lag != 1:
                    self.test2.loc[pd.isnull(self.test2[config.targetVariable]), 
                        config.targetVariable] = self.test2.loc[pd.isnull(self.test2[config.targetVariable]), name]
                    
                count = self.test1.loc[:,config.targetVariable].isnull().sum()
                print "Count of missing numbers after {} in validation part 1 in column {} is {}".format(name, 
                    config.targetVariable,str(count))
                if config.lag != 1:
                    count = self.test2.loc[:,config.targetVariable].isnull().sum()
                    print "Count of missing numbers after {} in validation part 2 in column {} is {}".format(name, 
                        config.targetVariable,str(count))
                
                
                #display(self.train)
                #If column is already in Dataframe and we want to fill target variable, this deletes columns!!!
                if(config.deleteColumns):
                    self.train.drop(name, axis=1, inplace=True)
                    self.test1.drop(name, axis=1, inplace=True)
                    if config.lag != 1:
                        self.test2.drop(name, axis=1, inplace=True)
                gc.collect()
                #Only in tesst
                #if count == 0:
                 #   break
        del tempData
        display(self.train)   
        display(self.test1)   
        display(self.test2)
        gc.collect()
        return 
    
    def DeleteLaggedWeeksFromTrain(self,trainOrTestOrBoth):
        self.train = self.train.loc[self.train.loc[:,'Semana']>= 3 + self.maxLag]
        gc.collect()
        display(self.train.head(2))
        
    def ReadFirstNRowsOfACsv(self, nrows, trainOrTestOrBoth) :
        if trainOrTestOrBoth == 'train' or trainOrTestOrBoth=='both':
            self.train = pd.read_csv(self.trainCsvPath, usecols=self.trainTypes.keys(), dtype=self.trainTypes, nrows = nrows)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.train)
            FeatureEngineering.__printDataFrameBasics__(self.train)
        if trainOrTestOrBoth == 'test' or trainOrTestOrBoth=='both':
            tempTest = pd.read_csv(self.testCsvPath, usecols=self.testTypes.keys(), dtype=self.testTypes, nrows = nrows*2)
            self.test1 = tempTest.loc[tempTest.Semana == self.ValidationStart]
            self.test2 = tempTest.loc[tempTest.Semana == self.ValidationEnd]
            del tempTest
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test1)
            FeatureEngineering.changeIndexTypeToLowerMemory(self.test2)
            FeatureEngineering.__printDataFrameBasics__(self.test1)
            FeatureEngineering.__printDataFrameBasics__(self.test2)
    
    #Use when concatanating train and validation before predict test for example..
    def AppendTestToTrain(self,deleteTest = True):
        self.train = self.train.append(self.test1,ignore_index=True)
        gc.collect()
        if(deleteTest):
            del self.test1
            gc.collect()
        try:
            self.train = self.train.append(self.test2,ignore_index=True)
            gc.collect()
            if(deleteTest):
                del self.test2
                gc.collect()
        except:
            pass
    #Split train data to train and test1 and test2 (validation)
    def SplitTrainToTestUsingValidationStart(self):
        boolCondition = self.train.Semana == self.ValidationStart
        self.test1 = self.train.loc[boolCondition]
        self.train.drop((self.train.loc[boolCondition].index), axis=0,inplace=True)
        gc.collect()
        
        boolCondition = self.train.Semana == self.ValidationEnd
        self.test2 = self.train.loc[boolCondition]
        self.train.drop((self.train.loc[boolCondition].index), axis=0,inplace=True)
        
        del boolCondition
        gc.collect()

In [3]:
parameterDict =       {"ValidationStart":10, 
 "ValidationEnd":11,
   "maxLag":2,
    "trainHdfPath":'../../input/train.h5',
    "trainHdfFile":"train",
    "testHdfPath1":"../../input/test1.h5",
    "testHdfPath2":"../../input/test2.h5",
    "testHdfFile":"test", 
    "trainTypes" : {'Semana':np.uint8, 'Agencia_ID':np.uint16, 'Canal_ID':np.uint8,'Ruta_SAK':np.uint16, 
        'Cliente_ID':np.uint32, 'Producto_ID':np.uint16,'Venta_uni_hoy':np.uint16, 'Venta_hoy':np.float32,
                    'Dev_uni_proxima': np.uint32, 'Dev_proxima':np.float32,'Demanda_uni_equil':np.uint32}, 
    "testTypes" : {'id':np.uint32,'Semana':np.uint8, 'Agencia_ID':np.uint16, 'Canal_ID':np.uint8,'Ruta_SAK':np.uint16,
        'Cliente_ID':np.uint32, 'Producto_ID':np.uint16},
    "trainCsvPath":'../../input/train.csv'   ,
    "testCsvPath":'../../input/test.csv'}

FE = FeatureEngineering(**parameterDict)
print FE.__dict__

{'trainCsvPath': '../../input/train.csv', 'maxLag': 2, 'testTypes': {'Cliente_ID': <type 'numpy.uint32'>, 'Ruta_SAK': <type 'numpy.uint16'>, 'Canal_ID': <type 'numpy.uint8'>, 'Producto_ID': <type 'numpy.uint16'>, 'Agencia_ID': <type 'numpy.uint16'>, 'Semana': <type 'numpy.uint8'>, 'id': <type 'numpy.uint32'>}, 'testHdfFile': 'test', 'trainTypes': {'Dev_proxima': <type 'numpy.float32'>, 'Venta_uni_hoy': <type 'numpy.uint16'>, 'Cliente_ID': <type 'numpy.uint32'>, 'Demanda_uni_equil': <type 'numpy.uint32'>, 'Ruta_SAK': <type 'numpy.uint16'>, 'Canal_ID': <type 'numpy.uint8'>, 'Venta_hoy': <type 'numpy.float32'>, 'Producto_ID': <type 'numpy.uint16'>, 'Agencia_ID': <type 'numpy.uint16'>, 'Dev_uni_proxima': <type 'numpy.uint32'>, 'Semana': <type 'numpy.uint8'>}, 'testHdfPath1': '../../input/test1.h5', 'ValidationEnd': 11, 'testHdfPath2': '../../input/test2.h5', 'testCsvPath': '../../input/test.csv', 'ValidationStart': 10, 'trainHdfFile': 'train', 'trainHdfPath': '../../input/train.h5'}


In [4]:
#FE.ReadFirstNRowsOfACsv(10,'test')
FE.ReadCsv('train')


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil
0,3,1110,7,3301,15766,1212,3,25.139999,0,0.0,3
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,4


<class 'pandas.core.frame.DataFrame'>
Int64Index: 74180464 entries, 0 to 74180463
Data columns (total 11 columns):
Semana               uint8
Agencia_ID           uint16
Canal_ID             uint8
Ruta_SAK             uint16
Cliente_ID           uint32
Producto_ID          uint16
Venta_uni_hoy        uint16
Venta_hoy            float32
Dev_uni_proxima      uint32
Dev_proxima          float32
Demanda_uni_equil    uint32
dtypes: float32(2), uint16(4), uint32(3), uint8(2)
memory usage: 2.6 GB
None


In [5]:
#FE.test1.head()

In [6]:
townstate = pd.read_csv("../../input/town_state.csv", encoding='utf-8')
townstate['Town_ID']=townstate['Town'].str[:4]
states = townstate['State']
le = preprocessing.LabelEncoder()
townstate['State_ID']=le.fit_transform(states)
townstate = townstate.drop(['Town', 'State'], axis=1)
townstate = townstate.astype('uint16')
townstate[['State_ID']] =townstate[['State_ID']] .astype('uint8')
townstate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 790 entries, 0 to 789
Data columns (total 3 columns):
Agencia_ID    790 non-null uint16
Town_ID       790 non-null uint16
State_ID      790 non-null uint8
dtypes: uint16(2), uint8(1)
memory usage: 3.9 KB


In [7]:
products = pd.read_csv("../../input/producto_tabla.csv")
products['short_name'] = products.NombreProducto.str.extract('^(\D*)', expand=False)
products['brand'] = products.NombreProducto.str.extract('^.+\s(\D+) \d+$', expand=False)
w = products.NombreProducto.str.extract('(\d+)(Kg|g)', expand=True)
products['weight'] = w[0].astype('float') * w[1].map({'Kg': 1000, 'g': 1})
products['pieces'] = products.NombreProducto.str.extract('(\d+)p ', expand=False).astype('float')

products['short_name_processed'] = (products['short_name'].
                                    map(lambda x: " ".
                                        join([i for i in x.lower().split() 
                                              if i not in nltk.corpus.stopwords.words("spanish")])))
stemmer = SnowballStemmer("spanish")
products['short_name_processed'] = (products['short_name_processed'].
                                    map(lambda x: " ".join([stemmer.stem(i) for i in x.lower().split()])))

le = preprocessing.LabelEncoder()

products['Prod_name_ID']=le.fit_transform(products['short_name_processed'])
products['Brand_ID']=le.fit_transform(products['brand'])

products = products.drop(['short_name', 'brand', 'short_name_processed', 'NombreProducto'], axis=1)
products.fillna(value=0, inplace=True)
products[['pieces','Brand_ID']] = products[['pieces','Brand_ID']].astype('uint8')
products[['Producto_ID','weight', 'Prod_name_ID']] = products[['Producto_ID','weight', 'Prod_name_ID']].astype('uint16')
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2592 entries, 0 to 2591
Data columns (total 5 columns):
Producto_ID     2592 non-null uint16
weight          2592 non-null uint16
pieces          2592 non-null uint8
Prod_name_ID    2592 non-null uint16
Brand_ID        2592 non-null uint8
dtypes: uint16(3), uint8(2)
memory usage: 20.3 KB


In [35]:
#FE.test1 = pd.merge(FE.test1, products, on='Producto_ID', how='left')
#FE.test1 = pd.merge(FE.test1, townstate, on='Agencia_ID', how='left')

In [8]:
FE.train = pd.merge(FE.train, products, on='Producto_ID', how='left')
FE.train = pd.merge(FE.train, townstate, on='Agencia_ID', how='left')


In [10]:
FE.train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74180464 entries, 0 to 74180463
Data columns (total 15 columns):
Semana               uint8
Agencia_ID           uint16
Canal_ID             uint8
Ruta_SAK             uint16
Cliente_ID           uint32
Producto_ID          uint16
Venta_uni_hoy        uint16
Venta_hoy            float32
Dev_uni_proxima      uint32
Dev_proxima          float32
Demanda_uni_equil    uint32
weight               uint16
pieces               uint8
Prod_name_ID         uint16
Brand_ID             uint8
dtypes: float32(2), uint16(6), uint32(3), uint8(4)
memory usage: 3.0 GB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2592 entries, 0 to 2591
Data columns (total 5 columns):
Producto_ID     2592 non-null uint16
weight          2592 non-null uint16
pieces          2592 non-null uint8
Prod_name_ID    2592 non-null uint16
Brand_ID        2592 non-null uint8
dtypes: uint16(3), uint8(2)
memory usage: 20.3 KB


## Append test, memory is same??

In [9]:
FE.ReadFirstNRowsOfACsv(5,'train')
#FE.train.info()
#FE.test1.info()
FE.test1 = FE.train.copy()
FE.test1["Semana"] = np.array([10,11,10,11,10]).astype("uint8")
display(FE.train)
display(FE.test1)
FE.AppendTestToTrain()
display(FE.train)
FE.train.info()
FE.test1.info()
display(FE.train)
display(FE.test1)

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil
0,3,1110,7,3303,324600,202,8,327.600006,0,0.0,8
1,3,1112,1,1604,327360,303,8,36.32,0,0.0,8


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 4
Data columns (total 11 columns):
Semana               5 non-null uint8
Agencia_ID           5 non-null uint16
Canal_ID             5 non-null uint8
Ruta_SAK             5 non-null uint16
Cliente_ID           5 non-null uint32
Producto_ID          5 non-null uint16
Venta_uni_hoy        5 non-null uint16
Venta_hoy            5 non-null float32
Dev_uni_proxima      5 non-null uint32
Dev_proxima          5 non-null float32
Demanda_uni_equil    5 non-null uint32
dtypes: float32(2), uint16(4), uint32(3), uint8(2)
memory usage: 190.0 bytes
None


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil
0,3,1110,7,3303,324600,202,8,327.600006,0,0.0,8
1,3,1112,1,1604,327360,303,8,36.32,0,0.0,8
2,3,1118,1,1409,81569,1309,1,6.76,0,0.0,1
3,3,1118,1,1409,81688,1242,7,53.48,0,0.0,7
4,3,1121,1,1417,118819,1064,6,100.019997,0,0.0,6


Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil
0,10,1110,7,3303,324600,202,8,327.600006,0,0.0,8
1,11,1112,1,1604,327360,303,8,36.32,0,0.0,8
2,10,1118,1,1409,81569,1309,1,6.76,0,0.0,1
3,11,1118,1,1409,81688,1242,7,53.48,0,0.0,7
4,10,1121,1,1417,118819,1064,6,100.019997,0,0.0,6


Unnamed: 0,Agencia_ID,Canal_ID,Cliente_ID,Demanda_uni_equil,Dev_proxima,Dev_uni_proxima,Producto_ID,Ruta_SAK,Semana,Venta_hoy,Venta_uni_hoy,id
0,1110,7,324600,8.0,0.0,0.0,202,3303,3,327.600006,8.0,
1,1112,1,327360,8.0,0.0,0.0,303,1604,3,36.32,8.0,
2,1118,1,81569,1.0,0.0,0.0,1309,1409,3,6.76,1.0,
3,1118,1,81688,7.0,0.0,0.0,1242,1409,3,53.48,7.0,
4,1121,1,118819,6.0,0.0,0.0,1064,1417,3,100.019997,6.0,
5,1110,7,324600,8.0,0.0,0.0,202,3303,10,327.600006,8.0,
6,1112,1,327360,8.0,0.0,0.0,303,1604,11,36.32,8.0,
7,1118,1,81569,1.0,0.0,0.0,1309,1409,10,6.76,1.0,
8,1118,1,81688,7.0,0.0,0.0,1242,1409,11,53.48,7.0,
9,1121,1,118819,6.0,0.0,0.0,1064,1417,10,100.019997,6.0,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 12 columns):
Agencia_ID           14 non-null uint16
Canal_ID             14 non-null uint8
Cliente_ID           14 non-null uint32
Demanda_uni_equil    10 non-null float64
Dev_proxima          10 non-null float64
Dev_uni_proxima      10 non-null float64
Producto_ID          14 non-null uint16
Ruta_SAK             14 non-null uint16
Semana               14 non-null uint8
Venta_hoy            10 non-null float64
Venta_uni_hoy        10 non-null float64
id                   4 non-null float64
dtypes: float64(6), uint16(3), uint32(1), uint8(2)
memory usage: 912.0 bytes


AttributeError: FeatureEngineering instance has no attribute 'test1'

## Split Train to train and validation..Control memory usage..

In [10]:
FE.SplitTrainToTestUsingValidationStart()
FE.train.info()
FE.test1.info()
FE.test2.info()
display(FE.train)
display(FE.test1)
display(FE.test2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 4
Data columns (total 12 columns):
Agencia_ID           5 non-null uint16
Canal_ID             5 non-null uint8
Cliente_ID           5 non-null uint32
Demanda_uni_equil    5 non-null float64
Dev_proxima          5 non-null float64
Dev_uni_proxima      5 non-null float64
Producto_ID          5 non-null uint16
Ruta_SAK             5 non-null uint16
Semana               5 non-null uint8
Venta_hoy            5 non-null float64
Venta_uni_hoy        5 non-null float64
id                   0 non-null float64
dtypes: float64(6), uint16(3), uint32(1), uint8(2)
memory usage: 340.0 bytes
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 5 to 9
Data columns (total 12 columns):
Agencia_ID           3 non-null uint16
Canal_ID             3 non-null uint8
Cliente_ID           3 non-null uint32
Demanda_uni_equil    3 non-null float64
Dev_proxima          3 non-null float64
Dev_uni_proxima      3 non-null float64
Producto_ID  

Unnamed: 0,Agencia_ID,Canal_ID,Cliente_ID,Demanda_uni_equil,Dev_proxima,Dev_uni_proxima,Producto_ID,Ruta_SAK,Semana,Venta_hoy,Venta_uni_hoy,id
0,1110,7,324600,8.0,0.0,0.0,202,3303,3,327.600006,8.0,
1,1112,1,327360,8.0,0.0,0.0,303,1604,3,36.32,8.0,
2,1118,1,81569,1.0,0.0,0.0,1309,1409,3,6.76,1.0,
3,1118,1,81688,7.0,0.0,0.0,1242,1409,3,53.48,7.0,
4,1121,1,118819,6.0,0.0,0.0,1064,1417,3,100.019997,6.0,


Unnamed: 0,Agencia_ID,Canal_ID,Cliente_ID,Demanda_uni_equil,Dev_proxima,Dev_uni_proxima,Producto_ID,Ruta_SAK,Semana,Venta_hoy,Venta_uni_hoy,id
5,1110,7,324600,8.0,0.0,0.0,202,3303,10,327.600006,8.0,
7,1118,1,81569,1.0,0.0,0.0,1309,1409,10,6.76,1.0,
9,1121,1,118819,6.0,0.0,0.0,1064,1417,10,100.019997,6.0,


Unnamed: 0,Agencia_ID,Canal_ID,Cliente_ID,Demanda_uni_equil,Dev_proxima,Dev_uni_proxima,Producto_ID,Ruta_SAK,Semana,Venta_hoy,Venta_uni_hoy,id
6,1112,1,327360,8.0,0.0,0.0,303,1604,11,36.32,8.0,
8,1118,1,81688,7.0,0.0,0.0,1242,1409,11,53.48,7.0,
10,1631,1,766465,,,,35305,2802,11,,,6252.0
11,1342,1,2229028,,,,43251,1207,11,,,18978.0
12,1235,1,711302,,,,1220,1217,11,,,30799.0
13,1636,1,152283,,,,37569,4410,11,,,34135.0


## Preprocess, Log("Demanda..")

In [11]:
#FE.Preprocess('both', [["Demanda_uni_equil",np.log1p,'float32']])
display(FE.train)
display(FE.test1)
display(FE.test2)

Unnamed: 0,Agencia_ID,Canal_ID,Cliente_ID,Demanda_uni_equil,Dev_proxima,Dev_uni_proxima,Producto_ID,Ruta_SAK,Semana,Venta_hoy,Venta_uni_hoy,id
0,1110,7,324600,8.0,0.0,0.0,202,3303,3,327.600006,8.0,
1,1112,1,327360,8.0,0.0,0.0,303,1604,3,36.32,8.0,
2,1118,1,81569,1.0,0.0,0.0,1309,1409,3,6.76,1.0,
3,1118,1,81688,7.0,0.0,0.0,1242,1409,3,53.48,7.0,
4,1121,1,118819,6.0,0.0,0.0,1064,1417,3,100.019997,6.0,


Unnamed: 0,Agencia_ID,Canal_ID,Cliente_ID,Demanda_uni_equil,Dev_proxima,Dev_uni_proxima,Producto_ID,Ruta_SAK,Semana,Venta_hoy,Venta_uni_hoy,id
5,1110,7,324600,8.0,0.0,0.0,202,3303,10,327.600006,8.0,
7,1118,1,81569,1.0,0.0,0.0,1309,1409,10,6.76,1.0,
9,1121,1,118819,6.0,0.0,0.0,1064,1417,10,100.019997,6.0,


Unnamed: 0,Agencia_ID,Canal_ID,Cliente_ID,Demanda_uni_equil,Dev_proxima,Dev_uni_proxima,Producto_ID,Ruta_SAK,Semana,Venta_hoy,Venta_uni_hoy,id
6,1112,1,327360,8.0,0.0,0.0,303,1604,11,36.32,8.0,
8,1118,1,81688,7.0,0.0,0.0,1242,1409,11,53.48,7.0,
10,1631,1,766465,,,,35305,2802,11,,,6252.0
11,1342,1,2229028,,,,43251,1207,11,,,18978.0
12,1235,1,711302,,,,1220,1217,11,,,30799.0
13,1636,1,152283,,,,37569,4410,11,,,34135.0


## Save Dataframe to HDF..

In [12]:
FE.SaveDataFrameToHdf('both')

## Load from HDF, Test with big data!!!!

In [13]:
FE.ReadHdf('both')

Unnamed: 0,Agencia_ID,Canal_ID,Cliente_ID,Demanda_uni_equil,Dev_proxima,Dev_uni_proxima,Producto_ID,Ruta_SAK,Semana,Venta_hoy,Venta_uni_hoy,id
0,1110,7,324600,8.0,0.0,0.0,202,3303,3,327.600006,8.0,
1,1112,1,327360,8.0,0.0,0.0,303,1604,3,36.32,8.0,


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 4
Data columns (total 12 columns):
Agencia_ID           5 non-null uint16
Canal_ID             5 non-null uint8
Cliente_ID           5 non-null uint32
Demanda_uni_equil    5 non-null float64
Dev_proxima          5 non-null float64
Dev_uni_proxima      5 non-null float64
Producto_ID          5 non-null uint16
Ruta_SAK             5 non-null uint16
Semana               5 non-null uint8
Venta_hoy            5 non-null float64
Venta_uni_hoy        5 non-null float64
id                   0 non-null float64
dtypes: float64(6), uint16(3), uint32(1), uint8(2)
memory usage: 340.0 bytes
None


Unnamed: 0,Agencia_ID,Canal_ID,Cliente_ID,Demanda_uni_equil,Dev_proxima,Dev_uni_proxima,Producto_ID,Ruta_SAK,Semana,Venta_hoy,Venta_uni_hoy,id
5,1110,7,324600,8.0,0.0,0.0,202,3303,10,327.600006,8.0,
7,1118,1,81569,1.0,0.0,0.0,1309,1409,10,6.76,1.0,


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 5 to 9
Data columns (total 12 columns):
Agencia_ID           3 non-null uint16
Canal_ID             3 non-null uint8
Cliente_ID           3 non-null uint32
Demanda_uni_equil    3 non-null float64
Dev_proxima          3 non-null float64
Dev_uni_proxima      3 non-null float64
Producto_ID          3 non-null uint16
Ruta_SAK             3 non-null uint16
Semana               3 non-null uint8
Venta_hoy            3 non-null float64
Venta_uni_hoy        3 non-null float64
id                   0 non-null float64
dtypes: float64(6), uint16(3), uint32(1), uint8(2)
memory usage: 204.0 bytes
None


Unnamed: 0,Agencia_ID,Canal_ID,Cliente_ID,Demanda_uni_equil,Dev_proxima,Dev_uni_proxima,Producto_ID,Ruta_SAK,Semana,Venta_hoy,Venta_uni_hoy,id
6,1112,1,327360,8.0,0.0,0.0,303,1604,11,36.32,8.0,
8,1118,1,81688,7.0,0.0,0.0,1242,1409,11,53.48,7.0,


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6 entries, 6 to 13
Data columns (total 12 columns):
Agencia_ID           6 non-null uint16
Canal_ID             6 non-null uint8
Cliente_ID           6 non-null uint32
Demanda_uni_equil    2 non-null float64
Dev_proxima          2 non-null float64
Dev_uni_proxima      2 non-null float64
Producto_ID          6 non-null uint16
Ruta_SAK             6 non-null uint16
Semana               6 non-null uint8
Venta_hoy            2 non-null float64
Venta_uni_hoy        2 non-null float64
id                   4 non-null float64
dtypes: float64(6), uint16(3), uint32(1), uint8(2)
memory usage: 408.0 bytes
None


In [14]:
FE.train.Demanda_uni_equil = FE.train.Demanda_uni_equil.astype("uint32")

In [15]:
FE.train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 4
Data columns (total 12 columns):
Agencia_ID           5 non-null uint16
Canal_ID             5 non-null uint8
Cliente_ID           5 non-null uint32
Demanda_uni_equil    5 non-null uint32
Dev_proxima          5 non-null float64
Dev_uni_proxima      5 non-null float64
Producto_ID          5 non-null uint16
Ruta_SAK             5 non-null uint16
Semana               5 non-null uint8
Venta_hoy            5 non-null float64
Venta_uni_hoy        5 non-null float64
id                   0 non-null float64
dtypes: float64(5), uint16(3), uint32(2), uint8(2)
memory usage: 320.0 bytes


## PROD Load train and test CSV, preprocess and save as HDF..

In [16]:
FE.ReadCsv('both')
FE.Preprocess('train', [["Demanda_uni_equil",np.log1p,'float32']])
FE.SaveDataFrameToHdf('both')

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil
0,3,1110,7,3303,324600,202,8,327.600006,0,0.0,8
1,3,1112,1,1604,327360,303,8,36.32,0,0.0,8


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 11 columns):
Semana               1000 non-null uint8
Agencia_ID           1000 non-null uint16
Canal_ID             1000 non-null uint8
Ruta_SAK             1000 non-null uint16
Cliente_ID           1000 non-null uint32
Producto_ID          1000 non-null uint16
Venta_uni_hoy        1000 non-null uint16
Venta_hoy            1000 non-null float32
Dev_uni_proxima      1000 non-null uint32
Dev_proxima          1000 non-null float32
Demanda_uni_equil    1000 non-null uint32
dtypes: float32(2), uint16(4), uint32(3), uint8(2)
memory usage: 37.1 KB
None


Unnamed: 0,id,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID
0,2107,10,1124,1,2136,184044,31588
1,4750,10,1155,4,6607,2385912,1145


<class 'pandas.core.frame.DataFrame'>
Int64Index: 516 entries, 0 to 999
Data columns (total 7 columns):
id             516 non-null uint32
Semana         516 non-null uint8
Agencia_ID     516 non-null uint16
Canal_ID       516 non-null uint8
Ruta_SAK       516 non-null uint16
Cliente_ID     516 non-null uint32
Producto_ID    516 non-null uint16
dtypes: uint16(3), uint32(2), uint8(2)
memory usage: 12.1 KB
None


Unnamed: 0,id,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID
2,6252,11,1631,1,2802,766465,35305
3,18978,11,1342,1,1207,2229028,43251


<class 'pandas.core.frame.DataFrame'>
Int64Index: 484 entries, 2 to 998
Data columns (total 7 columns):
id             484 non-null uint32
Semana         484 non-null uint8
Agencia_ID     484 non-null uint16
Canal_ID       484 non-null uint8
Ruta_SAK       484 non-null uint16
Cliente_ID     484 non-null uint32
Producto_ID    484 non-null uint16
dtypes: uint16(3), uint32(2), uint8(2)
memory usage: 11.3 KB
None


## Load train and test HDF!!

In [17]:
FE.ReadHdf('test')

Unnamed: 0,id,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID
0,2107,10,1124,1,2136,184044,31588
1,4750,10,1155,4,6607,2385912,1145


<class 'pandas.core.frame.DataFrame'>
Int64Index: 516 entries, 0 to 999
Data columns (total 7 columns):
id             516 non-null uint32
Semana         516 non-null uint8
Agencia_ID     516 non-null uint16
Canal_ID       516 non-null uint8
Ruta_SAK       516 non-null uint16
Cliente_ID     516 non-null uint32
Producto_ID    516 non-null uint16
dtypes: uint16(3), uint32(2), uint8(2)
memory usage: 12.1 KB
None


Unnamed: 0,id,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID
2,6252,11,1631,1,2802,766465,35305
3,18978,11,1342,1,1207,2229028,43251


<class 'pandas.core.frame.DataFrame'>
Int64Index: 484 entries, 2 to 998
Data columns (total 7 columns):
id             484 non-null uint32
Semana         484 non-null uint8
Agencia_ID     484 non-null uint16
Canal_ID       484 non-null uint8
Ruta_SAK       484 non-null uint16
Cliente_ID     484 non-null uint32
Producto_ID    484 non-null uint16
dtypes: uint16(3), uint32(2), uint8(2)
memory usage: 11.3 KB
None


In [18]:
FE.test1.reset_index(inplace=True)
FE.test1.drop("index",inplace=True,axis=1)

In [19]:
FE.test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 7 columns):
id             516 non-null uint32
Semana         516 non-null uint8
Agencia_ID     516 non-null uint16
Canal_ID       516 non-null uint8
Ruta_SAK       516 non-null uint16
Cliente_ID     516 non-null uint32
Producto_ID    516 non-null uint16
dtypes: uint16(3), uint32(2), uint8(2)
memory usage: 8.1 KB


In [20]:
FE.ReadFirstNRowsOfACsv(20,'train')

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil
0,3,1110,7,3303,324600,202,8,327.600006,0,0.0,8
1,3,1112,1,1604,327360,303,8,36.32,0,0.0,8


<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 0 to 19
Data columns (total 11 columns):
Semana               20 non-null uint8
Agencia_ID           20 non-null uint16
Canal_ID             20 non-null uint8
Ruta_SAK             20 non-null uint16
Cliente_ID           20 non-null uint32
Producto_ID          20 non-null uint16
Venta_uni_hoy        20 non-null uint16
Venta_hoy            20 non-null float32
Dev_uni_proxima      20 non-null uint32
Dev_proxima          20 non-null float32
Demanda_uni_equil    20 non-null uint32
dtypes: float32(2), uint16(4), uint32(3), uint8(2)
memory usage: 760.0 bytes
None


In [None]:
#FE.train.to_csv('../../input/train_20.csv', index= False)

## Test Setup

In [None]:
parameterDict = {"ValidationStart":8, 
    "ValidationEnd":9,
    "maxLag":2,    
    "trainHdfPath":'../../input/train_100.h5',
    "trainHdfFile":"train",
    "testHdfPath1":"../../input/train_100_1.h5",
    "testHdfPath2":"../../input/train_100_1.h5",
    "testHdfFile":"test", 
    "trainTypes" : {'Semana':np.uint8, 'Agencia_ID':np.uint16, 'Canal_ID':np.uint8,'Ruta_SAK':np.uint16, 
        'Cliente_ID':np.uint32, 'Producto_ID':np.uint16,'Venta_uni_hoy':np.uint16, 'Venta_hoy':np.float32,
                    'Dev_proxima':np.float32,'Demanda_uni_equil':np.uint32}, 
    "testTypes" : {'id':np.uint32,'Semana':np.uint8, 'Agencia_ID':np.uint16, 'Canal_ID':np.uint8,'Ruta_SAK':np.uint16,
        'Cliente_ID':np.uint32, 'Producto_ID':np.uint16},
    "trainCsvPath":'../../input/train_100.csv'   ,
    "testCsvPath":'../../input/test_100.csv'}

FE = FeatureEngineering(**parameterDict)
print FE.__dict__

In [None]:
FE.ReadCsv('train')
FE.Preprocess('train', [["Demanda_uni_equil",np.log1p,'float32']])
FE.SaveDataFrameToHdf('train')

In [None]:
FE.ReadHdf('train')
FE.SplitTrainToTestUsingValidationStart()

In [None]:
display(FE.test1.head(2))
display(FE.test2.head(2))
display(FE.train.head(2))

## Test1

In [None]:
FE.AddConfigurableFeaturesToTrain(configLag0Target1)

## Test 2

In [None]:
FE.AddConfigurableFeaturesToTrain(configLag2Target1 )

## Test 3

In [None]:
FE.AddConfigurableFeaturesToTrain(configLag1Target1)

## Test 4

In [None]:
FE.AddConfigurableFeaturesToTrain(configLag1Target1F)

## Test setup..

In [None]:
parameterDict = {"ValidationStart":6, 
    "ValidationEnd":7,
    "maxLag":2,    
    "trainHdfPath":'../../input/train_100.h5',
    "trainHdfFile":"train",
    "testHdfPath":"../../input/test.h5",
    "testHdfFile":"test", 
    "trainTypes" : {'Semana':np.uint8, 'Agencia_ID':np.uint16, 'Canal_ID':np.uint8,'Ruta_SAK':np.uint16, 
        'Cliente_ID':np.uint32, 'Producto_ID':np.uint16,'Venta_uni_hoy':np.uint16, 'Venta_hoy':np.float32,
                    'Dev_proxima':np.float32,'Demanda_uni_equil':np.uint32}, 
    "testTypes" : {'id':np.uint32,'Semana':np.uint8, 'Agencia_ID':np.uint16, 'Canal_ID':np.uint8,'Ruta_SAK':np.uint16,
        'Cliente_ID':np.uint32, 'Producto_ID':np.uint16},
    "trainCsvPath":'../../input/train_100.csv'   ,
    "testCsvPath":'../../input/test.csv'}

FE = FeatureEngineering(**parameterDict)
print FE.__dict__

In [None]:
#FE.ReadCsv('train')
#FE.Preprocess('train', [["Demanda_uni_equil",np.log1p,'float32']])
#FE.SaveDataFrameToHdf('train')

In [None]:
FE.ReadHdf('train')

In [None]:
display(FE.AddDemandaGeneralMean())

In [None]:
qw = FE.train.loc[FE.train.loc[:,'Semana']>4]
qw

In [None]:
FE.train.loc[:,'target'] = np.nan
FE.train

In [None]:
FE.train.loc[pd.isnull(FE.train['target']), 'target'] = FE.train.loc[pd.isnull(FE.train['target']), 'lag2P']
FE.train["target"]

In [None]:
FE.train.loc[FE.train['Semana']>4,"target"].isnull().sum()

In [None]:
FE.train.loc[FE.train['Semana']>4,"target"].isnull().sum()

In [None]:
FE.train.loc[pd.isnull(FE.train['target']), 'target'] = FE.train.loc[pd.isnull(FE.train['target']), 'lag2Cl']
FE.train["target"]

In [None]:
FE.train.loc[FE.train['Semana']>4,"target"].isnull().sum()

In [None]:
"Semana" not in FE.train.columns

In [None]:
FE.train

In [None]:
print "{} haha {}".format(1,2)

In [None]:
for i in FE.train.groupby('Canal_ID')['Demanda_uni_equil']:
    print i

In [None]:
FE.train[['Canal_ID','Demanda_uni_equil']]

In [None]:
["Cliente_ID","Producto_ID"] + 'abc'

In [21]:
import xgboost as xgb

params = {}
params['objective'] = "reg:linear"
params['eta'] = 0.1
params['max_depth'] = 5
params['subsample'] = 0.8
params['colsample_bytree'] = 0.6
params['silent'] = True
'alpha': 0.0001, 'lambda': 1

SyntaxError: invalid syntax (<ipython-input-21-818fa5b8aef9>, line 10)

In [None]:
import xgboost as xgb

In [None]:
regres = xgb.XGBRegressor()

In [None]:
regres