# Bimbo Group - Inventory Demand Challenge - Merged and Cleaned Datasets

In [11]:
# IMPORT PACKAGES (SKLEARN, PANDAS, NUMPY, MATPLOTLIB)
from sklearn import datasets
from sklearn import metrics
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [43]:
initialize = False # True if you want to create trainsample
path = './data/'
train_data = pd.read_csv(path+'trainsample.csv', low_memory=False)


In [44]:
# Create trainsample with only 10% of the total dataset --> 74k out of 7.4 million
if initialize:
    print('initializing trainsample...')
    train_data = pd.read_csv('./data/train.csv', low_memory=False)
    train_sample = train_data.sample(frac=0.1, random_state=42)
    train_sample.to_csv('./data/trainsample.csv', index=False)

In [45]:
train_data.shape

(7418046, 11)

In [46]:
#Remove First to Columns
#train_data = train_data.iloc[:,2:]

In [47]:
train_data

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil
0,4,1441,1,5533,7701781,45111,4,35.92,0,0.0,4
1,6,2229,1,1101,312220,3270,7,73.29,0,0.0,7
2,7,1945,1,8013,877789,2425,15,67.50,0,0.0,15
3,6,2012,1,1159,7878703,1309,3,20.28,0,0.0,3
4,5,2054,1,1125,283175,47840,10,80.00,0,0.0,10
...,...,...,...,...,...,...,...,...,...,...,...
7418041,8,1239,1,1029,993667,3144,2,42.64,0,0.0,2
7418042,9,2275,4,6626,913571,8931,27,310.50,0,0.0,27
7418043,8,1331,1,1004,382637,3631,1,16.35,0,0.0,1
7418044,6,1232,1,1209,2070944,41938,2,19.82,0,0.0,2


In [48]:
# Verify Data
train_data.columns

Index(['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Cliente_ID',
       'Producto_ID', 'Venta_uni_hoy', 'Venta_hoy', 'Dev_uni_proxima',
       'Dev_proxima', 'Demanda_uni_equil'],
      dtype='object')

In [49]:
# Rename Columns
train_data.columns = ['Week_number','Sales_Depot_ID','Sales_Channel_ID','Route_ID','Client_ID',
'Product_ID','Weekly_Sales_Units','Weekly_Sales', 'Returns_Units_Next_Week_Int',
                      'Returns_Units_Next_Week_Pesos','Adjusted_Demand']

In [50]:
#Train_data: View first 10 rows of data
train_data.head(10)

Unnamed: 0,Week_number,Sales_Depot_ID,Sales_Channel_ID,Route_ID,Client_ID,Product_ID,Weekly_Sales_Units,Weekly_Sales,Returns_Units_Next_Week_Int,Returns_Units_Next_Week_Pesos,Adjusted_Demand
0,4,1441,1,5533,7701781,45111,4,35.92,0,0.0,4
1,6,2229,1,1101,312220,3270,7,73.29,0,0.0,7
2,7,1945,1,8013,877789,2425,15,67.5,0,0.0,15
3,6,2012,1,1159,7878703,1309,3,20.28,0,0.0,3
4,5,2054,1,1125,283175,47840,10,80.0,0,0.0,10
5,5,1123,1,1459,2390508,1284,3,9.06,0,0.0,3
6,9,2032,1,2859,1289871,35305,8,50.0,0,0.0,8
7,6,2034,1,1052,611918,1160,1,18.86,0,0.0,1
8,7,1335,1,1052,187013,1160,2,37.72,1,18.86,1
9,6,2020,1,1002,254537,1109,2,30.02,0,0.0,2


In [51]:
#Train_data: Shape
train_data.shape

(7418046, 11)

In [52]:
# SUMMARY STATISTICS FOR TRAIN DATASET:
train_data.iloc[:,6:].describe()

Unnamed: 0,Weekly_Sales_Units,Weekly_Sales,Returns_Units_Next_Week_Int,Returns_Units_Next_Week_Pesos,Adjusted_Demand
count,7418046.0,7418046.0,7418046.0,7418046.0,7418046.0
mean,7.314033,68.51465,0.1299957,1.260893,7.228798
std,22.16072,307.4494,5.431801,47.9841,21.95426
min,0.0,0.0,0.0,0.0,0.0
25%,2.0,16.76,0.0,0.0,2.0
50%,3.0,30.0,0.0,0.0,3.0
75%,7.0,56.1,0.0,0.0,6.0
max,4983.0,151925.8,9765.0,93744.0,4983.0


In [53]:
#Import and Rename Client Table data --> all 935k
client_table = pd.read_csv(path+'cliente_tabla.csv', low_memory=False)
client_table.columns = ['Client_ID', 'Client_Name']
print (client_table.head(10))

   Client_ID                              Client_Name
0          0                               SIN NOMBRE
1          1                         OXXO XINANTECATL
2          2                               SIN NOMBRE
3          3                                EL MORENO
4          4  SDN SER  DE ALIM  CUERPO SA CIA  DE INT
5          4     SDN SER DE ALIM CUERPO SA CIA DE INT
6          5                               LA VAQUITA
7          6                                   LUPITA
8          7                             I M EL GUERO
9          8                     MINI SUPER LOS LUPES


In [54]:
client_table.shape

(935362, 2)

In [55]:
# CODE FROM KERNEL: MAKE ALL STRING VALUES UPPER CASE
client_table['Client_Name'] = client_table['Client_Name'].str.upper()

In [56]:
# CODE FROM KERNEL: VALUE COUNTS ON FIRST 200 ROWS
client_table['Client_Name'].value_counts()[0:200]

Client_Name
NO IDENTIFICADO    281670
LUPITA               4863
MARY                 3016
LA PASADITA          2426
LA VENTANITA         2267
                    ...  
ORTIZ                 239
RIVERA                238
LA CURVA              238
TANIA                 238
JUAREZ                236
Name: count, Length: 200, dtype: int64

In [59]:
# TF-IDF Score List:
# Input: client_table (dataframe), list_len (?)
# Output: ?
def tfidf_score_list(client_table, list_len):
    # Import TfidfVectorizer function from sklearn.feature_extraction.text
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    # v <- TfidfVectorizer 
    v = TfidfVectorizer()

    # "New" column erted into client_table (df)
    client_table['New'] = 'na'
    # Joins "Client Name" on empty df
    a = " ".join(client_table['Client_Name'])
    # Renames new column with values from  "Client Name" 
    client_table['New'][0] = a

    # TFIDF - Fit  Transform on client_table['New'] 
    tfidf = v.fit_transform(client_table['New'])

    # define feature names 
    feature_names = v.get_feature_names_out()
    
    # define freq as empty array
    freq = []
    # define object doc equal to 0.
    doc = 0
    #  define feature index as first row of TFIDT array
    # array has been sorted for nonzero ?? --> figure out shape of TFIDF array
    feature_index = tfidf[doc,:].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf[doc, x] for x in feature_index])
    for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
            freq.append((w.encode('utf-8'),s))
    
    del client_table['New']
    
    import numpy as np
    names = ['word','score']
    formats = ['S50','f8']
    dtype = dict(names = names, formats=formats)
    array = np.array(freq, dtype=dtype)

    b = np.sort(array, order='score')
    
    if list_len > len(b)+1:
        list_len = len(b)+1
    for i in range(1,list_len):
        print(b[-i])

In [60]:
# PRINT TFIDF SCORE LIST:
tfidf_score_list(client_table, 200)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  client_table['New'][0] = a


(b'no', 0.6888126)
(b'identificado', 0.68492922)
(b'la', 0.14990532)
(b'el', 0.08328479)
(b'abarrotes', 0.08000932)
(b'de', 0.06076993)
(b'maria', 0.04681942)
(b'miscelanea', 0.03805081)
(b'super', 0.03520332)
(b'los', 0.02841653)
(b'san', 0.02574169)
(b'oxxo', 0.02257322)
(b'del', 0.02166863)
(b'garcia', 0.02137683)
(b'hernandez', 0.02099506)
(b'jose', 0.01983515)
(b'lupita', 0.01926128)
(b'gonzalez', 0.0177585)
(b'martinez', 0.01764665)
(b'lopez', 0.01715059)
(b'mini', 0.0153317)
(b'rodriguez', 0.01489156)
(b'las', 0.01388971)
(b'don', 0.01254257)
(b'comodin', 0.01238451)
(b'guadalupe', 0.01235776)
(b'mary', 0.01227022)
(b'jesus', 0.01202462)
(b'ramirez', 0.01199544)
(b'juan', 0.01196626)
(b'casa', 0.01192979)
(b'ag', 0.01182036)
(b'sanchez', 0.01156747)
(b'perez', 0.01119299)
(b'farmacia', 0.01111518)
(b'hermanos', 0.01088173)
(b'cruz', 0.01042458)
(b'flores', 0.0096197)
(b'mi', 0.00899233)
(b'rosa', 0.00867621)
(b'carmen', 0.00854976)
(b'diconsa', 0.00843547)
(b'papeleria', 0.00820

In [61]:
# PRINT ELEMENTS OF CLIENT_NAMES LIST THAT CONTAIN *CAFE*
print(client_table[client_table['Client_Name'].str.contains('.*CAFE.*')])

        Client_ID                     Client_Name
78           1438           CAFETRIA PREPARATORIA
1095         5045   CAFETERIA DE LA SECUNDARIA 13
1098         5048               CAFETERIA PREPA 2
1233         5416                       CAFETERIA
1318         5612  CAFETERIA NORMAL DE PROFESORES
...           ...                             ...
934488    9678492         CAFETERIA LA CASA VIEJA
934720    9693686        SECUNDARIA 7 CAFETERIA 2
934786    9702948               NUEVO CAFE AZTECA
934942    9711388                 CAFETERIA SALOM
935214    9746888               CAFETERIA LA VACA

[2508 rows x 2 columns]


In [62]:
# FILTER CLIENT NAME VALUES

def filter_specific(vf2):
    client_table['Client_Name'] = client_table['Client_Name'].str.replace('.*REMISION.*','Consignment')
    client_table['Client_Name'] = client_table['Client_Name'].replace(['.*WAL MART.*','.*SAMS CLUB.*'],'Walmart', regex=True)
    client_table['Client_Name'] = client_table['Client_Name'].str.replace('.*OXXO.*','Oxxo Store')
    client_table['Client_Name'] = client_table['Client_Name'].str.replace('.*CONASUPO.*','Govt Store')
    client_table['Client_Name'] = client_table['Client_Name'].str.replace('.*BIMBO.*','Bimbo Store')
    client_table['Client_Name'] = client_table['Client_Name'].replace(['.*COLEG.*','.*UNIV.*','.*ESCU.*','.*INSTI.*',\
                                                        '.*PREPAR.*'],'School', regex=True)
    client_table['Client_Name'] = client_table['Client_Name'].str.replace('.*PUESTO.*','Post')
    client_table['Client_Name'] = client_table['Client_Name'].replace(['.*FARMA.*','.*HOSPITAL.*','.*CLINI.*'],'Hospital/Pharmacy', regex=True)
    client_table['Client_Name'] = client_table['Client_Name'].replace(['.*CAFE.*','.*CREMERIA.*','.*DULCERIA.*',\
                                                        '.*REST.*','.*BURGER.*','.*TACO.*', '.*TORTA.*',\
                                                        '.*TAQUER.*','.*HOT DOG.*',\
                                                        '.*COMEDOR.*', '.*ERIA.*','.*BURGU.*'],'Eatery', regex=True)
    client_table['Client_Name'] = client_table['Client_Name'].str.replace('.*SUPER.*','Supermarket')
    client_table['Client_Name'] = client_table['Client_Name'].replace(['.*COMERCIAL.*','.*BODEGA.*','.*DEPOSITO.*',\
                                                            '.*ABARROTES.*','.*MERCADO.*','.*CAMBIO.*',\
                                                        '.*MARKET.*','.*MART .*','.*MINI .*',\
                                                        '.*PLAZA.*','.*MISC.*','.*ELEVEN.*','.*EXP.*',\
                                                         '.*SNACK.*', '.*PAPELERIA.*', '.*CARNICERIA.*',\
                                                         '.*LOCAL.*','.*COMODIN.*','.*PROVIDENCIA.*'
                                                        ],'General Market/Mart'\
                                                       , regex=True)
    client_table['Client_Name'] = client_table['Client_Name'].replace(['.*VERDU.*','.*FRUT.*'],'Fresh Market', regex=True)
    client_table['Client_Name'] = client_table['Client_Name'].replace(['.*HOTEL.*','.*MOTEL.*'],'Hotel', regex=True)

In [63]:
# CALL FILTER_SPECIFIC FUNCTION
filter_specific(client_table)

In [64]:
# --- Begin filtering for more general terms
# The idea here is to look for names with particles of speech that would
# not appear in a person's name.
# i.e. "Individuals" should not contain any participles or numbers in their names.
def filter_participle(client_table):
    client_table['Client_Name'] = client_table['Client_Name'].replace([
            '.*LA .*','.*EL .*','.*DE .*','.*LOS .*','.*DEL .*','.*Y .*', '.*SAN .*', '.*SANTA .*',\
            '.*AG .*','.*LAS .*','.*MI .*','.*MA .*', '.*II.*', '.*[0-9]+.*'\
    ],'Small Franchise', regex=True)

In [65]:
# CALL FILTER_PARTICIPLE FUNCTION
filter_participle(client_table)

In [66]:
# Any remaining entries should be "Individual" Named Clients, there are some outliers.
# More specific filters could be used in order to reduce the percentage of outliers in this final set.
def filter_remaining(client_table):
    def function_word(data):
        # Avoid the single-words created so far by checking for upper-case
        if (data.isupper()) and (data != "NO IDENTIFICADO"): 
            return 'Individual'
        else:
            return data
    client_table['Client_Name'] = client_table['Client_Name'].map(function_word)

In [67]:
# CALL FILTER_REMAINING FUNCTION
filter_remaining(client_table)

In [68]:
# CALL VALUE COUNTS ON CLIENT NAME COLUMN
client_table['Client_Name'].value_counts()

Client_Name
Individual             367668
NO IDENTIFICADO        281670
Small Franchise        167880
General Market/Mart     73544
Eatery                  30566
Hospital/Pharmacy        5841
School                   5765
Hotel                    1129
Fresh Market             1079
Walmart                   220
Name: count, dtype: int64

In [69]:
client_table.to_csv(path+"Client_Table_Filtered.csv", index=False)

In [72]:
#Import Product Table --> 2.6k
product_table = pd.read_csv(path+'producto_tabla.csv', low_memory=False)
product_table.columns = ['Product_ID', 'Product_Name']
product_table.count()

Product_ID      2592
Product_Name    2592
dtype: int64

In [76]:
#Import Town_State table --> all
town_state = pd.read_csv(path+'town_state.csv', low_memory=False)

town_state.rename(columns = {'Agencia_ID':'Sales_Depot_ID'}, inplace=True)
print (town_state.head(100))

    Sales_Depot_ID                        Town             State
0             1110          2008 AG. LAGO FILT      MÉXICO, D.F.
1             1111       2002 AG. AZCAPOTZALCO      MÉXICO, D.F.
2             1112         2004 AG. CUAUTITLAN  ESTADO DE MÉXICO
3             1113          2008 AG. LAGO FILT      MÉXICO, D.F.
4             1114        2029 AG.IZTAPALAPA 2      MÉXICO, D.F.
..             ...                         ...               ...
95            1335       2264 MANZANILLO BIMBO            COLIMA
96            1336  2269 PUERTO VALLARTA BIMBO           JALISCO
97            1337        2277 ZACATECAS BIMBO         ZACATECAS
98            1338       2274 TEPATITLAN BIMBO           JALISCO
99            1339          2267 OCOTLAN BIMBO           JALISCO

[100 rows x 3 columns]


In [77]:
#Table1: Merge train with client
table1 = pd.merge(train_data, client_table, how = 'inner', on='Client_ID')

In [78]:
#Table2: Merge table1 with product
table2 = pd.merge(table1, product_table, how='inner', on='Product_ID')

In [79]:
#Table3: Merge table2
table3 = pd.merge(table2, town_state, how='inner', on='Sales_Depot_ID')

In [81]:
table3.to_csv(path+"SampleBimbo.csv", index=False)