############################################################################################################################
# Método 1: Probabilidad bayesiana 

# Consideraciones:
Previamente al cálculo se ha considerado que las siguientes variables no juegan un papel claro en este método: Socio_Demo_01 (edad), Socio_Demo_02 (antigüedad), Socio_Demo_04 (sexo) y Socio_Demo_05 (sector). Así que se ha descartado utilizarlas.


De esta forma, se ha considerado oportuno clasificar los clientes el banco entre cuatro distintas categorías:

- Categoría 1: clientes del 20% con ingresos altos. 

- Categoría 2: clientes del 20% con ingresos medios o bajos.

- Categoría 3: clientes del 80% con ingresos altos.

- Categoría 4: clientes del 80% con ingresos medios o bajos. 

# Método:

En este modelo simple solamente nos fijamos en el último producto contratado para predecir el siguiente. Así, si un cliente ha comprado un producto B, tenemos que calcular la probabilidad de que después de B compre cualquier de los otros productos, y quedarnos con el que tenga una probabilidad más alta.

Así, solo tenemos que contar el número de veces que sale el producto B con el producto A y el número de veces que sale el producto B.

$$ P(A | B) = \frac{P(B | A)P(A)}{P(B)} =  \frac{\frac{P(B∩A)}{P(A)}P(A)}{P(B)} = \frac{P(B∩A)}{P(B)}$$


Como hemos clasificado los usuarios en 4 categorías, primero tendremos que generar un vector con los dos últimos productos comprados por cada usuario en cada categoría. Una vez hecho esto, y para cada usuario de test, tendremos que clasificarlo en una de las 4 categorías dependiendo del número de productos que tiene y de los ingresos. Luego, cogeremos el último producto contratado y calcularemos las probabilidades del siguiente producto utilizando la fórmula de Bayes con el dataset de train de la categoría previamente seleccionada.

# Código comentado:

In [81]:
import pandas as pd
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
import numpy as np
# IMPORT TRAIN DATA
data = pd.read_csv('train2.txt', header = 0, delimiter='|')

In [82]:
#How many different products do we have?
different_produts = data.groupby('Cod_Prod').Cod_Prod.nunique()

#First we will separate our data between 20% and 80% users:
data_ID_byProd = data.groupby('ID_Customer').size().sort_values(ascending=False, )
IDs_20 = data_ID_byProd.head(np.int(data_ID_byProd.size*0.2+1))
IDs_80 = data_ID_byProd.tail(np.int(data_ID_byProd.size*0.8-1))
ID20_idx = np.in1d(data.ID_Customer.values, IDs_20.index)
data_IDs_20 = data.ix[ID20_idx]
ID80_idx = np.in1d(data.ID_Customer.values, IDs_80.index)
data_IDs_80 = data.ix[ID80_idx]

#Let's create an array with all the different users that we have:
data_users = data.groupby('ID_Customer').ID_Customer.nunique()

#Now we will divide our data into 4 categories: 
# 1 - Users that buy a lot of producs (20%) with high income (Socio_Demo_03 > 3)
# 2 - Users that buy a lot of products (20%) with midium or low income (Socio_Demo_03 <= 3)
# 3 - Users that buy a few products (80%) with high income (Socio_Demo_03 > 3)
# 4 - Users that buy a few products (80%) with medium or low income (Socio_Demo_03 <= 3)

#The new data will be:

# Category 1:
users = data_users.sample(10000)
data_20_high_income = data_IDs_20[(data_IDs_20.Socio_Demo_03 > 3)]
data_20_high_income = data_20_high_income[data_20_high_income.ID_Customer.isin(users.index[:])]
unique_users_20_high_income = data_20_high_income.groupby('ID_Customer').ID_Customer.nunique()

# Category 2:
users = data_users.sample(10000)
data_20_low_income = data_IDs_20[(data_IDs_20.Socio_Demo_03 <= 3)]
data_20_low_income = data_20_low_income[data_20_low_income.ID_Customer.isin(users.index[:])]
unique_users_20_low_income = data_20_low_income.groupby('ID_Customer').ID_Customer.nunique()

#Cateogry 3:
users = data_users.sample(10000)
data_80_high_income = data_IDs_80[(data_IDs_80.Socio_Demo_03 > 3)]
data_80_high_income = data_80_high_income[data_80_high_income.ID_Customer.isin(users.index[:])]
unique_users_80_high_income = data_80_high_income.groupby('ID_Customer').ID_Customer.nunique()

# Category 4:
users = data_users.sample(10000)
data_80_low_income = data_IDs_80[(data_IDs_80.Socio_Demo_03 <= 3)]
data_80_low_income = data_80_low_income[data_80_low_income.ID_Customer.isin(users.index[:])]
unique_users_80_low_income = data_80_low_income.groupby('ID_Customer').ID_Customer.nunique()

In [83]:
#To check how well our model work we will select other random users to test:

users = data_users.sample(1000)
data_test = data[data.ID_Customer.isin(users.index[:])]
unique_users_test = data_test.groupby('ID_Customer').ID_Customer.nunique()

In [84]:
def last_two_products(dataFrame,usr):   
    """
    This function returns the last two products purchased by one user.
    """
    
    items = dataFrame[dataFrame.ID_Customer == usr].Cod_Fecha.size
    
    times = []
    index_max = 0
    index_product = np.zeros(items)
    index_second_max = 0
    
    for i in range(items):
        index_product[i] = dataFrame[dataFrame.ID_Customer == usr].index[i]
        
        data = dataFrame[dataFrame.ID_Customer == usr].Cod_Fecha[index_product[i]]
        new_year, month = data.split('-')
        new_time = float(new_year) + (float(month)/12)
        times.append(float(new_time))
    
    #Index of last item purchased:
    index_max = times.index(max(times))
    times[index_max] = 0
    #Index of second last item purchased:
    index_second_max = times.index(max(times))
    times[index_second_max] = 0
    
    last_product = dataFrame[dataFrame.ID_Customer == usr].Cod_Prod[index_product[index_max]]
    second_last_product = dataFrame[dataFrame.ID_Customer == usr].Cod_Prod[index_product[index_second_max]]

    return second_last_product, last_product

In [None]:
#We will now generate two vectors with the last two products purchased by the forth categories of users that we made before:

# Category 1:
second_last_products_20_high_income = []
last_products_20_high_income = []

for i in range(unique_users_20_high_income.size):

    second_last_product, last_product = last_two_products(data_20_high_income, unique_users_20_high_income.index[i])

    if second_last_product != last_product:
        second_last_products_20_high_income.append(second_last_product)
        last_products_20_high_income.append(last_product)

        
# Category 2:
second_last_products_20_low_income = []
last_products_20_low_income = []

for i in range(unique_users_20_low_income.size):

    second_last_product, last_product = last_two_products(data_20_low_income, unique_users_20_low_income.index[i])

    if second_last_product != last_product:
        second_last_products_20_low_income.append(second_last_product)
        last_products_20_low_income.append(last_product)

        
# Category 3:
second_last_products_80_high_income = []
last_products_80_high_income = []

for i in range(unique_users_80_high_income.size):

    second_last_product, last_product = last_two_products(data_80_high_income, unique_users_80_high_income.index[i])

    if second_last_product != last_product:
        second_last_products_80_high_income.append(second_last_product)
        last_products_80_high_income.append(last_product)

        
# Category 4:
second_last_products_80_low_income = []
last_products_80_low_income = []

for i in range(unique_users_80_low_income.size):

    second_last_product, last_product = last_two_products(data_80_low_income, unique_users_80_low_income.index[i])

    if second_last_product != last_product:
        second_last_products_80_low_income.append(second_last_product)
        last_products_80_low_income.append(last_product)


In [None]:
# We will now create a function that returns the most likly product that a user will buy afeter buying one product. 

def next_product(number,different_produts,second_last_products,last_products):
    """
    This function returns the product most likely that a user will buy based on the last product purchased ('number') and
    the vectors 'second_last_products' and 'last_products'
    """
    
    count_down = 0

    for i in range(len(last_products)):
        if (last_products[i] == number) or (second_last_products[i] == number):
            count_down += 1        
            
    likelihood = np.zeros(different_produts.size)

    for j in range(different_produts.size):

        count_up = 0
        number_last = different_produts.index[j]
        number_second = number

        for i in range(len(last_products)):
            if (last_products[i] == number_last) and (second_last_products[i] == number_second):
                count_up += 1 
        try:
            likelihood[j] = 100*float(count_up)/float(count_down)
        except:
            likelihood[j] = 0

    index = np.argmax(likelihood)
    
    return different_produts.index[index]

In [None]:
#Now let's test how our model works with the test data created before. 

#First we will extract the last product and second last product purchased by our test users. For the prediccion we will use
# the second last product, and to check how well our model works we will use the last product purchased.

product_previous = []
product_match = []

for i in range(int(unique_users_test.size)):
    
    second_last_product, last_product = last_two_products(data_test, unique_users_test.index[i])

    product_previous.append(second_last_product)
    product_match.append(last_product)
    
#And now we make the prediccion:

values = np.zeros(len(product_previous))

for i in range(int(unique_users_test.size)):
    
    # Number of products purchased by the user:
    total_number_products = data_test[data_test.ID_Customer == unique_users_test.index[i]].Cod_Prod.size - 1
    
    # Income of the user:
    income = data_test[data_test.ID_Customer == unique_users_test.index[i]].Socio_Demo_03.mean()
    
    if total_number_products >= 7 :
        if income >= 3: 
            # Category 1:
            values[i] =next_product(product_previous[i],different_produts,second_last_products_20_high_income,last_products_20_high_income)
        else:
            #Category 2:
            values[i] =next_product(product_previous[i],different_produts,second_last_products_20_low_income,last_products_20_low_income)
    else:
        if income >= 3:
            # Category 3:
            values[i] =next_product(product_previous[i],different_produts,second_last_products_80_high_income,last_products_80_high_income)
        else:
            # Category 4:
            values[i] =next_product(product_previous[i],different_produts,second_last_products_80_low_income,last_products_80_low_income)
            
    

In [None]:
correct_answer = 0

for i in range(len(product_match)):
       
    if product_match[i] == values[i]:
        correct_answer += 1

score = 100*float(correct_answer) / float(len(product_match))
print 'Our model can predict the next product that a user will buy with a score of ', score, '%'

# Conclusiones

Con un método sencillo, basado en probabilidades bayesianas y haciendo una clasificación previa de usuarios, se ha conseguido unos resultados con aciertos promedio del 27%. 

########################################################################################################################
# Method 2: Machine Learning 

In [1]:
import time
import numpy as np
import sklearn
import pandas as pd
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn import model_selection

# IMPORT THE DATA
data = pd.read_csv('train2.txt', header = 0, delimiter='|')
# Sort the data by users first, and by date later
data.sort_values(['ID_Customer', 'Cod_Fecha'], ascending=[True, False], inplace=True)

# DATA EXPLORATION
# Array containing all different products:
products_vect = data.groupby('Cod_Prod').size().sort_values(ascending=False, ).index
print 'Cod_Prod and number of appearances:'
print data.groupby('Cod_Prod').size().sort_values(ascending=False, ).head(10)
print 'There are {} different products'.format(data.groupby('Cod_Prod').size().sort_values(ascending=False, ).size)
#print products_vect

Cod_Prod and number of appearances:
Cod_Prod
601     661756
301     426169
201     339686
2302    268166
9993    230423
9991    230423
2205    106478
2704    101727
2601    100163
704      91809
dtype: int64
There are 94 different products


## SPLIT THE DATA INTO 2 GROUPS:

1- 20% of users who purchase MORE products. They generate most of the business for the company and are more likely to buy another product.

2- 80% of users who purchase LESS products.

In [2]:
# SPLIT 20% 80%
# Assumption: clients who have purchased more products are bringing more benefits to CAJAMAR
data_ID_byProd = data.groupby('ID_Customer').size().sort_values(ascending=False, )
IDs_20 = data_ID_byProd.head(np.int(data_ID_byProd.size*0.2+1))
IDs_80 = data_ID_byProd.tail(np.int(data_ID_byProd.size*0.8-1))

In [3]:
print '20% of clients who purchase MORE products are {} clients'.format(IDs_20.shape[0])
print '80% of clients who purchase LESS products are {} clients'.format(IDs_80.shape[0])
print '{}: is the LIMIT of products that a client has purchased to be included in the 20% of top purchasers'.format(IDs_20[-1])

20% of clients who purchase MORE products are 135275 clients
80% of clients who purchase LESS products are 541095 clients
7: is the LIMIT of products that a client has purchased to be included in the 20% of top purchasers


## Work with 20% of USERS WHO PURCHASE MORE PRODUCTS.

# FEATURE EXTRACTION:
    # FOR EVERY USER Create one new variable for each Product with values:
    - 0: if not purchased
    - 1/inverse_order_of_purchasing: if purchased
    
    # EXAMPLE:
    Assuming there are 5 possible products: A, B, C, D, E, F.
    
    Customer1 purchased, in this order, products: A(year 2010),C(year 2011),F(year 2012).
    
    This NEW FEATURE has a field for each product, so has 6 fields (length=94 for Cajamar data).
    
    Then this new feature will be
                A   B   C   D   E   F
    feature = [1/3, 0, 1/2, 0,  0, 1/1]
    
    Because F is the last product he purchased, C the second older and the first was A.
    
    ****
    ##IN THIS WAY, INFORMATION ABOUT WHICH PRODUCTS AND IN WHICH ORDER HAVE BEEN PURCHASED BY A USER IS GATHERED
    ****

In [4]:
ID20_idx = np.in1d(data.ID_Customer.values, IDs_20.index)
data_IDs_20 = data.ix[ID20_idx]
# Obtention of  all the products purchased by user in an array
data_IDs_20_PRODS = data_IDs_20[['ID_Customer','Cod_Prod']].groupby('ID_Customer').agg(lambda x: [np.array(x['Cod_Prod'].values)])
data_IDs_20_PRODS.head()

Unnamed: 0_level_0,Cod_Prod
ID_Customer,Unnamed: 1_level_1
A0000008,"[2707, 9992, 2602, 506, 1001, 3401, 601, 801, ..."
A0000011,"[704, 2205, 1011, 301, 601, 9991, 9993, 2302]"
A0000023,"[2302, 704, 2705, 301, 9991, 9993, 201, 2704, ..."
A0000024,"[2205, 2302, 9991, 9993, 2601, 301, 201, 2704,..."
A0000032,"[9992, 2102, 9993, 9991, 201, 301, 2704, 2302,..."


In [57]:
print '20% OF ACTIVE USERS PURCHASE THE '+str(100*data_IDs_20.shape[0]/data.shape[0])+'% OF THE PRODUCTS'

20% OF ACTIVE USERS PURCHASE THE 40% OF THE PRODUCTS


In [5]:
# SPLIT THE LAST PRODUCT PURCHASED (TARGET) FROM ALL OTHERS (INPUTS)
data_IDs_20_PRODS_inputs = data_IDs_20_PRODS['Cod_Prod'].apply(lambda x: x[1:])
data_IDs_20_PRODS_tagets = data_IDs_20_PRODS['Cod_Prod'].apply(lambda x: x[0])
#print data_IDs_20_PRODS_inputs.head()
#print data_IDs_20_PRODS_tagets.head()

In [6]:
# function that creates new features from puchased products accordingly with the order of purchasing
#  not purchased: 0
#  bought: 1/inverse_order_of_purchasing
def product_features(ID_prods,products_vect):
    products_feature = np.zeros(94)
    for i in range(ID_prods.size):
        idx = np.where(products_vect==ID_prods[i])
        products_feature[idx] = 1.0/(i+1)
    return products_feature

data_ID_20_Prod_aux = data_IDs_20_PRODS_inputs.apply(lambda x:product_features(x,products_vect))
#print data_ID_20_Prod_aux.head()

## BUILDING THE TRAINING DATASET

Consisis of variables Socio_Demo_01, Socio_Demo_02, Socio_Demo_03, Socio_Demo_05 and "the 94 features containing information about the purchased products" for every user 

In [7]:
# BUILD FEATURES DATASET. WITHOUT GENDER, WITH 4 USER ATTRIBUTES, A FEATURE FOR THE DATE AND 94 FEATURES FOR THE PRODUCTS
data_IDs_20_groupedID = data_IDs_20.groupby('ID_Customer').mean()
data_IDs_20_groupedID.drop(['Cod_Prod','Socio_Demo_04'], axis=1, inplace=True)
data_IDs_20_groupedID['Product_feat'] = data_ID_20_Prod_aux.values
features = data_IDs_20_groupedID[[u'Socio_Demo_01', u'Socio_Demo_02', u'Socio_Demo_03', u'Socio_Demo_05']]
feature_prod = data_IDs_20_groupedID.Product_feat.values
train_features_20 = []
for i in range(features.shape[0]):
    train_features_20.append(np.concatenate((feature_prod[i],features.values[i,:])))
train_features_20 = np.array(train_features_20)
print 'TRAIN FEATURES AND THEIR DIMENSION:\n'
print data_IDs_20_groupedID.columns
print train_features_20.shape
# FINALLY A NUMPY ARRAY WITH ALL THE FEATURES FOR EACH USERS CONCATENATED IS OBTAINED
print 'EXAMPLE OF AN ARRAY OF FEATURES FOR A SINGLE USER:'
train_features_20[0]

TRAIN FEATURES AND THEIR DIMENSION:

Index([u'Socio_Demo_01', u'Socio_Demo_02', u'Socio_Demo_03', u'Socio_Demo_05',
       u'Product_feat'],
      dtype='object')
(135275L, 98L)
EXAMPLE OF AN ARRAY OF FEATURES FOR A SINGLE USER:


array([ 0.16666667,  0.1       ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.08333333,  0.09090909,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.5       ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.125     ,  0.14285714,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.25      ,  0.        ,  0.        ,  0.        ,  0.2       ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.11111111,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.33333333,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [8]:
# the TARGETS are set in a numpy array also
train_targets_20 = data_IDs_20_PRODS_tagets.values.reshape(-1,1)

## THE LEARNING PROCESS

- SKLEARN ensemble.RandomForestClassifier
- Parameters of classifier OPTIMIZED by SKLEARN model_selection.GridSearchCV

In [71]:
# entrenar sobre una alíquota.
# CORRER-HO SOBRE EL TEST2.TXT
# REPETIR-HO pel 80%

# All datasets (Train and Test) are extracted from train2.txt file
X_train_20, X_test_20, y_train_20, y_test_20 = sklearn.model_selection.train_test_split( train_features_20, train_targets_20, test_size=0.2, random_state=42)
#model = ensemble.RandomForestClassifier().fit(X_train_20, y_train_20.ravel())
#prediction = model.predict(X_test_20)
#model.score(X_test_20, y_test_20.ravel())

# TRAINING DATA IS TOO BIG, TAKES TOO LONG TO TRAIN MODELS ON IT.
# TAKE SOME SAMPLES FROM ALL DATA TO TRAIN
idx_20 = np.random.randint(y_train_20.size, size=60000)
X_train_20 = X_train_20[idx_20,:]
y_train_20 = y_train_20[idx_20]

In [74]:
#param_grid = {'n_estimators': [50, 75, 100],
#             'criterion': ['gini','entropy']}
start_RF_20 = time.time()

print 'AFTER OPTIMIZING PARAMETERS, n_estimators=100 AND criterion="gini" HAVE BEEN CHOSEN'
param_grid_RF = {'n_estimators': [90],
             'criterion': ['gini']}
RF_opt_20 = model_selection.GridSearchCV(ensemble.RandomForestClassifier(), param_grid_RF)
RF_opt_20.fit(X_train_20, y_train_20.ravel())
prediction_test_20 = RF_opt_20.predict(X_test_20)
accuracy_test_20_RF = RF_opt_20.score(X_test_20, y_test_20.ravel())
print RF_opt_20.best_params_
print 'ACCURACY CV SET: ' + str(RF_opt_20.best_score_)
print 'RANDOM FOREST ACCURACY ON TEST SET: {}'.format(accuracy_test_20_RF)

end_RF_20 = time.time()
print 'TRAINING RANDOM FOREST TOOK {} SECONDS'.format(end_RF_20-start_RF_20)

AFTER OPTIMIZING PARAMETERS, n_estimators=100 AND criterion="gini" HAVE BEEN CHOSEN
{'n_estimators': 90, 'criterion': 'gini'}
ACCURACY CV SET: 0.6565125
RANDOM FOREST ACCURACY ON TEST SET: 0.43507669562
TRAINING RANDOM FOREST TOOK 258.223999977 SECONDS


In [11]:
#start_GB = time.time()
#param_grid_GB =  {'n_estimators': [100,150],
#                'max_depth':[3,4]}
#param_grid_GB =  {'n_estimators': [50],
#                'max_depth':[3]}
#GB_opt = model_selection.GridSearchCV(ensemble.GradientBoostingClassifier(), param_grid_GB)
#GB_opt.fit(X_train_20, y_train_20.ravel())
#prediction_test_20 = GB_opt.predict(X_test_20)
#accuracy_test_20_GB = GB_opt.score(X_test_20, y_test_20.ravel())
#print GB_opt.best_params_
#print 'ACCURACY: ' + str(GB_opt.best_score_)
#print 'GRADIENT BOOSTING ACCURACY ON TEST SET: {}'.format(accuracy_test_20_GB)

#end_GB = time.time()
#print 'TRAINING GRADIENT BOOSTING TOOK {} SECONDS'.format(end_GB-start_GB)

print 'THIS PART IS COMMENTED BECAUSE IT TOOK TOO LONG TO TRAIN A MODEL USING ensemble.GradientBoostingClassifier \nTHESE WERE THE RESULTS: '
print ' - PARAETERS VALUES: {"n_estimators": 50, "max_depth": 3}'
print ' - ACURACY CV SET: 0.45465'
print ' - GRADIENT BOOSTING ACURACY IN TEST SET: 0.437368323785'
print ' - TRAINING GRADIENT BOOSTING TOOK 1203.56599998 SECONDS'
print '\n PROBABLY, INCREASING THE VALUE OF PARAMETERS "n_estimators" AND "max_depth" WOULD IMPROVE THE ACCURACY, BUT IT''S COMPUTATIONALLY TOO EXPENSIVE FOR THIS PROJECT'


THIS PART IS COMMENTED BECAUSE IT TOOK TOO LONG TO TRAIN A MODEL USING ensemble.GradientBoostingClassifier 
THESE WERE THE RESULTS: 
 - PARAETERS VALUES: {"n_estimators": 50, "max_depth": 3}
 - ACURACY CV SET: 0.45465
 - GRADIENT BOOSTING ACURACY IN TEST SET: 0.437368323785
 - TRAINING GRADIENT BOOSTING TOOK 1203.56599998 SECONDS

 PROBABLY, INCREASING THE VALUE OF PARAMETERS "n_estimators" AND "max_depth" WOULD IMPROVE THE ACCURACY, BUT ITS COMPUTATIONALLY TOO EXPENSIVE FOR THIS PROJECT


## ACCURACY EXCLUDING THE 10% OF THE MOST COMMON PRODUCTS

In [75]:
# ACCURACY EXCLUDING THE 10% OF THE MOST COMMON PRODUCTS
num_prods_to_exclude = int(products_vect.size/10)
most_common = products_vect[:num_prods_to_exclude-1]
most_common_idx_20 = np.invert(np.in1d(y_test_20, most_common))# indices for most common products
y_test_20_NO_common = y_test_20[most_common_idx_20]
X_test_20_NO_common = X_test_20[most_common_idx_20,:]

print 'ACCURACY ON DATA WITHOUT 10% MOST COMMON PRODUCTS'
prediction_test_20_RF_noCommon = RF_opt_20.predict(X_test_20_NO_common)
accuracy_test_20_RF_noCommon = RF_opt_20.score(X_test_20_NO_common, y_test_20_NO_common.ravel())
print 'Random Forest accuracy: {}'.format(accuracy_test_20_RF_noCommon)

#prediction_test_20_GB_noCommon = GB_opt.predict(X_test_20_NO_common)
#accuracy_test_20_GB_noCommon = GB_opt.score(X_test_20_NO_common, y_test_20_NO_common.ravel())
#print 'Gradient Boosting accuracy: {}'.format(accuracy_test_20_GB_noCommon)
print 'Gradient Boosting accuracy: 0.314663555037'

ACCURACY ON DATA WITHOUT 10% MOST COMMON PRODUCTS
Random Forest accuracy: 0.302161471356
Gradient Boosting accuracy: 0.314663555037


## Same work for 80% os USERS WHO PURCHASE LESS PRODUCTS

In [13]:
# FEATURE EXTRACTION PROCESS
ID80_idx = np.in1d(data.ID_Customer.values, IDs_80.index)
data_IDs_80 = data.ix[ID80_idx]

# SINCE THIS DATASET IS TOO BIG, 100000 SAMPLES ARA RANDOMLY TAKEN TO WORK ON
data_IDs_80 = data_IDs_80.sample(n=100000)

# Obtention of  all the products purchased by user in an array
data_IDs_80_PRODS = data_IDs_80[['ID_Customer','Cod_Prod']].groupby('ID_Customer').agg(lambda x: [np.array(x['Cod_Prod'].values)])

# SPLIT THE LAST PRODUCT PURCHASED (TARGET) FROM ALL OTHERS (INPUTS)
data_IDs_80_PRODS_inputs = data_IDs_80_PRODS['Cod_Prod'].apply(lambda x: x[1:])
data_IDs_80_PRODS_tagets = data_IDs_80_PRODS['Cod_Prod'].apply(lambda x: x[0])

# APPLY FUNCTION "product_features"  that creates new features from puchased products accordingly with the order of purchasing
#  not purchased: 0
#  bought: 1/inverse_order_of_purchasing
data_ID_80_Prod_aux = data_IDs_80_PRODS_inputs.apply(lambda x:product_features(x,products_vect))

# BUILD THE TRAINING DATATSET
# BUILD FEATURES DATASET. WITHOUT GENDER, WITH 4 USER ATTRIBUTES, A FEATURE FOR THE DATE AND 94 FEATURES FOR THE PRODUCTS
data_IDs_80_groupedID = data_IDs_80.groupby('ID_Customer').mean()
data_IDs_80_groupedID.drop(['Cod_Prod','Socio_Demo_04'], axis=1, inplace=True)
data_IDs_80_groupedID['Product_feat'] = data_ID_80_Prod_aux.values
features = data_IDs_80_groupedID[[u'Socio_Demo_01', u'Socio_Demo_02', u'Socio_Demo_03', u'Socio_Demo_05']]
feature_prod = data_IDs_80_groupedID.Product_feat.values
train_features_80 = []
for i in range(features.shape[0]):
    train_features_80.append(np.concatenate((feature_prod[i],features.values[i,:])))
train_features_80 = np.array(train_features_80)
# FINALLY A NUMPY ARRAY WITH ALL THE FEATURES FOR EACH USERS CONCATENATED IS OBTAINED

# the TARGETS are set in a numpy array also
train_targets_80 = data_IDs_80_PRODS_tagets.values.reshape(-1,1)

In [76]:
# THE LEARNING PROCESS
# All datasets (Train and Test) are extracted from train2.txt file
X_train_80, X_test_80, y_train_80, y_test_80 = sklearn.model_selection.train_test_split( train_features_80, train_targets_80, test_size=0.2, random_state=42)

# TRAINING DATA IS TOO BIG, TAKES TOO LONG TO TRAIN MODELS ON IT.
# TAKE SOME SAMPLES FROM ALL DATA TO TRAIN
idx = np.random.randint(y_train_80.size, size=100000)
X_train_80 = X_train_80[idx,:]
y_train_80 = y_train_80[idx]

# TRAINING RANDOM FOREST CLASSIFIER
start_RF_80 = time.time()

#param_grid_RF = {'n_estimators': [80, 100],
#             'criterion': ['gini','entropy']}
param_grid_RF = {'n_estimators': [40],
             'criterion': ['gini']}
print 'AFTER OPTIMIZING PARAMETERS, n_estimators=40 AND criterion="gini" HAVE BEEN CHOSEN'

RF_opt_80 = model_selection.GridSearchCV(ensemble.RandomForestClassifier(), param_grid_RF)
RF_opt_80.fit(X_train_80, y_train_80.ravel())
prediction_test_80 = RF_opt_80.predict(X_test_80)
accuracy_test_80_RF = RF_opt_80.score(X_test_80, y_test_80.ravel())
print RF_opt_80.best_params_
print 'ACCURACY CV SET: ' + str(RF_opt_80.best_score_)
print 'RANDOM FOREST ACCURACY ON TEST SET: {}'.format(accuracy_test_80_RF)

end_RF_80 = time.time()
print 'TRAINING RANDOM FOREST TOOK {} SECONDS'.format(end_RF_80-start_RF_80)

AFTER OPTIMIZING PARAMETERS, n_estimators=40 AND criterion="gini" HAVE BEEN CHOSEN
{'n_estimators': 40, 'criterion': 'gini'}
ACCURACY CV SET: 0.28296
RANDOM FOREST ACCURACY ON TEST SET: 0.263091724063
TRAINING RANDOM FOREST TOOK 83.8470001221 SECONDS


In [80]:
# ACCURACY EXCLUDING THE 10% OF THE MOST COMMON PRODUCTS
num_prods_to_exclude = int(products_vect.size/10)
most_common = products_vect[:num_prods_to_exclude-1]
most_common_idx_80 = np.invert(np.in1d(y_test_80, most_common))# indices for most common products
y_test_80_NO_common = y_test_80[most_common_idx_80]
X_test_80_NO_common = X_test_80[most_common_idx_80,:]

print 'ACCURACY ON DATA WITHOUT 10% MOST COMMON PRODUCTS'
prediction_test_80_RF_noCommon = RF_opt_80.predict(X_test_80_NO_common)
accuracy_test_80_RF_noCommon = RF_opt_80.score(X_test_80_NO_common, y_test_80_NO_common.ravel())
print 'Random Forest accuracy: {}'.format(accuracy_test_80_RF_noCommon)

ACCURACY ON DATA WITHOUT 10% MOST COMMON PRODUCTS
Random Forest accuracy: 0.00584349593496


########################################################################################################################
## APPLYING MODELS TO test2.txt DATASET.

In [15]:
# IMPORT THE DATA
data_test2 = pd.read_csv('test2.txt', header = 0, delimiter='|')
# Sort the data by users first, and by date later
data_test2.sort_values(['ID_Customer', 'Cod_Fecha'], ascending=[True, False], inplace=True)

# CLASSIFYING CLIENTS: DO THEY BELONG TO ACTIVE USERS SEGMENT OR PASSIVE USERS SEGMENT?


In [16]:
# Obtention of  all the products purchased by user in an array
data_test2_IDs_PRODS = data_test2[['ID_Customer','Cod_Prod']].groupby('ID_Customer').agg(lambda x: [np.array(x['Cod_Prod'].values)])
print data_test2_IDs_PRODS.head()
print str(data_test2_IDs_PRODS.shape[0]) + ' users to predict'

                                                      Cod_Prod
ID_Customer                                                   
B0676372                                                 [601]
B0676373                                                 [601]
B0676374     [9991, 9992, 1011, 201, 2302, 2601, 2704, 301,...
B0676376                            [9993, 707, 201, 704, 601]
B0676377                                            [201, 601]
258989 users to predict


In [17]:
# CLASSIFY USERS INTO "ACTIVE" OR "PASSIVE"
products_per_client = data_test2_IDs_PRODS['Cod_Prod'].apply(lambda x: x.size)
test2_active_idx = products_per_client.values >= 7
test2_passive_idx = np.invert(test2_active_idx)

In [18]:
test2_active, test2_passive = data_test2_IDs_PRODS[test2_active_idx], data_test2_IDs_PRODS[test2_passive_idx]
# Convert to series
test2_active = test2_active['Cod_Prod']
test2_passive = test2_passive['Cod_Prod']

In [19]:
# CREATE FEATURES FOR EVERY USERS
# APPLY FUNCTION "product_features"  that creates new features from puchased products accordingly with the order of purchasing
#  not purchased: 0
#  bought: 1/inverse_order_of_purchasing
data_test2_ACTIVE_Prod_aux = test2_active.apply(lambda x:product_features(x,products_vect))
data_test2_PASSIVE_Prod_aux = test2_passive.apply(lambda x:product_features(x,products_vect))


In [20]:
# BUILD FEATURES DATASET. WITHOUT GENDER, WITH 4 USER ATTRIBUTES, A FEATURE FOR THE DATE AND 94 FEATURES FOR THE PRODUCTS
ID_ACT_idx = np.in1d(data_test2.ID_Customer.values, test2_active.index)
data_IDs_ACT = data_test2.ix[ID_ACT_idx]
data_IDs_ACT_groupedID = data_IDs_ACT.groupby('ID_Customer').mean()
data_IDs_ACT_groupedID.drop(['Cod_Prod','Socio_Demo_04'], axis=1, inplace=True)
data_IDs_ACT_groupedID['Product_feat'] = data_test2_ACTIVE_Prod_aux.values
features_ACT = data_IDs_ACT_groupedID[[u'Socio_Demo_01', u'Socio_Demo_02', u'Socio_Demo_03', u'Socio_Demo_05']]
feature_prod_ACT = data_IDs_ACT_groupedID.Product_feat.values
features_ACT_test2 = []
for i in range(features_ACT.shape[0]):
    features_ACT_test2.append(np.concatenate((feature_prod_ACT[i],features_ACT.values[i,:])))
features_ACT_test2 = np.array(features_ACT_test2)
# FINALLY A NUMPY ARRAY WITH ALL THE FEATURES FOR EACH USERS CONCATENATED IS OBTAINED

ID_PAS_idx = np.in1d(data_test2.ID_Customer.values, test2_passive.index)
data_IDs_PAS = data_test2.ix[ID_PAS_idx]
data_IDs_PAS_groupedID = data_IDs_PAS.groupby('ID_Customer').mean()
data_IDs_PAS_groupedID.drop(['Cod_Prod','Socio_Demo_04'], axis=1, inplace=True)
data_IDs_PAS_groupedID['Product_feat'] = data_test2_PASSIVE_Prod_aux.values
features_PAS = data_IDs_PAS_groupedID[[u'Socio_Demo_01', u'Socio_Demo_02', u'Socio_Demo_03', u'Socio_Demo_05']]
feature_prod_PAS = data_IDs_PAS_groupedID.Product_feat.values
features_PAS_test2 = []
for i in range(features_PAS.shape[0]):
    features_PAS_test2.append(np.concatenate((feature_prod_PAS[i],features_PAS.values[i,:])))
features_PAS_test2 = np.array(features_PAS_test2)
# FINALLY A NUMPY ARRAY WITH ALL THE FEATURES FOR EACH USERS CONCATENATED IS OBTAINED

In [21]:
# RUN PREDICTIVE MODELS ON TEST SAMPLES
prediction_test2_ACT = RF_opt_20.predict(features_ACT_test2)
prediction_test2_PAS = RF_opt_80.predict(features_PAS_test2)

In [37]:
#  DELIVERY DATASET
result_ACT = np.concatenate((np.array(test2_active.index).reshape(-1,1), prediction_test2_ACT.reshape(-1,1)), axis=1)
result_PAS = np.concatenate((np.array(test2_passive.index).reshape(-1,1), prediction_test2_PAS.reshape(-1,1)), axis=1)
dataset_final = np.concatenate((result_ACT,result_PAS))
dataset_final = dataset_final[dataset_final[:,0].argsort()]

In [44]:
# SAVE TO .txt
DATAFRAME_ENTREGA = pd.DataFrame(data=dataset_final, columns=["ID_Customer","Cod_Prod"])
DATAFRAME_ENTREGA.to_csv('Test_Mission.txt',sep='|', index=False)