In [1]:
%matplotlib inline

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
plt.style.use("ggplot")
pd.set_option("display.max_columns", 300)
PROJECT_DIR = os.path.abspath(os.path.join("..",os.path.curdir))
PROJECT_DATA_DIR = os.path.join(PROJECT_DIR,"data")

# Create dataset (with target variable)

In [4]:
# We just have to change which months
# we will use to train the model.
# Just get the two most recent months
# for which we have available data,
# and we are good to go.

# For example: month = "201710"
#          and month_next = "201711"
month="201710"
month_next="201711"

In [5]:
tarificador_pre = pd.read_csv(os.path.join(PROJECT_DATA_DIR,"input","EXTR_INFO_TARIF_PRE_" + month + ".TXT"),
                                   sep="|",
                                   encoding="ISO-8859-1")

ac_final_prepago = pd.read_csv(os.path.join(PROJECT_DATA_DIR,"input","AC_FINAL_PREPAGO_" + month + ".TXT"),
                                   sep="|",
                                   encoding="ISO-8859-1")

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
tarificador_pre["ActualVolume_MB"] = tarificador_pre["ActualVolume"] / float(1024**2)

In [7]:
join_ac_tarificador = ac_final_prepago.merge(tarificador_pre,
                                             how="inner",
                                             on="MSISDN",
                                             suffixes=["_ac","_tarificador"])

In [8]:
ac_prepago_next = pd.read_csv(os.path.join(PROJECT_DATA_DIR,"input","AC_FINAL_PREPAGO_" + month_next + ".TXT"),
                              sep="|",
                              encoding="ISO-8859-1")

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
prepago_target = join_ac_tarificador.merge(ac_prepago_next[["MSISDN", 
                                                           "NUM_DOCUMENTO_COMPRADOR", 
                                                           "Codigo_Plan_Precios"]],
                                           how="inner",
                                           on=["MSISDN", "NUM_DOCUMENTO_COMPRADOR"],
                                           suffixes=["","_nextMonth"])

In [10]:
map_tariffs_quotas = {"PPFCL": 0.0,
                      "PPFCS": 5.0,
                      "PPTIN": 15.0,
                      "PPVIS": 15.0,
                      "PPVSP": 15.0,
                      "PPIB7": 10.0,
                      "PPIB8": 15.0,
                      "PPIB9": 20.0,
                      "PPIB4": 10.0,
                      "PPIB5": 15.0,
                      "PPIB6": 20.0,
                      "PPIB1": 10.0,
                      "PPIB2": 15.0,
                      "PPIB3": 20.0,
                      "PPXS8": 5.0,
                      "PPVE1": 15.0,
                      "PPVE2": 15.0,
                      "PPREU": 5.0,
                      "PPREY": 5.0,
                      "PPREX": 5.0,
                      "PPRET": 5.0,
                      "PPRE5": 0.0,
                      "PPRE2": 0.0,
                      "PPRES": 0.0,
                      "PPJ24": 0.0, # Asumo que es de 0...
                      "PPJAT": 0.0, # Asumo que es de 0...
                      "PPJMI": 0.0, # Asumo que es de 0...
                      "PPVE3": 0.0, # Asumo que es de 0...
                     }

prepago_target["Quota"] = prepago_target["Codigo_Plan_Precios"].map(map_tariffs_quotas)
prepago_target["Quota_nextMonth"] = prepago_target["Codigo_Plan_Precios_nextMonth"].map(map_tariffs_quotas)

In [11]:
# Creation of target (upsell):
prepago_target["Upsell"] = (prepago_target
                            .apply(lambda row: 1 if row["Quota_nextMonth"] > row["Quota"]
                                   else 0, axis=1)
                           )

In [12]:
feature_numeric_columns = ["NUM_PREPAGO",
                           "NUM_POSPAGO",
                           "MIN_LLAM_ULTMES",
                           "ULT3MESES_TOTAL",
                           "DIASDESDEULTRECARGA",
                           "NUMRECARGASULT3MESES_TOTAL",
                           "COBERTURA_4G",
                           "LORTAD",
                           "MOU",
                           "TOTAL_LLAMADAS",
                           "TOTAL_SMS",
                           'MOU_Week',
                           'LLAM_Week',
                           'SMS_Week',
                           'MOU_Weekend',
                           'LLAM_Weekend',
                           'SMS_Weekend',
                           'MOU_VF',
                           'LLAM_VF',
                           'SMS_VF',
                           'MOU_Fijo',
                           'LLAM_Fijo',
                           'SMS_Fijo',
                           'MOU_OOM',
                           'LLAM_OOM',
                           'SMS_OOM',
                           'MOU_Internacional',
                           'LLAM_Internacional',
                           'SMS_Internacional',
                           #'ActualVolume',
                           'Num_accesos',  
                           'Num_Cambio_Planes',
                           'LLAM_COMUNIDAD_SMART',
                           'MOU_COMUNIDAD_SMART',
                           'LLAM_SMS_COMUNIDAD_SMART',
                           #'Flag_Uso_Etnica',
                           'cuota_SMART8',
                           'cuota_SMART12',
                           'cuota_SMART16',
                           'ActualVolume_MB']

feature_categorical_columns = ["Tipo_Documento_Comprador", # Categorica
                               'Codigo_Plan_Precios' # Categorica
                              ]

target_column = "Upsell"

In [13]:
ml_dataset = prepago_target[feature_numeric_columns 
                            + feature_categorical_columns
                            + [target_column]].copy()

In [14]:
for feature in feature_categorical_columns:
    categorical_column = feature + "_" + ml_dataset[feature].fillna("MISSING").str.lower()
    categorical_column = pd.get_dummies(categorical_column)
    ml_dataset[categorical_column.columns] = categorical_column
    del ml_dataset[feature]
    
for feature in feature_numeric_columns:
    ml_dataset[feature] = ml_dataset[feature].fillna(0.0).astype(np.float64)

In [15]:
feature_columns = [feature for feature in ml_dataset.columns if feature != target_column]

# Modelling

In [16]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(ml_dataset, 
                               train_size=0.8, 
                               test_size=0.2)

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()

param_grid = {"n_estimators":[300],
              "max_features":["sqrt", 0.3, 0.6],
              "max_depth": [5, 9, 15, 30],
              "n_jobs":[-1],
              "class_weight":["balanced"]
             }

gs_rf = GridSearchCV(estimator=rf,
                     param_grid=param_grid,
                     scoring="roc_auc",
                     cv=5,
                     verbose=3)

In [None]:
gs_rf.fit(train[feature_columns],
          train[target_column])

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] class_weight=balanced, max_depth=5, max_features=sqrt, n_estimators=300, n_jobs=-1 
[CV]  class_weight=balanced, max_depth=5, max_features=sqrt, n_estimators=300, n_jobs=-1, score=0.7984860415028946, total= 6.3min
[CV] class_weight=balanced, max_depth=5, max_features=sqrt, n_estimators=300, n_jobs=-1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.5min remaining:    0.0s


[CV]  class_weight=balanced, max_depth=5, max_features=sqrt, n_estimators=300, n_jobs=-1, score=0.7885900005973775, total= 6.1min
[CV] class_weight=balanced, max_depth=5, max_features=sqrt, n_estimators=300, n_jobs=-1 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 12.9min remaining:    0.0s


[CV]  class_weight=balanced, max_depth=5, max_features=sqrt, n_estimators=300, n_jobs=-1, score=0.7835945008879207, total= 6.0min
[CV] class_weight=balanced, max_depth=5, max_features=sqrt, n_estimators=300, n_jobs=-1 
[CV]  class_weight=balanced, max_depth=5, max_features=sqrt, n_estimators=300, n_jobs=-1, score=0.7939902265350394, total= 6.1min
[CV] class_weight=balanced, max_depth=5, max_features=sqrt, n_estimators=300, n_jobs=-1 
[CV]  class_weight=balanced, max_depth=5, max_features=sqrt, n_estimators=300, n_jobs=-1, score=0.7987876505309517, total= 6.1min
[CV] class_weight=balanced, max_depth=5, max_features=0.3, n_estimators=300, n_jobs=-1 
[CV]  class_weight=balanced, max_depth=5, max_features=0.3, n_estimators=300, n_jobs=-1, score=0.8119401315116378, total=13.2min
[CV] class_weight=balanced, max_depth=5, max_features=0.3, n_estimators=300, n_jobs=-1 
[CV]  class_weight=balanced, max_depth=5, max_features=0.3, n_estimators=300, n_jobs=-1, score=0.8017495408727551, total=15.3mi

In [None]:
print("Summary for all models launched (best one first):")

pd.DataFrame(gs_rf.cv_results_).sort_values(by="mean_test_score", ascending=False)

In [None]:
best_model = gs_rf.best_estimator_

In [None]:
best_model

# AUC for best model

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score


fpr, tpr, _ = roc_curve(y_true=test[target_column],
                        y_score=best_model.predict_proba(test[feature_columns])[:,1],
                        pos_label=1.0
                       )
df_roc_test = pd.DataFrame({"fpr":fpr, 
                            "tpr":tpr})

auc_test = roc_auc_score(test[target_column], 
                         best_model.predict_proba(test[feature_columns])[:,1],
                         average="micro")

# ROC plotting
plt.figure(figsize=(12,9))

# ROC curves:
plt.plot(df_roc_test["fpr"], df_roc_test["tpr"], "yellow", label="Test  (AUC: %.4f)" % auc_test)

# Random guess:
plt.plot([0,1],[0,1], "b--")

# Colour
plt.fill_between(x=df_roc_test["fpr"], y1=df_roc_test["tpr"], alpha=0.2, color="yellow")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC curves")
plt.legend(loc=4, fontsize=10)
pass

# Model saving

In [None]:
import pickle

save_model_path = os.path.join(PROJECT_DIR, "models", "model_identificator_upsellers_v02.pkl")
with open(save_model_path, "wb") as file_:
    pickle.dump(best_model, file_)
    
print("The trained model has been saved to %s" % (save_model_path))
print("Everything done in this notebook! You can use this model to predict any month that you want")