In [None]:
#boxes.apply(missing_values)

**Note:** only one customer has missing values (*NaN*), and the missing information can be found in other rows of the same customer.

In [None]:
# Back-fill the missing `started_week` of one subscription from the first of
# its own rows that does have a value.
sub_id = 1654222
is_sub = boxes.subscription_id == sub_id
mv_sw = boxes.loc[is_sub & boxes.started_week.notnull(), 'started_week'].iloc[0]
# Only touch the rows of this subscription: an unrestricted `isna()` mask would
# silently give this customer's start week to any other subscription with a NaN.
boxes.loc[is_sub & boxes.started_week.isna(), 'started_week'] = mv_sw

*Creating feature **time as customer***

In [None]:
# Turn every week label into a first-of-month date string, then parse the
# strings into timestamps.
month_start_strings = boxes.started_week.apply(conv_weeknumformat_to_datetime)
started_week_full = pd.to_datetime(month_start_strings)

# The most recent delivery date is the reference point used as "now".
latest_delivery = boxes.delivery_date.max()
current_ym = pd.to_datetime(latest_delivery)

**Note:** We assume *2017-12* as current year-month and thus we predict churn for the near future (i.e. 1/2/3 months).

In [None]:
def _months_from_reference(start):
    """Absolute number of calendar months between `start` and `current_ym`."""
    year_gap = start.year - current_ym.year
    month_gap = start.month - current_ym.month
    return abs(year_gap * 12 + month_gap)


# Tenure in months, computed row by row from the parsed start date.
boxes['time_as_customer'] = started_week_full.apply(_months_from_reference)

#### Problems with compensation type in the *Errors* dataset

In [None]:
# Compensation type
errors.loc[errors.compensation_amount == 0,'compensation_type'] = 'none'
errors.loc[errors.compensation_type == 'full refund','compensation_type'] = 'full_refund'
errors.loc[errors.compensation_type == 'sorry','compensation_type'] = 'refund'

---

#### Feature extraction

*Get customers list*

In [None]:
# Distinct subscription ids found in `boxes`. The helper returns a set, so the
# iteration order is not sorted and carries no meaning.
subscription_id = get_subscriptions_list(boxes)

*List of churners and alives (non-churners)*

In [None]:
# Split the subscriptions of `boxes` into those with a 'cancellation' event in
# `cancels` (churn_sub) and the rest (survive_sub); `summary` holds both counts.
summary, churn_sub, survive_sub = get_subscriptions_current_status(boxes, cancels, plot=False)

*Dataframe for feature extraction*

In [None]:
# One row per subscription id. Row order is the iteration order of the
# `subscription_id` set, i.e. not sorted.
base = pd.DataFrame(list(subscription_id), columns=['subscription_id'])

**Feature: time as customer**

In [None]:
# Look the tenure up by subscription id instead of assigning by position:
# `base` is ordered like the id set, while de-duplicated `boxes` rows come in
# order of first appearance, so a positional `.values` assignment would attach
# tenures to the wrong subscriptions.
tenure_by_sub = (boxes[['subscription_id', 'time_as_customer']]
                 .drop_duplicates('subscription_id')
                 .set_index('subscription_id')
                 .time_as_customer)
base['time_as_customer'] = base.subscription_id.map(tenure_by_sub).values

**Feature: number of pauses**

In [None]:
# Number of distinct pause rows per subscription.
# NOTE(review): despite the variable name, no six-month filter is applied here.
pauses_l6months = pauses.drop_duplicates().groupby(['subscription_id']).subscription_id.count()

# Align the counts with the rows of `base` by subscription id; subscriptions
# without any pause get 0. This replaces a per-id Python loop.
pp = pauses_l6months.reindex(base.subscription_id, fill_value=0).tolist()

base['number_pauses'] = pp

**Feature: number of different boxes**

In [None]:
# Raw inputs. The subscription id is forced to int for boxes and pauses;
# cancels and errors keep the dtypes pandas infers.
id_as_int = {'subscription_id': int}

boxes = pd.read_csv("boxes.csv", dtype=id_as_int)
pauses = pd.read_csv("pauses.csv", dtype=id_as_int)
cancels = pd.read_csv("cancels.csv")
errors = pd.read_csv("errors.csv")

In [None]:
# Number of rows in the boxes dataset.
n = len(boxes)

### Preprocessing

#### Adjusting datetime

In [None]:
# Parse every date column that was loaded from CSV into datetime values.
date_columns = (
    (boxes, 'delivery_date'),
    (pauses, 'pause_start'),
    (pauses, 'pause_end'),
    (cancels, 'event_date'),
    (errors, 'reported_date'),
)
for frame, column in date_columns:
    frame[column] = pd.to_datetime(frame[column])

#### Pause length

In [None]:
# Duration of each pause as a timedelta.
pauses["pause_length"] = pauses["pause_end"].sub(pauses["pause_start"])
# Year of the pause start as a string.
# NOTE(review): the column is named *_ym but the format is year-only ("%Y") - confirm intent.
pauses["pause_start_ym"] = list(map(operator.methodcaller("strftime", "%Y"), pauses["pause_start"]))

**Note:** A pause lasts 6 days. However, a few rows show *-360 days*, which is clearly an error. We could either estimate the pause period for those rows or remove them. Since only a few rows are affected, we remove them.

In [None]:
# Keep only the pauses whose length is exactly six days.
expected_pause = pd.Timedelta(days=6)
pauses = pauses.loc[pauses.pause_length.eq(expected_pause)]

#### Missing values :: started_week
Uncomment the next cell to verify the missing values per feature

### Processing

In [None]:
# Feature matrix: every column between the first one (subscription id) and the
# last one (the label).
X = base.iloc[:,1:-1]
# Drop 'num_cancellation' to evaluate the performance of models
# You will find the explanation soon
X = X.drop('num_cancellation', axis=1)

# Target: last column of `base`, encoded as integers.
y = base.iloc[:,-1]

le = LabelEncoder()
y = le.fit_transform(y)

# Stratified 80/20 split. The seed is fixed so that the metrics quoted in the
# notes can be reproduced on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, stratify=y, random_state=0)

In [None]:
# Scaling statistics are learned from the training split only and then applied
# to the full matrix and to both splits.
se = StandardScaler().fit(X_train)

X_std, X_train_std, X_test_std = (se.transform(part) for part in (X, X_train, X_test))

Oversampling

In [None]:
# Splitting dataset
# Hold out the test set BEFORE oversampling. RandomOverSampler duplicates
# minority rows; resampling first and splitting afterwards puts copies of the
# same row in both train and test, which inflates every test metric.
X_train_raw, X_test_os, y_train_raw, y_test_os = train_test_split(
    X, y, test_size=.20, stratify=y, random_state=0)

# Oversample the training portion only; the test portion keeps the original
# class balance.
orus = RandomOverSampler(random_state=0)
# Newer imbalanced-learn releases expose `fit_resample`; older ones only have
# `fit_sample`. Use whichever is available.
_resample = getattr(orus, 'fit_resample', None) or orus.fit_sample
oX_resampled, oY_resampled = _resample(X_train_raw, y_train_raw)
X_train_os, y_train_os = oX_resampled, oY_resampled

# Transformation
se_os = StandardScaler().fit(X_train_os)
X_train_os_std = se_os.transform(X_train_os)
X_test_os_std = se_os.transform(X_test_os)

In [None]:
# Metrics reported by every cross-validation run, keyed by the name that
# cross_validate uses for its `test_<name>` result columns.
metric_functions = (
    ('accuracy', accuracy_score),
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1_score', f1_score),
)
scoring = {label: make_scorer(metric) for label, metric in metric_functions}

**Random Forest**

*Cross-validation*

In [None]:
# Fixed seed: an unseeded forest gives different scores on every run, so the
# numbers quoted in the notes could not be reproduced.
rfm_cv = RandomForestClassifier(random_state=0)

# 10-fold stratified cross-validation on the original (not oversampled) data.
stf_fold = StratifiedKFold(10)
res = pd.DataFrame.from_dict( cross_validate(rfm_cv, X, y, cv=stf_fold, 
                                             scoring=scoring, 
                                             return_train_score=False) )
cv_print_evaluation(res)

**Note:** **StratifiedKFold** is used instead of **KFold** because the dataset is imbalanced. Stratified sampling ensures that the relative class frequencies are approximately preserved in each train and validation fold.

*Oversampled data + model and predictions*

In [None]:
# Fixed seed so the fitted forest, its predictions and its feature importances
# are the same on every run.
rfm = RandomForestClassifier(random_state=0)
rfm.fit(X_train_os, y_train_os)

# Hard predictions and class probabilities for the held-out rows.
rfm_predictions = rfm.predict(X_test_os)
rfm_pred_prob = rfm.predict_proba(X_test_os)
print_evaluation(y_test_os, rfm_predictions)

# Number of top-ranked features to list.
n_features = 5

print("\nImportant features:")
imp_features = get_kimportant_features(rfm.feature_importances_, X.columns, k=n_features, prt=True)

**Notes:** 

1. *num_cancellation*, *time_as_customer*, *num_diff_boxes* and *number_pauses* are really significant for the model.
2. Unbalanced data and oversampled data achieve the same performance. However, this is thanks to the *num_cancellation* feature which has the highest importance. If we drop it from the **base dataset**, we notice that the F1-score (CV) was 0.833 and accuracy 0.759. On the other hand, the oversampled data + model approach presented a very similar accuracy (0.973) and F-score (0.973) without *num_cancellation*.

**Logistic Regression**

In [None]:
# 10-fold stratified cross-validation of a default logistic regression on the
# standardized feature matrix.
lr_cv = LogisticRegression()
stf_fold = StratifiedKFold(n_splits=10)

lr_cv_scores = cross_validate(lr_cv, X_std, y,
                              cv=stf_fold,
                              scoring=scoring,
                              return_train_score=False)
res = pd.DataFrame.from_dict(lr_cv_scores)
cv_print_evaluation(res)

In [None]:
# Train and evaluate on the standardized arrays. The cross-validated logistic
# regression uses scaled input, and `X_train_os_std` / `X_test_os_std` are
# computed for this purpose; fitting on the raw arrays left them unused and
# made the two logistic-regression results not comparable.
lr = LogisticRegression()
lr.fit(X_train_os_std, y_train_os)

lr_predictions = lr.predict(X_test_os_std)
lr_pred_prob = lr.predict_proba(X_test_os_std)

print_evaluation(y_test_os, lr_predictions)

**Note:**
1. If we keep *num_cancellation* in our **base dataset**, we achieve 100% on all performance metrics, that is, no prediction errors (as the confusion matrix shows).
2. We also predict the probability of churn, so that we can focus on the customers with a high probability of churning.
3. It is possible to extract more features and improve the model, as well as to obtain insights from the model to design actions that prevent churn.
4. Different graphs were used to support the creation of this model. However, we must understand customer behavior better. The next graph is very important for verifying the peaks of churn as a function of subscription time.

## Information

### Task

Explore the datasets and develop a model to predict either customer churn over time OR weekly demand for each product type.

### Required libraries to run this notebook:

Please, install the following libraries before running this jupyter notebook:
- Sklearn
- Pandas
- Imblearn
- Plotly
- seaborn

### My notes

#### Size per dataset
- Boxes   : (4552066, 6)
- pauses  : (5921657, 3)
- cancels : (1408824, 3)
- errors  : (417689, 5)

#### Errors dataset:
1. What kind of compensation type is *sorry*?
2. 'Sorry' compensation type has only 3 instances. However, they were refunded. Therefore, we should change this type of compensation.
3. Mistyping error: *full_refund* and *full refund* are present in the error dataset.
4. *Credit* compensation type with a compensation amount equal to zero. This is an odd situation: either there was effectively no compensation, or the compensation amount is missing. The same happens with full_refund, partial_refund, and refund.
5. *Refund* compensation type is more like a generalization than a specific type of compensation such as full or partial.



## Implementation

### Libraries

In [None]:
import datetime
import operator
from collections import Counter

import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go
import seaborn as sbn

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder

# `train_test_split` lives in sklearn.model_selection; the legacy
# sklearn.cross_validation module is kept only as a fallback for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

from imblearn.over_sampling import RandomOverSampler

plotly.offline.init_notebook_mode()
%matplotlib inline

### Functions

In [None]:
def get_subscriptions_list(b):
    """Return the set of distinct, non-null ``subscription_id`` values in `b`."""
    distinct_ids = b['subscription_id'].dropna().unique()
    return set(distinct_ids.tolist())

def get_cancel_sub_list(c):
    """Return the set of subscription ids having at least one 'cancellation' event in `c`."""
    is_cancellation = c['event_type'] == 'cancellation'
    cancelled_ids = c.loc[is_cancellation, 'subscription_id']
    return set(cancelled_ids.tolist())

def get_subscriptions_current_status(b,c, plot=False):
    """Split the subscriptions in `b` into churned and surviving ones.

    A subscription counts as churned when its id appears in `c` with
    ``event_type == 'cancellation'``; every other id found in `b` survives.

    Parameters
    ----------
    b : DataFrame with a ``subscription_id`` column.
    c : DataFrame with ``subscription_id`` and ``event_type`` columns.
    plot : bool, default False
        When True, also draw a plotly bar chart of both counts, with the
        percentages in the title.

    Returns
    -------
    tuple
        ``(summary, churn_subscription, survive_subscription)`` where
        ``summary`` is a one-row DataFrame (index ``"qtd"``) with columns
        ``qtd_churn`` and ``qtd_survive`` and the other two are sets of ids.
    """
    sublist = get_subscriptions_list(b)
    csbc_id = get_cancel_sub_list(c)

    churn_subscription = sublist.intersection(csbc_id)
    survive_subscription = sublist.difference(csbc_id)

    summary = pd.DataFrame([[len(churn_subscription), len(survive_subscription)]],
                           columns=['qtd_churn','qtd_survive'], index=["qtd"])

    if plot:
        # Percentages are only needed for the chart title. Guard the division
        # so that an empty `b` does not raise ZeroDivisionError.
        n = len(sublist)
        ch_pct = len(churn_subscription) * 100.0 / n if n else 0.0
        su_pct = len(survive_subscription) * 100.0 / n if n else 0.0

        gData = [go.Bar(y=summary.qtd_churn, x=summary.index, name="Amount of churn"),
                 go.Bar(y=summary.qtd_survive, x=summary.index, name="Amount of survive")]
        gLay = go.Layout(title="Current status: Amount of churn (%.2f%%) and surive (%.2f%%)" % (ch_pct, su_pct))
        fig = go.Figure(data=gData, layout=gLay)
        plotly.offline.iplot(fig)

    return summary, churn_subscription, survive_subscription

def conv_weeknumformat_to_datetime(d):
    """Convert a year/week label into a ``"<year>-<month>-01"`` date string.

    The first four characters of `d` are read as the year and the last two as
    the week number. The month is approximated as ``week // 4`` and clamped to
    the range 1..12, so the result is a coarse first-of-month date rather than
    an exact calendar conversion.
    """
    year, weeknum = int(d[:4]), int(d[-2:])
    # Four weeks per month, kept inside the valid month range.
    month = min(max(weeknum // 4, 1), 12)
    return "{0:d}-{1:d}-01".format(year, month)

def missing_values(x):
    """Return the number of missing (NaN/None/NaT) entries in the Series `x`.

    Uses the vectorised pandas sum instead of the Python builtin ``sum``,
    which iterates the column element by element.
    """
    return int(x.isna().sum())

def print_evaluation(y_true, y_pred):
    """Print accuracy, precision, recall and F1 (rounded to 3 decimals) and
    draw the confusion matrix of `y_pred` against `y_true` as a heatmap."""
    report_rows = (
        (" - Accuracy    = {0}", accuracy_score),
        (" - Precision  = {0}", precision_score),
        (" - Recall      = {0}", recall_score),
        (" - F-measure   = {0}", f1_score),
    )
    # All scores and the confusion matrix are computed before anything is printed.
    rounded_scores = [round(metric(y_true, y_pred), 3) for _, metric in report_rows]
    cfm = confusion_matrix(y_true, y_pred)

    print("Evaluation:")
    for (template, _), value in zip(report_rows, rounded_scores):
        print(template.format(value))

    print("\n------------------------------")
    print("Confusion Matrix")
    sbn.set(font_scale=1.2)  # label size
    sbn.heatmap(cfm, annot=True, fmt="d", annot_kws={"size": 16})  # annotation font size
    print("------------------------------")

    
def cv_print_evaluation(res):
    """Print the mean accuracy, precision, recall and F1 of a cross-validation run.

    `res` must expose the columns ``test_accuracy``, ``test_precision``,
    ``test_recall`` and ``test_f1_score``; each mean is rounded to 3 decimals.
    """
    rule = "--------------------------"
    report_rows = (
        ("- Accuracy:   %5s %.3f\n", res.test_accuracy),
        ("- Precision:  %5s %.3f\n", res.test_precision),
        ("- Recall:     %5s %.3f\n", res.test_recall),
        ("- F1-Measure: %5s %.3f", res.test_f1_score),
    )
    # Means are computed up front, before any output is produced.
    means = [round(scores.mean(), 3) for _, scores in report_rows]

    print(rule + "\n     Model performance\n" + rule)
    for (template, _), value in zip(report_rows, means):
        print(template % ("", value))
    print(rule)
    
def get_kimportant_features(importances, features, k, prt=False):
    """Return the `k` features with the highest importance as a dict.

    `features` and `importances` are paired position by position. The result
    maps feature name to importance, in descending order of importance. With
    ``prt=True`` each selected pair is also printed.
    """
    importance_by_feature = dict(zip(features, importances))
    ranked = sorted(importance_by_feature.items(), key=lambda pair: pair[1], reverse=True)
    sel_features = dict(ranked[:k])

    if prt:
        for name, score in sel_features.items():
            print("%20s : %.3f" % (name, score))

    return sel_features

### Datasets

In [None]:
# Number of distinct box ids per subscription, attached to `base` by
# subscription id. The groupby result is sorted by id while `base` follows the
# id set's iteration order, so assigning `.values` by position would misalign
# the feature.
distinct_boxes_by_sub = boxes.groupby('subscription_id')['box_id'].nunique()
base['num_diff_boxes'] = (base.subscription_id.map(distinct_boxes_by_sub)
                          .fillna(0).astype(int).values)

**Feature: number of different products**

In [None]:
# Number of distinct products per subscription, attached to `base` by
# subscription id rather than by position (the groupby result is sorted by id,
# `base` is not). Bracket indexing is used for the column because `product`
# is also the name of a pandas method.
distinct_products_by_sub = boxes.groupby('subscription_id')['product'].nunique()
base['num_diff_products'] = (base.subscription_id.map(distinct_products_by_sub)
                             .fillna(0).astype(int).values)

**Features: Quantity of cancellation and Quantity of reactivation**

In [None]:
# Events per subscription and event type (rows: subscription, columns: type);
# combinations that never occur count as 0.
cr = cancels.drop_duplicates().groupby(['subscription_id','event_type']).event_type.count().unstack()
cr = cr.fillna(0)

# Align with `base` by subscription id. Requesting both columns explicitly
# means an event type that never appears in the data yields zeros instead of
# an AttributeError, and subscriptions without any event get 0 as well.
cr_aligned = cr.reindex(index=base.subscription_id,
                        columns=['cancellation', 'reactivation'],
                        fill_value=0)

canc = cr_aligned['cancellation'].astype(int).tolist()
react = cr_aligned['reactivation'].astype(int).tolist()

base['num_cancellation'] = canc
base['num_reactivation'] = react

**Features: compensation types**

In [None]:
# Reported errors per subscription and compensation type (rows: subscription,
# columns: type); combinations that never occur count as 0.
ect = errors.drop_duplicates().groupby(['subscription_id',
                                        'compensation_type']).compensation_type.count().unstack()

ect = ect.fillna(0)

# Align with `base` by subscription id. Listing the expected types explicitly
# means a type that is absent from the data yields zeros instead of an
# AttributeError, and subscriptions without any error get 0 as well.
compensation_types = ['credit', 'full_refund', 'none', 'partial_refund', 'refund']
ect_aligned = ect.reindex(index=base.subscription_id,
                          columns=compensation_types,
                          fill_value=0)

credit = ect_aligned['credit'].tolist()
full_refund = ect_aligned['full_refund'].tolist()
none = ect_aligned['none'].tolist()
partial_refund = ect_aligned['partial_refund'].tolist()
refund = ect_aligned['refund'].tolist()

base['qtd_credit'] = credit
base['qtd_full_refund'] = full_refund
base['qtd_none'] = none
base['qtd_partial_refund'] = partial_refund
base['qtd_refund'] = refund

**Features: number of errors and compensation amount**

In [None]:
# Total number of errors per subscription: row-wise sum over all compensation types.
tepc = ect.sum(axis='columns')
# Median compensation amount per subscription, over de-duplicated error rows.
unique_errors = errors.drop_duplicates()
amt = unique_errors.groupby('subscription_id')['compensation_amount'].median()

In [None]:
# Align both per-subscription series with `base` by subscription id;
# subscriptions with no reported error get 0. This replaces a per-id Python
# loop that mixed two different membership tests (`in amt`, `in tepc.index`).
amount_comp = amt.reindex(base.subscription_id, fill_value=0).tolist()
qtd_erros = tepc.reindex(base.subscription_id, fill_value=0).tolist()

base['qtd_erros'] = qtd_erros
base['amount_comp'] = amount_comp

**Feature: status (churn/alive)**