In [1]:
import numpy as np
import pandas as pd

import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [2]:
Train = pd.read_csv("data/Train.csv")
Test = pd.read_csv("data/Test.csv")

In [3]:
#what features are categorical?
categorical_features = Train.select_dtypes(include=['object','category']).columns.tolist()

#what features are numerical?
numerical_features = Train.select_dtypes(include=['int64','float64']).columns.tolist()
print("categorical features", categorical_features)
print()
print("numerical features",numerical_features)

categorical features ['user_id', 'REGION', 'TENURE', 'MRG', 'TOP_PACK']

numerical features ['MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT', 'FREQUENCE', 'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO', 'ZONE1', 'ZONE2', 'REGULARITY', 'FREQ_TOP_PACK', 'CHURN']


In [4]:
defective_features = ['user_id', 'MRG',"TOP_PACK"]
Train.drop(defective_features, 1, inplace = True)
Test.drop(defective_features, 1, inplace = True)

In [5]:
ntrain = Train.shape[0]
ntest = Test.shape[0]
data = pd.concat((Train, Test)).reset_index(drop=True)

In [6]:
#AVERAGE OF THE TENURE BOUNDARIES
data['TENURE_avg'] = data['TENURE'].map({'K > 24 month': (24+27)/2, 'I 18-21 month':(18+21)/2 , 'H 15-18 month': (15+18)/2, 'G 12-15 month':(12+15)/2,
                                             'J 21-24 month': (21+24)/2, 'F 9-12': (9+12)/2, 'E 6-9 month':(6+9)/2, 'D 3-6 month':(3+6)/2})

data['TENURE'] = data['TENURE'].map({'K > 24 month': 24, 'I 18-21 month': 18, 'H 15-18 month': 15, 'G 12-15 month':12,
                                             'J 21-24 month': 21, 'F 9-12': 9, 'E 6-9 month':6, 'D 3-6 month':3})


In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
data['REGION_encoded']=le.fit_transform(data['REGION'])
data.drop(['REGION'],1,inplace=True)

In [8]:
data['Total_income'] = data['REVENUE'] * data['FREQUENCE']
data['FREQ_PACK'] = data.FREQUENCE_RECH/data.FREQ_TOP_PACK
data['diff//freq'] = (data['MONTANT'] - data['FREQUENCE_RECH']) / data['FREQUENCE']
data['NOT_FREQUENCE_RECH'] = data['FREQUENCE_RECH'] - data['FREQ_TOP_PACK']

In [9]:
data['diff_Orange'] = np.abs(data['ON_NET']-data['ORANGE'])
data['diff_Tigo'] = np.abs(data['ON_NET']-data['TIGO'])
data['freq//rech'] = data['FREQUENCE'] / data['FREQUENCE_RECH']
data['freq//montant'] =  data['MONTANT']/ data['FREQUENCE']
data['freq//revenue'] = data['FREQUENCE'] / data['REVENUE']


In [10]:
data['segment/reg'] = data['ARPU_SEGMENT'] / data['REGULARITY']
data['net//reg'] = data['ON_NET'] / data['REGULARITY']
data['data//reg'] = data['DATA_VOLUME'] / data['REGULARITY']

In [11]:
train_data = data[:ntrain]
test_data = data[ntrain:]
target = train_data['CHURN']
train_data.drop(["CHURN"],1,inplace=True)
test_data.drop(["CHURN"],1,inplace=True)

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [13]:
#Segregate data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train_data, target, test_size=0.33, stratify=target,random_state=56)
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=23)

In [14]:
from sklearn.metrics import log_loss,roc_auc_score
import lightgbm as lgbm
import xgboost as xgb
from catboost import CatBoostClassifier
import lightgbm as lgbm

In [15]:
def predict(estimator,data):
    predictions = estimator.predict_proba(data)[:,1]
    return predictions

In [20]:
#Initialize models and their hyper-parameters
xgb_model = xgb.XGBClassifier(n_jobs=-1,random_state=23,objective='binary:logistic',
                n_estimators=2500,learning_rate=0.01,
                colsample_bytree=0.9,subsample=1, use_label_encoder=False)
cat_model = CatBoostClassifier(random_seed=34,use_best_model=True,
                          n_estimators=5000,silent=True,eval_metric='Logloss')
lgb = lgbm.LGBMClassifier(random_state=734, n_estimators=5000,num_leaves=120,learning_rate=0.008,
    max_depth=9)

In [17]:
#Define training loop
def train(estimator):
    fold=0
    scores,pp=[],[]
    for train_index, test_index in skf.split(X_train,y_train):
        fold+=1
        print(f"================Fold:{fold}====================")
        xtrain, xtest = X_train.iloc[train_index],X_train.iloc[test_index]
        ytrain, ytest = y_train.iloc[train_index],y_train.iloc[test_index]
        model = estimator.fit(xtrain,ytrain,early_stopping_rounds=200,eval_metric="logloss",
                      eval_set=[(xtest,ytest)],verbose=250)
        prediction = model.predict_proba(xtest)
        predict_ = model.predict_proba(test_data)
        score = log_loss(ytest,prediction)
        print(f"Log loss for {fold}: {score}")
        scores.append(score)
        pp.append(predict_)

        #0.2528473
      #Baseline Mean: 0.25479403  
    print(f"The Mean Log_loss eror: {np.mean(scores)}")##0.252703
    return pp

#Train Xgboost model
xgb_pred = train(xgb_model)

[0]	validation_0-logloss:0.68646
[250]	validation_0-logloss:0.27266
[500]	validation_0-logloss:0.25382
[750]	validation_0-logloss:0.25295
[964]	validation_0-logloss:0.25300
Log loss for 1: 0.25294188923595684
[0]	validation_0-logloss:0.68648
[250]	validation_0-logloss:0.27354
[500]	validation_0-logloss:0.25427
[750]	validation_0-logloss:0.25324
[953]	validation_0-logloss:0.25325
Log loss for 2: 0.253237936310374
[0]	validation_0-logloss:0.68649
[250]	validation_0-logloss:0.27385
[500]	validation_0-logloss:0.25463
[750]	validation_0-logloss:0.25363
[1000]	validation_0-logloss:0.25364
[1011]	validation_0-logloss:0.25365
Log loss for 3: 0.25360825973937334
[0]	validation_0-logloss:0.68648
[250]	validation_0-logloss:0.27200
[500]	validation_0-logloss:0.25231
[750]	validation_0-logloss:0.25119
[944]	validation_0-logloss:0.25127
Log loss for 4: 0.251189002210279
[0]	validation_0-logloss:0.68647
[250]	validation_0-logloss:0.27262
[500]	validation_0-logloss:0.25341
[750]	validation_0-logloss:0

In [18]:
#Train LightGBM model
lgb_pred = train(lgb)

[250]	valid_0's binary_logloss: 0.265241
[500]	valid_0's binary_logloss: 0.253859
[750]	valid_0's binary_logloss: 0.253377
Log loss for 1: 0.2533572813268959
[250]	valid_0's binary_logloss: 0.266122
[500]	valid_0's binary_logloss: 0.25471
[750]	valid_0's binary_logloss: 0.253966
Log loss for 2: 0.25394878737305215
[250]	valid_0's binary_logloss: 0.266414
[500]	valid_0's binary_logloss: 0.254766
[750]	valid_0's binary_logloss: 0.254038
Log loss for 3: 0.2540358492588115
[250]	valid_0's binary_logloss: 0.264497
[500]	valid_0's binary_logloss: 0.252514
[750]	valid_0's binary_logloss: 0.251721
Log loss for 4: 0.25170799728443594
[250]	valid_0's binary_logloss: 0.265108
[500]	valid_0's binary_logloss: 0.253596
[750]	valid_0's binary_logloss: 0.252906
[1000]	valid_0's binary_logloss: 0.252844
Log loss for 5: 0.25280949433577143
The Mean Log_loss eror: 0.25317188191579343


In [22]:
#Define training loop
fold=0
scores,pp=[],[]
estimator=cat_model
for train_index, test_index in skf.split(X_train,y_train):
    fold+=1
    print(f"================Fold:{fold}====================")
    xtrain, xtest = X_train.iloc[train_index],X_train.iloc[test_index]
    ytrain, ytest = y_train.iloc[train_index],y_train.iloc[test_index]
    model = estimator.fit(xtrain,ytrain,early_stopping_rounds=200,
                  eval_set=[(xtest,ytest)],verbose=250)
    prediction = model.predict_proba(xtest)
    predict_ = model.predict_proba(test_data)
    score = log_loss(ytest,prediction)
    print(f"Log loss for {fold}: {score}")
    scores.append(score)
    pp.append(predict_)

  
print(f"The Mean Log_loss eror: {np.mean(scores)}")##0.252703
cat_predict=pp

Learning rate set to 0.059247
0:	learn: 0.6059764	test: 0.6057445	best: 0.6057445 (0)	total: 207ms	remaining: 17m 13s
250:	learn: 0.2490748	test: 0.2524931	best: 0.2524891 (247)	total: 18s	remaining: 5m 39s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.2524746059
bestIteration = 290

Shrink model to first 291 iterations.
Log loss for 1: 0.2524746058994139
Learning rate set to 0.059247
0:	learn: 0.6061231	test: 0.6064137	best: 0.6064137 (0)	total: 63.9ms	remaining: 5m 19s
250:	learn: 0.2490118	test: 0.2530929	best: 0.2530709 (226)	total: 17.1s	remaining: 5m 23s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.2530708805
bestIteration = 226

Shrink model to first 227 iterations.
Log loss for 2: 0.25307088046367776
Learning rate set to 0.059247
0:	learn: 0.6057300	test: 0.6058723	best: 0.6058723 (0)	total: 62.1ms	remaining: 5m 10s
250:	learn: 0.2490444	test: 0.2530026	best: 0.2529925 (244)	total: 17.3s	remaining: 5m 27s
500:	learn: 0.2458821	test

In [24]:
#Aggregate predictions for all three models
cat_pred = np.mean(cat_predict,0)[:,1]
lgb_pred = np.mean(lgb_pred,0)[:,1]
xgb_pred = np.mean(xgb_pred,0)[:,1]

In [25]:
#For better perfromance a blend of all three model's prediction is required.
#Weights (e.g 0.6) are assigned on the basis of individual perfromance.
predictions=(((0.6*cat_pred) + (0.3*lgb_pred) + (0.1*cat_pred)) + ((0.45*cat_pred) + (0.55*lgb_pred)))/2

In [26]:
#ubmit predictions
ss= pd.read_csv("DSN_PreBootcamp_Hackathon/sample_submission.csv")
ss['CHURN'] = predictions
ss.to_csv("Submission_lgb.csv",index=False)