In [1]:
import numpy as np
import pandas as pd

import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Load the raw training and test tables from the local data directory.
Train, Test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/Train.csv", "data/Test.csv")
)

In [9]:
# Split the training columns by dtype: text-like (object/category) columns
# versus 64-bit integer/float columns.
categorical_features = list(
    Train.select_dtypes(include=['object', 'category']).columns
)
numerical_features = list(
    Train.select_dtypes(include=['int64', 'float64']).columns
)

# Show both groups, separated by an empty line.
print("categorical features", categorical_features)
print()
print("numerical features",numerical_features)

categorical features ['user_id', 'REGION', 'TENURE', 'MRG', 'TOP_PACK']

numerical features ['MONTANT', 'FREQUENCE_RECH', 'REVENUE', 'ARPU_SEGMENT', 'FREQUENCE', 'DATA_VOLUME', 'ON_NET', 'ORANGE', 'TIGO', 'ZONE1', 'ZONE2', 'REGULARITY', 'FREQ_TOP_PACK', 'CHURN']


In [10]:
# Columns the author excludes from modelling; removed from both tables.
defective_features = ['user_id', 'MRG',"TOP_PACK"]

# `columns=` replaces the positional `axis=1` argument, which pandas 2.0 no
# longer accepts. Rebinding instead of `inplace=True`, together with
# `errors='ignore'`, makes the cell safe to re-run: once the columns are gone
# a second execution is a no-op instead of a KeyError.
Train = Train.drop(columns=defective_features, errors='ignore')
Test = Test.drop(columns=defective_features, errors='ignore')

In [11]:
# Remember the row count of each table so the stacked frame can be cut apart
# again after feature engineering.
ntrain, ntest = len(Train), len(Test)

# Stack train on top of test and give the result a fresh 0..n-1 index.
data = pd.concat([Train, Test], ignore_index=True)

In [12]:
# TENURE bracket label -> (lower bound, upper bound) in months.
# The open-ended "K > 24 month" bracket uses 27 as a nominal upper bound.
# Every other label ends in "month", so the 9-12 bracket is accepted both
# with and without that suffix; a label missing from this table would
# otherwise be mapped to NaN without any warning.
tenure_bounds = {
    'D 3-6 month': (3, 6),
    'E 6-9 month': (6, 9),
    'F 9-12': (9, 12),
    'F 9-12 month': (9, 12),
    'G 12-15 month': (12, 15),
    'H 15-18 month': (15, 18),
    'I 18-21 month': (18, 21),
    'J 21-24 month': (21, 24),
    'K > 24 month': (24, 27),
}

# Only encode while TENURE still holds the raw labels. Re-running the cell on
# the already-numeric column would map every value to NaN.
if not pd.api.types.is_numeric_dtype(data['TENURE']):
    tenure_labels = data['TENURE']

    # AVERAGE OF THE TENURE BOUNDARIES (midpoint of each bracket).
    data['TENURE_avg'] = tenure_labels.map(
        {label: (low + high) / 2 for label, (low, high) in tenure_bounds.items()}
    )

    # TENURE itself is replaced by the lower bound of its bracket.
    data['TENURE'] = tenure_labels.map(
        {label: low for label, (low, _high) in tenure_bounds.items()}
    )


In [13]:
from sklearn import preprocessing

# Integer-encode REGION. The encoder is fitted on the stacked train+test
# frame, so both splits share one label -> code mapping.
le = preprocessing.LabelEncoder()
data['REGION_encoded'] = le.fit_transform(data['REGION'])

# Remove the raw text column now that its encoded copy exists. `columns=`
# replaces the positional `axis=1` argument, which pandas 2.0 no longer
# accepts.
data = data.drop(columns=['REGION'])

In [18]:
# Interaction / ratio features built from the recharge and revenue columns.
frequence = data['FREQUENCE']
frequence_rech = data['FREQUENCE_RECH']
freq_top_pack = data['FREQ_TOP_PACK']

# REVENUE multiplied by FREQUENCE.
data['Total_income'] = data['REVENUE'].mul(frequence)
# FREQUENCE_RECH divided by FREQ_TOP_PACK.
data['FREQ_PACK'] = frequence_rech.div(freq_top_pack)
# (MONTANT - FREQUENCE_RECH) divided by FREQUENCE.
data['diff//freq'] = data['MONTANT'].sub(frequence_rech).div(frequence)
# FREQUENCE_RECH minus FREQ_TOP_PACK.
data['NOT_FREQUENCE_RECH'] = frequence_rech.sub(freq_top_pack)

In [19]:
# Absolute gap between ON_NET and each of the ORANGE / TIGO columns.
on_net = data['ON_NET']
data['diff_Orange'] = (on_net - data['ORANGE']).abs()
data['diff_Tigo'] = (on_net - data['TIGO']).abs()

# Ratios involving FREQUENCE.
frequence = data['FREQUENCE']
data['freq//rech'] = frequence.div(data['FREQUENCE_RECH'])
# NOTE: despite its name, this column is MONTANT / FREQUENCE.
data['freq//montant'] = data['MONTANT'].div(frequence)
data['freq//revenue'] = frequence.div(data['REVENUE'])


In [20]:
# Divide three usage columns by REGULARITY, one new column per source column.
regularity = data['REGULARITY']
for new_col, source_col in (
    ('segment/reg', 'ARPU_SEGMENT'),
    ('net//reg', 'ON_NET'),
    ('data//reg', 'DATA_VOLUME'),
):
    data[new_col] = data[source_col] / regularity

In [21]:
# Cut the stacked frame back into its train / test parts: the first `ntrain`
# rows came from Train, the remainder from Test.
# The label is taken from the training rows before CHURN is removed.
target = data['CHURN'].iloc[:ntrain]

# `.drop(columns=...)` returns a new frame, so nothing is mutated through a
# slice of `data` (the original in-place drop on a slice triggers pandas'
# SettingWithCopyWarning). It also avoids the positional `axis=1` argument,
# which pandas 2.0 no longer accepts.
train_data = data.iloc[:ntrain].drop(columns=["CHURN"])
test_data = data.iloc[ntrain:].drop(columns=["CHURN"])

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [23]:
# Hold out 33% of the labelled rows, stratified on the target, as a fixed
# validation set.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    target,
    test_size=0.33,
    random_state=56,
    stratify=target,
)

# Shuffled, stratified 5-fold splitter used by the cross-validation cells.
skf = StratifiedKFold(n_splits=5, random_state=23, shuffle=True)

In [28]:
from sklearn.metrics import log_loss,roc_auc_score
import lightgbm as lgbm
import xgboost as xgb
from catboost import CatBoostClassifier

In [25]:
# Stratified K-fold cross-validation of an XGBoost classifier on X_train.
# Each fold fits on the training part, early-stops on the held-out part and
# records the log loss of the held-out predictions.
xgb_params = dict(
    n_jobs=-1,
    random_state=23,
    objective='binary:logistic',
    n_estimators=2500,
    learning_rate=0.01,
    colsample_bytree=0.9,
    subsample=1,
    use_label_encoder=False,
)

fold = 0
scores = []
for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train), start=1):
    print(f"================Fold:{fold}====================")
    xtrain, ytrain = X_train.iloc[train_index], y_train.iloc[train_index]
    xtest, ytest = X_train.iloc[test_index], y_train.iloc[test_index]

    xgb_model = xgb.XGBClassifier(**xgb_params)
    # The held-out fold doubles as the early-stopping evaluation set.
    xgb_model.fit(
        xtrain,
        ytrain,
        eval_set=[(xtest, ytest)],
        early_stopping_rounds=200,
        verbose=250,
    )

    prediction = xgb_model.predict_proba(xtest)
    score = log_loss(ytest, prediction)
    print(f"Log loss for {fold}: {score}")
    scores.append(score)

# Author's recorded baseline mean: 0.25479403; recorded mean for this run: 0.252703
print(f"The Mean Log_loss eror: {np.mean(scores)}")

[0]	validation_0-logloss:0.68646
[250]	validation_0-logloss:0.27266
[500]	validation_0-logloss:0.25382
[750]	validation_0-logloss:0.25292
[976]	validation_0-logloss:0.25296
Log loss for 1: 0.25290475368811427
[0]	validation_0-logloss:0.68648
[250]	validation_0-logloss:0.27353


KeyboardInterrupt: 

In [34]:
# Stratified K-fold cross-validation of a CatBoost classifier on X_train.
# Besides the per-fold log loss, each fold's model also predicts on
# `test_data`; those probability arrays are collected in `pp`.
fold = 0
scores = []
pp = []
for fold, (train_index, test_index) in enumerate(skf.split(X_train, y_train), start=1):
    print(f"================Fold:{fold}====================")
    xtrain, ytrain = X_train.iloc[train_index], y_train.iloc[train_index]
    xtest, ytest = X_train.iloc[test_index], y_train.iloc[test_index]

    cat_model = CatBoostClassifier(
        random_seed=34,
        use_best_model=True,
        n_estimators=5000,
        silent=True,
        eval_metric='Logloss',
    )
    # The held-out fold is the evaluation set for early stopping and for
    # best-model selection.
    cat_model.fit(
        xtrain,
        ytrain,
        eval_set=[(xtest, ytest)],
        early_stopping_rounds=200,
        verbose=250,
        use_best_model=True,
    )

    prediction = cat_model.predict_proba(xtest)      # held-out fold
    predict_ = cat_model.predict_proba(test_data)    # unlabeled test rows
    score = log_loss(ytest, prediction)
    print(f"Log loss for {fold}: {score}")
    scores.append(score)
    pp.append(predict_)

# Author's recorded mean for this run: 0.252346503
print(f"The Mean Log_loss eror: {np.mean(scores)}")

Learning rate set to 0.059247
0:	learn: 0.6051587	test: 0.6049709	best: 0.6049709 (0)	total: 69.4ms	remaining: 5m 47s
250:	learn: 0.2490721	test: 0.2525302	best: 0.2525290 (243)	total: 22.5s	remaining: 7m 5s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.2525037667
bestIteration = 283

Shrink model to first 284 iterations.
Log loss for 1: 0.2525037667456006
Learning rate set to 0.059247
0:	learn: 0.6054821	test: 0.6058315	best: 0.6058315 (0)	total: 195ms	remaining: 16m 15s
250:	learn: 0.2488988	test: 0.2530742	best: 0.2530736 (246)	total: 22.7s	remaining: 7m 10s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.2530611516
bestIteration = 258

Shrink model to first 259 iterations.
Log loss for 2: 0.25306115155013154
Learning rate set to 0.059247
0:	learn: 0.6051928	test: 0.6052670	best: 0.6052670 (0)	total: 74.9ms	remaining: 6m 14s
250:	learn: 0.2490243	test: 0.2530962	best: 0.2530962 (250)	total: 21.2s	remaining: 6m 40s
500:	learn: 0.2459040	tes

In [32]:
# Fit one CatBoost model on the full X_train split, using the (X_test, y_test)
# hold-out as the early-stopping / best-model evaluation set, then predict
# class probabilities for the unlabeled test rows.
cat_params = dict(
    random_seed=34,
    use_best_model=True,
    n_estimators=5000,
    silent=True,
    eval_metric='Logloss',
)
cat_model = CatBoostClassifier(**cat_params)
cat_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=200,
    verbose=250,
    use_best_model=True,
)
predictions_ = cat_model.predict_proba(test_data)

Learning rate set to 0.062605
0:	learn: 0.6006370	test: 0.6005943	best: 0.6005943 (0)	total: 88.5ms	remaining: 7m 22s
250:	learn: 0.2494260	test: 0.2524416	best: 0.2524413 (249)	total: 28.2s	remaining: 8m 53s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.2524075021
bestIteration = 281

Shrink model to first 282 iterations.


In [42]:
# Build a submission from the cross-validation models: average the per-fold
# probability arrays collected in `pp` element-wise, then keep column index 1.
ss = pd.read_csv("DSN_PreBootcamp_Hackathon/sample_submission.csv")
fold_mean_proba = np.mean(pp, axis=0)
ss['CHURN'] = fold_mean_proba[:, 1]
ss.to_csv("Submission_cat2.csv", index=False)

In [19]:
# Build a submission from the single CatBoost model fitted on the hold-out
# split. Its test-set probabilities are stored in `predictions_`; the name
# `preds` used here previously is not defined anywhere in this notebook, so
# the cell raised NameError on a fresh kernel.
ss = pd.read_csv("DSN_PreBootcamp_Hackathon/sample_submission.csv")
ss['CHURN'] = predictions_[:, 1]  # column index 1 of the predict_proba output
ss.to_csv("Submission_cat.csv", index=False)

In [53]:
from skopt import BayesSearchCV

array([325156,  74844], dtype=int64)