In [None]:
# Core numerical / plotting stack used throughout the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, which also hides deprecation and convergence warnings raised by
# the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split,cross_val_score, KFold
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,precision_recall_curve, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, VotingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from catboost import CatBoostClassifier

In [None]:
# Directory holding the competition files. Change this single constant to run
# the notebook against another data location.
DATA_DIR = '../input/expresso-churn-prediction-challenge'

# Training rows, test rows, and the sample submission file.
train = pd.read_csv(f'{DATA_DIR}/Train.csv')
test = pd.read_csv(f'{DATA_DIR}/Test.csv')
submission = pd.read_csv(f'{DATA_DIR}/SampleSubmission.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# (rows, columns) of both frames.
train.shape, test.shape

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Number of distinct values per training column.
train.nunique()

In [None]:
# Column names of the training frame.
train.columns

In [None]:
# Percentage of missing values per training column. `isna().mean()` divides by
# the frame's actual row count, so the figure stays correct if the data file
# changes (the row count used to be hard-coded).
train.isna().mean() * 100

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# dtype of the REGION column.
train.REGION.dtype

In [None]:
# Relative frequency of each REGION value, with NaN counted as 'missing'.
train.REGION.fillna('missing').value_counts(normalize=True)

In [None]:
# Preview of frequency encoding: replace each REGION value by its share of rows.
# NOTE(review): the lookup is applied to the un-filled column, so rows whose
# REGION is NaN presumably map to NaN rather than to the 'missing' share — confirm.
train.REGION.map(train.REGION.fillna('missing').value_counts()/len(train))

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info()

In [None]:
# Percentage of missing values per test column. `isna().mean()` divides by the
# frame's actual row count instead of a hard-coded number of rows.
test.isna().mean() * 100

# Replace every missing value in both frames with one numeric sentinel.
MISSING_SENTINEL = -999
train = train.fillna(MISSING_SENTINEL)
test = test.fillna(MISSING_SENTINEL)

In [None]:
train.isna().sum()

In [None]:
test.isna().sum()

In [None]:
train.columns

In [None]:
drop_col = ['user_id']

In [None]:
new_train = train.drop(drop_col, axis=1)

In [None]:
new_test= test.drop(drop_col, axis=1)

In [None]:
target = new_train.CHURN

In [None]:
concated= pd.concat([new_train.drop('CHURN',axis=  1), new_test]).reset_index(drop = True)

In [None]:
concated

In [None]:
# Frequency-encode every object column: missing values become the category
# 'missing', then each category is replaced by its share of rows in `concated`.
for cat_col in concated.select_dtypes('O').columns:
    filled = concated[cat_col].fillna('missing')
    category_share = filled.value_counts(normalize=True)
    concated[cat_col] = filled.map(category_share)

In [None]:
concated

# One-hot encode any remaining categorical columns.
# NOTE(review): the loop above replaces every object column with numeric
# frequencies, so this call presumably has nothing left to encode — confirm.
concated= pd.get_dummies(concated)

In [None]:
ntrain = train.shape[0]

In [None]:
new_train_dummied= concated.iloc[:ntrain]
new_test_dummied= concated.iloc[ntrain:]

In [None]:
new_test_dummied.head()

In [None]:
# Feature columns are everything except the label. `Index.drop` keeps the
# original column order, whereas `Index.difference` returns a sorted index;
# sorting left X in a different column order from `new_test_dummied`, which
# is later passed to models without reordering. errors='ignore' because CHURN
# was already removed before the frames were concatenated.
cols = new_train_dummied.columns.drop(['CHURN'], errors='ignore')
X = new_train_dummied[cols]

# The labels live in `target`.

In [None]:
X.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,target, test_size= 0.3, random_state= 42)

In [None]:
X.columns

X

In [None]:
X.shape 

In [None]:

%%time
scaler = StandardScaler()

X_train[X.columns] = scaler.fit_transform(X_train[X.columns])
X_test[X.columns] = scaler.fit_transform(X_test[X.columns])

new_test_dummied[X.columns] = scaler.transform(new_test_dummied[X.columns])

# Logistic Regression

In [None]:
lr = LogisticRegression()
lr

In [None]:
%%time
lr.fit(X_train,y_train)

In [None]:
%%time
lr_y_pred = lr.predict_proba(X_test)[:,1]

In [None]:
print("roc_auc_score")
roc_auc_score(y_test, lr_y_pred)

In [None]:
test_pred_lr= lr.predict(new_test_dummied[X.columns])

# Random Forest

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42,n_jobs=-1)

In [None]:
%%time
rf.fit(X_train,y_train)

In [None]:
%%time
rf_y_pred = rf.predict(X_test)

In [None]:
print("roc_auc_score")
roc_auc_score(y_test, rf_y_pred)

In [None]:
test_pred_rf= rf.predict(new_test_dummied[X.columns])

In [None]:
test_pred_rf

# LGBM

In [None]:
lgb = LGBMClassifier( random_state=42,n_jobs=-1)

In [None]:
lgb.fit(X_train, y_train)

In [None]:
lgbb_pred = lgb.predict_proba(X_test)[:,1]

In [None]:
print("roc_auc_score")
roc_auc_score(y_test, lgbb_pred)

In [None]:
test_pred_lgbb= lgb.predict_proba(new_test_dummied[X.columns])[:,1]

# XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
xgb = XGBClassifier(random_state=42)

In [None]:
xgb.fit(X_train, y_train)

In [None]:
xgb_pred = xgb.predict_proba(X_test)[:,1]

In [None]:
print("roc_auc_score")
roc_auc_score(y_test, xgb_pred)

In [None]:
test_pred_xgb= xgb.predict_proba(new_test_dummied[X.columns])[:,1]

In [None]:
X.shape,new_test_dummied.shape

In [None]:
new_test_dummied.columns

In [None]:
X.columns

new_test_dummied = new_test_dummied.drop(['CHURN'], axis=1)

In [None]:
new_test_dummied

In [None]:
def cross_validation(X, y, test, model, folds=5, random_state=42):
    """Run shuffled K-fold cross-validation and average the test predictions.

    For every fold the model is re-fitted on the training part, scored with
    ROC AUC on the held-out part (printed), and used to predict `test`.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with `.iloc`.
    y : pandas.Series
        Binary labels, positionally aligned with `X`.
    test : pandas.DataFrame or array-like
        Rows to predict. A DataFrame is re-ordered to `X.columns` so its
        features line up with the training matrix.
    model : estimator
        Object exposing `fit(X, y)` and `predict_proba(X)`; it is re-fitted
        in place for each fold.
    folds : int, default 5
        Number of K-fold splits.
    random_state : int, default 42
        Seed for the fold shuffling and, when it is an integer, for NumPy's
        global random generator.

    Returns
    -------
    (pandas.Series, numpy.ndarray)
        Out-of-fold positive-class probabilities indexed like `y`, and the
        test-set probabilities averaged over all folds.
    """
    # Honour the caller's seed instead of always seeding with 42.
    if isinstance(random_state, (int, np.integer)):
        np.random.seed(random_state)
    kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)

    # Guard against a test frame whose columns are ordered differently.
    if isinstance(test, pd.DataFrame):
        test = test[X.columns]

    # Dedicated float container: writing probabilities into a copy of the
    # integer label Series relied on implicit dtype upcasting.
    oof_pred = pd.Series(np.nan, index=y.index, name=y.name, dtype='float64')
    fold_test_preds = []
    for fit_index, valid_index in kf.split(X):
        X_fit, X_valid = X.iloc[fit_index], X.iloc[valid_index]
        y_fit, y_valid = y.iloc[fit_index], y.iloc[valid_index]

        model.fit(X_fit, y_fit)

        val_pred = model.predict_proba(X_valid)[:, 1]
        oof_pred.iloc[valid_index] = val_pred
        print('roc_auc_score', roc_auc_score(y_valid, val_pred))
        fold_test_preds.append(model.predict_proba(test)[:, 1])

    total_test_pred = np.mean(fold_test_preds, axis=0)
    # Overall score across all out-of-fold predictions.
    print('roc_auc_score', roc_auc_score(y, oof_pred))
    return oof_pred, total_test_pred

In [None]:
%%time
total_val_lgb, total_test_lgb = cross_validation(X,target,new_test_dummied, LGBMClassifier( random_state=42,n_jobs=-1),2)

In [None]:
# NOTE(review): bare integer literal with no surrounding context; the cell only
# echoes the number back. Looks like leftover scratch — confirm before deleting.
345678

In [None]:
from catboost import CatBoostClassifier

In [None]:
cbc= CatBoostClassifier(iterations= 2000, random_state= 42)

In [None]:
cbc.fit(X,target, eval_set= (X,target), use_best_model= True)

In [None]:
cbc_pred = cbc.predict_proba(new_test_dummied)[:,1]

In [None]:
lgbm= LGBMClassifier(n_estimators=100,verbose = 10, random_state= 42)

In [None]:
lgbm.fit(X,target, )#eval_set= (X,target))

In [None]:
a = lgbm.predict_proba(new_train_dummied)[:,1]

In [None]:
roc_auc_score(target,a)

In [None]:
lgbm_pred = lgbm.predict_proba(new_test_dummied)[:,1]

In [None]:
xgbm= XGBClassifier( random_state= 42)

In [None]:
xgbm.fit(X,target)

In [None]:
xgbm_pred = xgbm.predict_proba(new_test_dummied)[:,1]

In [None]:
# ROC AUC of the out-of-fold LightGBM predictions. The labels live in
# `target`; `y` is never defined at notebook level and raised NameError.
roc_auc_score(target, total_val_lgb)

In [None]:
y_train.head()

for col in X_train.columns:
    print(col)

In [None]:
%%time
lgb.fit(X_train.values,y_train.values)

X_test

In [None]:
%%time
lgb_y_pred = lgb.predict_proba(X_test.values)[:,1]

In [None]:
print("roc_auc_score")
roc_auc_score(y_test, lgb_y_pred)

In [None]:
print("roc_auc_score")
roc_auc_score(y_test, lgb_y_pred)

In [None]:
test_pred_lgb= lgb.predict_proba(new_test_dummied[X.columns])[:,1]

In [None]:
test_pred_lgb

In [None]:
# First few LightGBM test probabilities. `test_pred` only exists as a local
# inside cross_validation, so referencing it here raised NameError.
test_pred_lgb[:5]

In [None]:
# Row/column counts of the raw test frame.
test.shape

In [None]:
# Row/column counts of the encoded test features.
new_test_dummied.shape

In [None]:
# Row/column counts of the sample submission, to compare against the test frame.
submission.shape

In [None]:
# Preview the layout of the submission file.
submission.head()

In [None]:
submission["CHURN"] = test_pred_xgb

In [None]:
submission.to_csv('xgb_train_test_split.csv', index=False)