In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")

data=pd.read_excel('NewspaperChurn.xlsx')
age_dict=data['Age range'].value_counts().to_dict()
data['Age range'] = data['Age range'].map(age_dict)

In [2]:
income_dict=data['HH Income'].value_counts().to_dict()
data['HH Income'] = data['HH Income'].map(income_dict)

In [3]:
weekly_fee_dict=data['weekly fee'].value_counts().to_dict()
data['weekly fee'] = data['weekly fee'].map(weekly_fee_dict)

In [4]:
data['Deliveryperiod'] = data['Deliveryperiod'].map({'7Day':'SMTWTFS','SoooTFS':'SoooTFS','SatSun':'SoooooS','SooooFS':'SooooFS','oMTWTFo':'oMTWTFo','Mon-Fri':'oMTWTFo','Sun-FriT':'SooooFS','SooooFST':'SooooFS','Fri-SunT':'SooooFS','Sun-Fri':'SooooFS','Fri-Sun':'SooooFS','SoooooST':'SoooooS','SoooooS':'SoooooS','thu-sun':'SoooTFS','SoooTFST':'SoooTFS','Thu-SunT':'SoooTFS','THU-SUN':'SoooTFS','Thu-Sun':'SoooTFS','SUNONLY':'Soooooo','sunonly':'Soooooo','SooooooT':'Soooooo','SunOnlyT':'Soooooo','Soooooo':'Soooooo','7day':'SMTWTFS','7DAY':'SMTWTFS','7DayT':'SMTWTFS','7DayOL':'SMTWTFS','SunOnly':'Soooooo'})

In [5]:
data['Deliveryperiod'].value_counts()

Soooooo    6506
SMTWTFS    6044
SoooTFS    2755
SoooooS     311
SooooFS     227
oMTWTFo      12
Name: Deliveryperiod, dtype: int64

In [6]:
data['Language'].fillna('Missing',inplace=True)

In [7]:
data['weekly fee'].fillna(186,inplace=True)

In [8]:
data['Age range'].fillna(108,inplace=True)

In [9]:
data['Nielsen Prizm'].fillna('NA',inplace=True)

In [10]:
for feature in data.columns:
    print(feature,data[feature].nunique())

SubscriptionID 15855
HH Income 18
Home Ownership 2
Ethnicity 73
dummy for Children 2
Year Of Residence 56
Age range 13
Language 38
Address 15742
State 1
City 56
County 4
Zip Code 117
weekly fee 15
Deliveryperiod 6
Nielsen Prizm 10
reward program 116
Source Channel 51
Subscriber 2


In [11]:
data.drop(['SubscriptionID','Address','State'],axis=1,inplace=True)

In [12]:
data.columns=['Income', 'HomeOwnership', 'Ethnicity', 'DummyForChildren','YearOfResidence', 'AgeRange', 'Language', 'City', 'County', 'ZipCode', 'WeeklyFee', 'DeliveryPeriod', 'NielsenPrizm', 'RewardProgram', 'SourceChannel', 'Subscriber']

In [13]:
data['HomeOwnership']=data['HomeOwnership'].map({'RENTER':0,'OWNER':1})
data['DummyForChildren']=data['DummyForChildren'].map({'Y':1,'N':0})
data['Subscriber']=data['Subscriber'].map({'YES':1,'NO':0})

In [14]:
data['Subscriber'].value_counts()

0    12818
1     3037
Name: Subscriber, dtype: int64

In [15]:
le=LabelEncoder()
for feature in ['Ethnicity','Language','City','County','DeliveryPeriod','NielsenPrizm','SourceChannel']:
    data[feature]=le.fit_transform(data[feature])

In [16]:
data.head()

Unnamed: 0,Income,HomeOwnership,Ethnicity,DummyForChildren,YearOfResidence,AgeRange,Language,City,County,ZipCode,WeeklyFee,DeliveryPeriod,NielsenPrizm,RewardProgram,SourceChannel,Subscriber
0,916,0,23,0,1,856.0,12,32,0,90802,351.0,0,1,0,6,0
1,226,1,72,1,14,1913.0,23,38,1,92657,2950.0,4,5,0,38,1
2,1772,1,33,1,7,1878.0,7,21,1,92604,2950.0,4,5,0,38,1
3,514,1,17,0,23,1793.0,7,28,1,92677,2383.0,4,5,1,28,0
4,879,1,33,0,23,1463.0,7,42,1,92688,126.0,1,4,0,9,1


### Train Test Split

In [17]:
x_train,x_test,y_train,y_test=train_test_split(data.iloc[:,:-1],data['Subscriber'])

### Logistic Regression

In [18]:
lr_pipe = Pipeline(steps=[('lr_ss', StandardScaler()),('lr', LogisticRegression())])

lr_values = {'lr__C': [1.0]}       

grid_lr = GridSearchCV(lr_pipe, lr_values, cv=5, n_jobs = -1)
grid_lr.fit(x_train, y_train)
print('LogisticRegressionScore', grid_lr.best_score_)
print('LogisticRegressionParams', grid_lr.best_params_)

LogisticRegressionScore 0.8161636494827356
LogisticRegressionParams {'lr__C': 1.0}


### Support Vector Machines

In [19]:
svm_pipe = Pipeline(steps=[("svm_ss", StandardScaler()),("svm", SVC())])

svm_values = {
    'svm__kernel' : ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
            }

grid_svm = GridSearchCV(svm_pipe, svm_values, cv=5, n_jobs = -1)
grid_svm.fit(x_train, y_train)
print('SVMScore', grid_svm.best_score_)
print('SVMParams', grid_svm.best_params_)

SVMScore 0.8200321286162812
SVMParams {'svm__kernel': 'rbf'}


### Naive Bayes

In [20]:
nb=GaussianNB()
grid_nb=cross_val_score(nb,x_train,y_train,scoring='accuracy',cv=5, n_jobs = -1)
print('NaiveBayes',np.mean(grid_nb))

NaiveBayes 0.8177611713935115


### Random Forest Classifier

In [21]:
rf = RandomForestClassifier(n_jobs=-1)

rf_values = {'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
             'min_samples_leaf': [1, 2, 3, 4, 5],
             'min_samples_split': [2, 5, 7, 9, 11],
             'max_features': ['auto', 'sqrt'],
             'n_estimators': [150, 250, 350, 450, 500, 550, 600, 650],            
              }
    
grid_rf = RandomizedSearchCV(rf, rf_values, cv=5, n_jobs = -1)
grid_rf.fit(x_train, y_train)
print('RFCScore', grid_rf.best_score_)
print('RFCParams', grid_rf.best_params_)

RFCScore 0.8453459288256404
RFCParams {'n_estimators': 250, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 80}


### XGBoost

In [22]:
xg=XGBClassifier(n_jobs = -1)

xg_values={
    'colsample_bytree': [0.3,0.4,0.5,0.7],
    'gamma': [0.0,0.1,0.2,0.3,0.4],
    'learning_rate': [0.05,0.10,0.15,0.20,0.25,0.30],
    'max_depth': [3,4,5,6,7,8,10,12,15],
    'min_child_weight': [1,3,5,7],
    'n_estimators': [50,70,85,100,125,150,175,200]    
    }

grid_xg=RandomizedSearchCV(xg,xg_values,cv=10,n_jobs=-1)
grid_xg.fit(x_train, y_train)
print('XGBScore', grid_xg.best_score_)
print('XGBParams', grid_xg.best_params_)

XGBScore 0.852325660289347
XGBParams {'n_estimators': 50, 'min_child_weight': 1, 'max_depth': 7, 'learning_rate': 0.2, 'gamma': 0.3, 'colsample_bytree': 0.5}


### AdaBoost

In [23]:
ab = AdaBoostClassifier()

ab_values = {'n_estimators':[50,100,150,200,250,300,350,400,450,500,550,600],
            'learning_rate':[0.05,0.1,0.3,0.5]
              }
    
grid_ab = RandomizedSearchCV(ab, ab_values, cv=10, n_jobs = -1)
grid_ab.fit(x_train, y_train)
print('XGBScore', grid_ab.best_score_)
print('XGBParams', grid_ab.best_params_)

XGBScore 0.846102649638493
XGBParams {'n_estimators': 600, 'learning_rate': 0.5}


In [24]:
y_pred=grid_xg.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91      3231
           1       0.67      0.42      0.52       733

    accuracy                           0.85      3964
   macro avg       0.77      0.69      0.71      3964
weighted avg       0.84      0.85      0.84      3964

