Churn prediction
-------------------
Company churn occurs when a company cancels its subscription or lets the subscription expire without renewing it.  
This file uses LogisticRegression to predict Churn. It also uses SMOTE for imbalanced data, and RFE for feature selection.

In [2]:
%run /Users/bxing@actsoft.com/churn_dataframe_sandbox

In [3]:
# Quick look at the loaded frame: (rows, columns) first, then the column names.
n_rows_cols = df.shape
column_names = list(df.columns)
print(n_rows_cols)
print(column_names)

In [4]:
# Keep only the modelling columns, ordered as: categorical features, target, numerical features.
model_columns = categorical_features + [target] + numerical_features
df = df[model_columns]
print(list(df.columns))

In [5]:
# Column means for each distinct value of `Tier`.
means_by_tier = df.groupby('Tier').mean()
means_by_tier

Unnamed: 0_level_0,IsSetupComplete,UserIntegrationType,Churn,Licenses,Duration
Tier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,0.777778,0.0,0.555556,92.0,442.866667
10,0.35709,0.002204,0.525839,224.385746,322.771736
20,0.563656,0.0,0.5963,37.048966,343.851469
30,0.492151,0.002355,0.589482,51.147567,362.547096


In [6]:
# Column means for each distinct value of `IsSetupComplete`.
means_by_setup = df.groupby('IsSetupComplete').mean()
means_by_setup

Unnamed: 0_level_0,Tier,UserIntegrationType,Churn,Licenses,Duration
IsSetupComplete,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,15.11998,0.001469,0.525465,31.370225,292.67238
True,17.20057,0.001901,0.593473,293.234791,391.766793


In [7]:
# Column means for each distinct value of `UserIntegrationType`.
means_by_integration = df.groupby('UserIntegrationType').mean()
means_by_integration

Unnamed: 0_level_0,Tier,IsSetupComplete,Churn,Licenses,Duration
UserIntegrationType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,16.027501,0.435876,0.555003,145.52225,335.800304
3,15.0,0.5,0.75,141.5,459.75


In [8]:
# Class balance of the target: count the rows in each `Churn` class and
# report each class as a percentage of the labelled rows.
churn_flag = df['Churn']
count_no_churn = len(df[churn_flag == 0])
count_churn = len(df[churn_flag == 1])
labelled_total = count_no_churn + count_churn

pct_of_no_churn = count_no_churn / labelled_total
print("percentage of no churn is", pct_of_no_churn*100)

pct_of_churn = count_churn / labelled_total
print("percentage of churn", pct_of_churn*100)

Create dummy variables for categorical variables

In [10]:
# Categorical columns to one-hot encode.
cat_vars = ['Tier', 'IsSetupComplete', 'UserIntegrationType']

# Replace each categorical column with its `<column>_<value>` indicator columns.
# The untouched columns keep their original order and the indicator columns are
# appended in the order of `cat_vars`.
# The result is a new frame: `df` is left unchanged, so this cell can be
# re-run without hitting "columns overlap" errors, and `df_final` is not a
# slice of `df` (later writes to it cannot raise SettingWithCopyWarning).
df_final = pd.get_dummies(df, columns=cat_vars)
df_final.columns.values

In [11]:
# Preview the first five encoded rows.
df_final.head(n=5)

Unnamed: 0,Churn,Licenses,Duration,Tier_5,Tier_10,Tier_20,Tier_30,IsSetupComplete_False,IsSetupComplete_True,UserIntegrationType_0,UserIntegrationType_3
0,0,28,1077,0,0,1,0,0,1,1,0
1,1,6,191,0,0,1,0,0,1,1,0
2,1,13,219,0,0,0,1,0,1,1,0
3,0,7,935,0,1,0,0,0,1,1,0
4,1,10,92,0,0,0,1,0,1,1,0


Let's scale the data before doing RFE and Fitting.

In [13]:
from sklearn.preprocessing import StandardScaler

# Only the raw numeric columns are standardised; the indicator columns are already 0/1.
# Selecting by name (rather than by position) keeps this correct if the column order changes.
cols_to_scale = ['Licenses', 'Duration']

# Work on an explicit copy and replace the columns outright, instead of
# `DataFrame.update`, which writes floats in place into the existing columns.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed further down, so test-set statistics influence the scaling.
df_final = df_final.copy()
df_final[cols_to_scale] = StandardScaler().fit_transform(df_final[cols_to_scale])

df_final.head()

Unnamed: 0,Churn,Licenses,Duration,Tier_5,Tier_10,Tier_20,Tier_30,IsSetupComplete_False,IsSetupComplete_True,UserIntegrationType_0,UserIntegrationType_3
0,0,-0.046574,2.770786,0,0,1,0,0,1,1,0
1,1,-0.055293,-0.541605,0,0,1,0,0,1,1,0
2,1,-0.052518,-0.436925,0,0,0,1,0,1,1,0
3,0,-0.054896,2.239907,0,1,0,0,0,1,1,0
4,1,-0.053707,-0.911726,0,0,0,1,0,1,1,0


The data looks imbalanced. Let's perform **over-sampling using SMOTE**.

In [15]:
from sklearn import preprocessing

# whether over_sampling or not
sampling = True

# Features are every column except the target; the target is kept as a
# one-column DataFrame (not a Series) because later cells index it by 'Churn'.
X = df_final.drop(columns='Churn')
y = df_final[['Churn']]

In [16]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows before any resampling, so synthetic samples are
# generated from the training partition only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns

if sampling:
    # Named `smote` rather than `os` so the standard-library module name is not shadowed.
    smote = SMOTE(random_state=0)
    # `fit_resample` is the supported imbalanced-learn API; `fit_sample` was a
    # deprecated alias that newer releases no longer provide.
    os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
    # Normalise the resampled output to DataFrames with the original column names.
    os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
    os_data_y = pd.DataFrame(data=os_data_y, columns=['Churn'])
else:
    os_data_X = X_train
    os_data_y = y_train

# we can Check the numbers of our data
n_rows = len(os_data_X)
n_no_churn = len(os_data_y[os_data_y['Churn'] == 0])
n_churn = len(os_data_y[os_data_y['Churn'] == 1])
print("length of oversampled data is ", n_rows)
print("Number of no churn in oversampled data", n_no_churn)
print("Number of churn", n_churn)
print("Proportion of no churn data in oversampled data is ", n_no_churn / n_rows)
print("Proportion of churn data in oversampled data is ", n_churn / n_rows)

Recursive feature elimination

In [18]:
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

# Rank the features by recursively dropping the weakest one according to the
# logistic-regression coefficients.
# NOTE(review): n_features_to_select=20 is larger than the number of feature
# columns shown in the df_final preview (10), so nothing can be eliminated
# here; lower it if an actual selection is intended.
target_values = os_data_y.to_numpy().ravel()
rfe = RFE(estimator=logreg, n_features_to_select=20).fit(os_data_X, target_values)
print(rfe.support_)
print(rfe.ranking_)

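RFE doesn't rule out any features — note that `n_features_to_select=20` exceeds the 10 available feature columns, so no elimination can take place. So we keep all.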

In [20]:
# From here on, X / y refer to the (optionally oversampled) training data;
# y becomes the 'Churn' column as a Series.
X, y = os_data_X, os_data_y['Churn']

print(X.columns.to_numpy())

Implement the model. Use Logit to analyze p-values and remove any columns that have high p-values.  
In this case, the fit raises an overflow error, so we proceed with all original columns.

In [22]:
import statsmodels.api as sm

# Fit a statsmodels logit on the same X / y to inspect per-feature p-values.
# NOTE(review): X carries every level of each one-hot group (e.g. the Tier_*
# columns always sum to 1), so the design matrix is perfectly collinear;
# this may be what makes the fit numerically unstable - TODO confirm.
logit_model = sm.Logit(endog=y, exog=X)
result = logit_model.fit()
summary_table = result.summary2()
print(summary_table)

Logistic Regression Model Fitting

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# X / y are the (optionally SMOTE-oversampled) training rows. Fit on all of
# them and do NOT split again: re-splitting would put synthetic SMOTE rows in
# the test set and overwrite the real hold-out created by the earlier
# train_test_split (random_state=0). X_test / y_test therefore stay bound to
# that untouched hold-out, so the evaluation cells below score the model on
# real, unseen customers.
X_train, y_train = X, y
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [25]:
# Hard class predictions for the test rows, plus the mean accuracy on them.
y_pred = logreg.predict(X_test)
test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

**Testing**
- Confusion Matrix
- Classification Report
- ROC Curve (AUC)

In [27]:
from sklearn.metrics import confusion_matrix

# Store the result under its own name: assigning it to `confusion_matrix`
# would rebind the imported function to an array for the rest of the session.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [28]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the test predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [29]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Use the predicted probability of the positive class (column 1) for BOTH the
# AUC and the curve. Feeding hard 0/1 predictions to roc_auc_score collapses
# the ranking to a single threshold, so the area shown in the legend would not
# be the area under the plotted curve.
churn_proba = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, churn_proba)
fpr, tpr, thresholds = roc_curve(y_test, churn_proba)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before show(), while the figure is still the current one.
plt.savefig('Log_ROC')
plt.show()