Churn prediction
-------------------
Company churn occurs when a company cancels its subscription or lets it expire without renewing.  
This notebook uses LogisticRegression to predict Churn. It uses Pipeline to combine operations such as categorical feature encoding and numerical feature scaling.

In [2]:
%run /Users/bxing@actsoft.com/churn_dataframe_sandbox

In [3]:
# Sanity check of the frame: its shape on the first line, its column names on the second.
# NOTE(review): `df` is not defined in this notebook; presumably it comes from the %run above — confirm.
print(df.shape, list(df.columns), sep="\n")

In [4]:
# Keep only the modelling columns, ordered as: categorical features, target, numerical features.
model_columns = categorical_features + [target] + numerical_features
df = df[model_columns]
print(list(df.columns))

In [5]:
# Column means per Tier value; left as the cell's last expression so the table is displayed.
df.groupby(by='Tier').mean()

Unnamed: 0_level_0,IsSetupComplete,UserIntegrationType,Churn,Licenses,Duration
Tier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,0.777778,0.0,0.555556,92.0,442.866667
10,0.35709,0.002204,0.525839,224.385746,322.771736
20,0.563656,0.0,0.5963,37.048966,343.851469
30,0.492151,0.002355,0.589482,51.147567,362.547096


In [6]:
# Column means per IsSetupComplete value; left as the cell's last expression so the table is displayed.
df.groupby(by='IsSetupComplete').mean()

Unnamed: 0_level_0,Tier,UserIntegrationType,Churn,Licenses,Duration
IsSetupComplete,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,15.11998,0.001469,0.525465,31.370225,292.67238
True,17.20057,0.001901,0.593473,293.234791,391.766793


In [7]:
# Column means per UserIntegrationType value; left as the cell's last expression so the table is displayed.
df.groupby(by='UserIntegrationType').mean()

Unnamed: 0_level_0,Tier,IsSetupComplete,Churn,Licenses,Duration
UserIntegrationType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,16.027501,0.435876,0.555003,145.52225,335.800304
3,15.0,0.5,0.75,141.5,459.75


In [8]:
# Class balance of the Churn column: share of rows labelled 0 vs. rows labelled 1.
# Rows whose Churn value is neither 0 nor 1 are left out of the denominator.
churn_labels = df['Churn']
count_no_churn = int((churn_labels == 0).sum())
count_churn = int((churn_labels == 1).sum())
labelled_total = count_no_churn + count_churn

pct_of_no_churn = count_no_churn / labelled_total
pct_of_churn = count_churn / labelled_total

print("percentage of no churn is", pct_of_no_churn*100)
print("percentage of churn", pct_of_churn*100)

Logistic Regression Model Fitting

In [10]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks column(s) out of its input by key.

    ``key`` is stored as given and used to index the input in ``transform``
    (``df[key]``), so it can be anything the input's ``[]`` operator accepts.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Learn nothing; return the selector itself so it can be chained."""
        return self

    def transform(self, df):
        """Return the part of ``df`` addressed by ``self.key``."""
        selection = df[self.key]
        return selection

# Categorical branch: select the categorical columns and one-hot encode them.
# handle_unknown="ignore" makes a category that shows up only at predict/score
# time (never seen during fit) encode as all zeros instead of raising; the
# default ("error") would abort evaluation if a rare category lands only in
# the test split.
categorical_pipeline = Pipeline(
    [
        ("selector", ItemSelector(key=categorical_features)),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numerical branch: select the numerical columns and standardise them.
numerical_pipeline = Pipeline(
    [
        ("selector", ItemSelector(key=numerical_features)),
        ("scaler", StandardScaler()),
    ]
)

# Full model: both feature branches side by side, feeding a logistic
# regression classifier. Step names are unchanged from the original pipeline.
pipeline = Pipeline(
    [
        (
            "union",
            FeatureUnion(
                transformer_list=[
                    ("categorical_features", categorical_pipeline),
                    ("numerical_features", numerical_pipeline),
                ]
            ),
        ),
        ("classifier", LogisticRegression()),
    ]
)

In [11]:
# Hold out 25% of the rows for testing (fixed seed so the split is repeatable),
# then fit the pipeline on the training part. The whole training frame is passed
# as X; the pipeline's ItemSelector steps pick the feature columns out of it.
df_train, df_test = train_test_split(df, random_state=42, test_size=0.25)
y_train = df_train[target]
pipeline.fit(df_train, y_train)

In [12]:
# Evaluate on the held-out rows: keep the true labels and the predicted labels
# for the metric cells below, and report the pipeline's score on the test set.
y_test = df_test[target]
y_pred = pipeline.predict(df_test)
test_accuracy = pipeline.score(df_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

**Testing**
- Confusion Matrix
- Classification Report
- ROC Curve

In [14]:
from sklearn.metrics import confusion_matrix

# Store the result under its own name: assigning it back to `confusion_matrix`
# would replace the imported function with an array and make any later call
# to confusion_matrix(...) fail.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [15]:
from sklearn.metrics import classification_report

# Text summary of the test-set predictions, built from y_test and y_pred.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [16]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Scores for the ROC analysis: column 1 of predict_proba, i.e. the probability
# assigned to the second class in the classifier's class ordering.
churn_scores = pipeline.predict_proba(df_test)[:, 1]

# Compute the AUC from the same continuous scores that are plotted. Feeding
# hard 0/1 predictions to roc_auc_score gives the area of a single-threshold
# curve, which does not match the curve drawn below.
logit_roc_auc = roc_auc_score(y_test, churn_scores)
fpr, tpr, thresholds = roc_curve(y_test, churn_scores)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')  # must run before plt.show() so the saved figure is not empty
plt.show()