# Introduction: Modeling

In this notebook, we will use the calculated feature matrices to train and test a machine learning model. Our objective is a model that can predict, on the first of the month, which customers will churn during the next 30 days.

In [1]:
import pandas as pd
import numpy as np

PARTITION_DIR = 's3://customer-churn-spark/partitions'

In [2]:
p0_fm = pd.read_csv(f'{PARTITION_DIR}/p0/feature_matrix.csv')
p0_fm.head()

Unnamed: 0,msno,city,bd,registered_via,gender,SUM(logs.num_25),SUM(logs.num_50),SUM(logs.num_75),SUM(logs.num_985),SUM(logs.num_100),...,WEEKEND(LAST(transactions.transaction_date)),WEEKEND(LAST(transactions.membership_expire_date)),DAY(LAST(logs.date)),DAY(LAST(transactions.transaction_date)),DAY(LAST(transactions.membership_expire_date)),MONTH(LAST(logs.date)),MONTH(LAST(transactions.transaction_date)),MONTH(LAST(transactions.membership_expire_date)),churn,days_to_next_churn
0,+9zx0+mA3IZQLyjmU88qbfqJ0q9okIfYZnDI6FqaN2o=,1.0,0.0,7.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,0.0,
1,+sZCvwt5NmFw4uE185pBid4cOxtXTHovIyPFqchulQg=,1.0,0.0,7.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,,1.0,1.0,,1.0,2.0,0.0,394.0
2,+wzmLe86mMBeoIYoPedlt24WVTW6tabsRcaz81ZXBx0=,1.0,0.0,7.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,0.0,
3,/9+HJnqEryBbuH598zKqa8zb1Eypy927imqI9IWhJTk=,8.0,29.0,9.0,male,3.0,0.0,0.0,0.0,37.0,...,0.0,0.0,1.0,,,1.0,,,0.0,487.0
4,/BAK3DkUpoUESh4t8qlWs16yop+sG3i3oPYDpv5uGI0=,13.0,21.0,9.0,male,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,,,,,,,0.0,


In [3]:
p0_fm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32146 entries, 0 to 32145
Columns: 233 entries, msno to days_to_next_churn
dtypes: float64(211), object(22)
memory usage: 57.1+ MB


In [3]:
import findspark
findspark.init('/usr/local/spark-2.3.1-bin-hadoop2.7/')
import pyspark

In [5]:
sc = pyspark.SparkContext(master = 'spark://ip-172-31-23-133.ec2.internal:7077', appName = 'retrieval')
sc

In [4]:
def retrieve_data(partition_num):
    return pd.read_csv(f'{PARTITION_DIR}/p{partition_num}/feature_matrix.csv', low_memory = False)

In [9]:
import random

N_PARTITIONS = 1000
partitions = list(range(N_PARTITIONS))

In [30]:
random.seed(50)

# Set number of train and testing feature matrices
fms_to_get = 10
test_fms_to_get = 5

# Choose random sample of partitions
ps = random.sample(partitions, fms_to_get + test_fms_to_get)

# Separate into training and testing
test_p = ps[:test_fms_to_get]
train_p = ps[test_fms_to_get:]


train_fms = []

for i, p in enumerate(train_p):
    print(f'{round(100 * (i / fms_to_get), 2)}% complete.', end = '\r')
    train_fms.append(retrieve_data(p))
    
feature_matrix = pd.concat(fms)
feature_matrix.shape

90.0% complete.

(330471, 233)

In [31]:
test_fms = []

for i, p in enumerate(test_p):
    print(f'{round(100 * (i / test_fms_to_get), 2)}% complete.', end = '\r')
    test_fms.append(retrieve_data(p))
    
test_feature_matrix = pd.concat(test_fms)
test_feature_matrix.shape

80.0% complete.

(162972, 233)

In [32]:
feature_matrix = pd.get_dummies(feature_matrix.set_index('msno'))
test_feature_matrix = pd.get_dummies(test_feature_matrix.set_index('msno'))
feature_matrix, test_feature_matrix = feature_matrix.align(test_feature_matrix, join = 'inner', axis = 1)
feature_matrix.shape

(330471, 288)

In [33]:
y, test_y = feature_matrix.pop('churn'), test_feature_matrix.pop('churn')
y_reg, test_y_reg = feature_matrix.pop('days_to_next_churn'), test_feature_matrix.pop('days_to_next_churn')

In [56]:
feature_matrix = feature_matrix.replace({np.inf: np.nan, -np.inf: np.nan}).\
                                fillna(feature_matrix.median()).fillna(0)

In [57]:
test_feature_matrix = test_feature_matrix.replace({np.inf: np.nan, -np.inf: np.nan}).\
                                          fillna(feature_matrix.median()).fillna(0)

In [58]:
np.any(feature_matrix.isnull())

False

In [59]:
np.any(np.isinf(feature_matrix))

False

In [60]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(feature_matrix.values, np.array(y))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [61]:
model.score(test_feature_matrix, np.array(test_y))

0.9719215570772893

In [63]:
from sklearn.metrics import roc_auc_score

p = model.predict_proba(test_feature_matrix)[:, 1]
roc_auc_score(np.array(test_y), p)

0.5413001321080797

In [65]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 100, max_depth = 20, n_jobs = -1)
model.fit(feature_matrix.values, np.array(y))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [66]:
p = model.predict_proba(test_feature_matrix)[:, 1]
roc_auc_score(np.array(test_y), p)

0.9889692804315062

In [68]:
fi = pd.DataFrame({'importance': model.feature_importances_}, index = feature_matrix.columns).\
sort_values('importance', ascending = False)
fi.head()

Unnamed: 0,importance
TIME_SINCE_LAST(transactions.transaction_date),0.120057
TIME_SINCE_LAST(logs.date),0.101861
MONTH(LAST(logs.date)),0.024474
LAST(logs.MONTH(date)),0.018325
DAY(LAST(logs.date)),0.016997


In [70]:
model = RandomForestClassifier(n_estimators = 100, max_depth = 20, n_jobs = -1)
model.fit(feature_matrix[['TIME_SINCE_LAST(transactions.transaction_date)',
                          'TIME_SINCE_LAST(logs.date)']], np.array(y))

p = model.predict_proba(test_feature_matrix[['TIME_SINCE_LAST(transactions.transaction_date)',
                          'TIME_SINCE_LAST(logs.date)']])[:, 1]
roc_auc_score(np.array(test_y), p)

0.9296105339633508

In [62]:
1 - np.mean(test_y)

0.9815551137618732

In [50]:
np.array(y)

array([0., 0., 0., ..., 0., 0., 0.])

In [53]:
np.where(np.isinf(feature_matrix))

(array([  1911,   1911,   1911, ..., 330470, 330470, 330470]),
 array([104, 123, 144, ..., 251, 252, 254]))

In [55]:
feature_matrix.iloc[:5, 104]

msno
+1EE5ENZKseg97UEcVSOEgz+PSLZDwdy7SQGL7DJXfg=    0.000000
+8wyPWUzU3P1K3l+0CcnSeh+LvMZhoERoY91YqxuK/k=    0.000000
+Fsq06gzLyf44RHR1/jgAYv+IMKMP2hYCFwYvUX0+LQ=    4.966667
+egyQhDfvK1PvkbHy7qEtlHPekvYwIog4YWc/POLorA=    0.000000
+mCpFmfAW2h213mhlSxpAE3HUMM8jNsH5NBR8g//DZw=    0.000000
Name: LAST(transactions.daily_price), dtype: float64

In [52]:
feature_matrix = np.nan_to_num(feature_matrix.values)

  """Entry point for launching an IPython kernel.


AttributeError: can't set attribute

In [51]:
np.all(np.isfinite(feature_matrix))

False

In [9]:
from timeit import default_timer as timer

start = timer()
fms = sc.parallelize(partitions, numSlices=1000).map(retrieve_data).collect()
end = timer()
sc.stop()

len(fms)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 8 in stage 0.0 failed 4 times, most recent failure: Lost task 8.3 in stage 0.0 (TID 50, 172.31.23.133, executor 1): ExecutorLostFailure (executor 1 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:162)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
