In [1]:
import sys
import time
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

In [2]:
import pandas as pd
import pickle
import numpy as np
from lifelines import CoxPHFitter

In [3]:
# Load the previously fitted model used for scoring (presumably a lifelines
# CoxPHFitter, judging by the file name -- confirm against the training code).
# `pickle` is already imported in the notebook's import cell, so it is not
# re-imported here.
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load model files that come from a trusted source.
with open('data/learned_cox_model.pkl', 'rb') as f:
    cf = pickle.load(f)

In [4]:
# The notebook environment already provides `sc` (a SparkContext), so no new
# context is created here; creating one would look like:
#   sc = SparkContext("local", appName="CalculateNBA")
# Streaming context on top of `sc` with a batch duration of 1.
ssc = StreamingContext(sc, batchDuration=1)

In [5]:
# Test sample: read only the first 10,000 rows of the merged dataset.
data = pd.read_csv(
    "data/merged_dataset_python.csv",
    nrows=10000,
)
data.shape

(10000, 94)

In [6]:
# Keep only the columns listed here; every other column is dropped.
usefields = ["is_maried", "age", "work_experience", "children_number", "has_mortgage",
            "loan_products_nii", "checking_account_balance", "deposit_balance", "money_market_balance",
            "investable_asset_indicator", "Sentiment", "Sentiment_days_passed"]
# Take an explicit copy of the column subset: assigning new columns on a bare
# `data[usefields]` slice triggers pandas' SettingWithCopyWarning.
data = data[usefields].copy()
data["is_maried"] = data["is_maried"].astype(int)
# Save a 1-based row ID for future reference.
data["id"] = data.index + 1
data.shape

(10000, 13)

In [7]:
# Cut `data` into consecutive chunks of 100 rows and turn each chunk into an
# RDD (via a Spark DataFrame); the list is later fed to the stream as a queue.
rddQueue = [
    sqlContext.createDataFrame(data[start:start + 100]).rdd
    for start in range(0, len(data), 100)
]
len(rddQueue)

100

In [8]:
def predict_churn(row, churn_threshold=0.4):
    """Score one customer record with the module-level fitted model ``cf``.

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``row[name]`` for the twelve feature names
        listed below plus ``"id"`` (in the stream this is one element of the
        input RDDs).
    churn_threshold : float, optional
        The record is flagged as churn when ``1 - S(365)`` is strictly greater
        than this value. Defaults to 0.4, the value previously hard-coded.

    Returns
    -------
    numpy.ndarray
        1-D float array of 14 values: the survival probability at each of the
        12 horizons (30, 60, ..., 330, 365 days), the churn flag (1.0 or 0.0),
        and finally ``row["id"]``.
    """
    # Order matters: the feature vector is passed to the model positionally.
    feature_names = (
        "is_maried", "age", "work_experience", "children_number", "has_mortgage",
        "loan_products_nii", "checking_account_balance", "deposit_balance",
        "money_market_balance", "investable_asset_indicator", "Sentiment",
        "Sentiment_days_passed",
    )
    horizon_days = (30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 365)

    features = np.array([row[name] for name in feature_names]).reshape(1, -1)
    # A single input row yields a single survival curve; take that one column
    # as a Series indexed by the model's timeline.
    survival_curve = cf.predict_survival_function(features).iloc[:, 0]
    timeline = survival_curve.index

    probabilities = []
    for day in horizon_days:
        # Last value at or before `day`. An explicit label comparison is used
        # because `frame[:day]` on a float index is label-based in old pandas
        # but positional in pandas >= 2.0.
        observed = survival_curve[timeline <= day]
        # If the timeline starts after `day`, nothing has been observed yet;
        # treat survival as 1.0 there (NOTE(review): confirm this convention).
        probabilities.append(observed.iloc[-1] if len(observed) else 1.0)

    # Final churn flag: churn probability at day 365 above the threshold.
    churn_flag = float((1 - probabilities[-1]) > churn_threshold)

    # Build the result array once instead of growing a DataFrame row by row
    # (DataFrame.append no longer exists in pandas >= 2.0).
    return np.append(np.array(probabilities + [churn_flag], dtype=float), row["id"])

In [9]:
# Create the queue-backed input DStream and use it to do some processing.
# oneAtATime=True is the default, spelled out here for readability.
inputStream = ssc.queueStream(rdds=rddQueue, oneAtATime=True)

In [10]:
# Apply predict_churn to every element of the input stream.
mappedStream = inputStream.map(predict_churn)
# Print the stream's output to the notebook.
# Alternative sink (disabled):
#   mappedStream.saveAsTextFiles("hdfs:///tmp/prediction_churn.txt")
mappedStream.pprint()
# Start the streaming computation, then give it 10 seconds to run.
ssc.start()
time.sleep(10)

-------------------------------------------
Time: 2016-03-14 05:46:17
-------------------------------------------
[  1.00000000e+00   1.00000000e+00   1.00000000e+00   9.99999984e-01
   9.99998924e-01   9.99880061e-01   9.90541138e-01   4.10782946e-01
   1.63551267e-07   2.86001367e-13   2.86001367e-13   2.86001367e-13
   1.00000000e+00   1.00000000e+00]
[ 1.          1.          1.          1.          1.          0.99999998
  0.99999829  0.99984037  0.99720011  0.99483094  0.99483094  0.99483094
  0.          2.        ]
[ 1.          1.          1.          1.          1.          0.9999995
  0.99996054  0.99631243  0.93717368  0.88697888  0.88697888  0.88697888
  0.          3.        ]
[ 1.          1.          1.          1.          1.          0.99999983
  0.99998651  0.99873826  0.97806939  0.95984177  0.95984177  0.95984177
  0.          4.        ]
[  1.00000000e+00   1.00000000e+00   1.00000000e+00   9.99999070e-01
   9.99938159e-01   9.93128757e-01   5.79078448e-01   6.152

In [11]:
# Stop the streaming context gracefully, but keep the underlying SparkContext
# running (stopSparkContext=False).
ssc.stop(
    stopGraceFully=True,
    stopSparkContext=False,
)

-------------------------------------------
Time: 2016-03-14 05:46:22
-------------------------------------------
[   1.            1.            1.            1.            1.
    0.99999985    0.99998844    0.99891865    0.98117671    0.9654858
    0.9654858     0.9654858     0.          501.        ]
[   1.            1.            1.            1.            1.
    0.99999999    0.99999933    0.99993742    0.99890148    0.99797049
    0.99797049    0.99797049    0.          502.        ]
[   1.            1.            1.            1.            1.
    0.99999999    0.99999899    0.99990535    0.99833896    0.99693196
    0.99693196    0.99693196    0.          503.        ]
[   1.            1.            1.            1.            1.            1.
    0.99999991    0.99999172    0.99985457    0.99973122    0.99973122
    0.99973122    0.          504.        ]
[   1.            1.            1.            1.            0.99999999
    0.99999907    0.99992661    0.99315259    0.