In [1]:
import sys
import time
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

In [2]:
import pandas as pd
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [3]:
import pickle
# Load the previously fitted model from disk; `rfc` is the object whose
# predict_proba() is called by predict() further down.
# (presumably a RandomForestClassifier, given the sklearn import - TODO confirm)
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load model files that come from a trusted source.
with open("data/learned_rft_NBA_model.pkl","rb") as f:
  rfc = pickle.load(f)

In [4]:
# The notebook kernel already exposes a SparkContext as 'sc', so it is reused
# here rather than constructing a new one:
#sc = SparkContext("local", appName="CalculateNBA")
# Streaming context on top of 'sc' with a batch interval of 1 second.
ssc = StreamingContext(sc, batchDuration=1)

In [5]:
 #sqlc = pyspark.sql.SQLContext(sc)

In [6]:
# Read only the first 10,000 rows of the merged dataset into a pandas DataFrame
data = pd.read_csv("data/merged_dataset_python.csv", nrows=10000)
# Show (rows, columns) as the cell output
data.shape

(10000, 94)

In [7]:
# Keep only the columns listed here and drop everything else: the twelve
# feature columns that predict() reads, plus "NBO_choosen" (presumably the
# label column - it is not read by predict()).
usefields = ["is_maried", "age", "work_experience", "children_number", "has_mortgage",
            "loan_products_nii", "checking_account_balance", "deposit_balance", "money_market_balance",
            "investable_asset_indicator", "Sentiment", "Sentiment_days_passed",
            "NBO_choosen"]
# Take an explicit copy of the selection so the column assignments below
# modify an independent frame; assigning into the bare selection triggers
# pandas' SettingWithCopyWarning.
data = data[usefields].copy()
data["is_maried"] = data["is_maried"].astype(int)
# Save an ID (row index + 1) for future reference
data["id"] = data.index + 1
data.shape

(10000, 14)

In [8]:
# Cut the frame into consecutive chunks of `batch_size` rows and convert each
# chunk into an RDD of Rows; the resulting list is what ssc.queueStream()
# consumes later on.
# NOTE(review): 'sqlContext' is not created in this notebook - presumably it
# is provided by the PySpark kernel in the same way as 'sc'; confirm.
batch_size = 100  # single source of truth for both the range step and the slice width
rddQueue = [
    sqlContext.createDataFrame(data.iloc[start:start + batch_size]).rdd
    for start in range(0, len(data), batch_size)
]
# Number of queued RDDs (one per chunk)
len(rddQueue)

100

In [9]:
def predict(row):
    """Score a single record with the module-level model ``rfc``.

    Args:
        row: a record that can be indexed by column name (e.g. a Spark Row)
            and that holds the twelve feature columns listed below plus "id".

    Returns:
        A 1-D numpy array: the values returned by ``rfc.predict_proba`` for
        this record, followed by the record's "id" as the last element.
    """
    # Feature order matters: values are handed to the model in this order.
    feature_names = (
        "is_maried", "age", "work_experience", "children_number", "has_mortgage",
        "loan_products_nii", "checking_account_balance", "deposit_balance",
        "money_market_balance", "investable_asset_indicator",
        "Sentiment", "Sentiment_days_passed",
    )
    # predict_proba is given a 2-D input: one row, one column per feature.
    features = np.array([row[name] for name in feature_names]).reshape(1, -1)
    probabilities = rfc.predict_proba(features)
    # np.append (no axis) flattens the result before appending the id.
    return np.append(probabilities, row["id"])

In [10]:
# Create the QueueInputDStream and use it to do some processing:
# the RDDs collected in rddQueue become the batches of this stream.
inputStream = ssc.queueStream(rddQueue)

In [11]:
# Score every record of each incoming batch; predict is passed directly,
# a wrapper lambda adds nothing.
mappedStream = inputStream.map(predict)
# Print the first records of every batch to the cell output.
mappedStream.pprint()
#mappedStream.saveAsTextFiles("hdfs:///tmp/prediction.txt")
ssc.start()
# Let the stream run for up to 10 seconds. Unlike a plain time.sleep(10),
# this returns early if the context stops and re-raises any error reported
# by the streaming job. The boolean result (True if the context stopped
# before the timeout) is stored so the cell does not echo it.
stream_stopped = ssc.awaitTerminationOrTimeout(10)

-------------------------------------------
Time: 2016-03-11 10:48:33
-------------------------------------------
[ 0.  1.  0.  0.  0.  1.]
[ 1.  0.  0.  0.  0.  2.]
[ 0.          0.52380952  0.47619048  0.          0.          3.        ]
[ 0.14285714  0.42857143  0.          0.42857143  0.          4.        ]
[ 0.          0.95238095  0.          0.          0.04761905  5.        ]
[ 0.          0.04761905  0.85714286  0.0952381   0.          6.        ]
[ 0.19047619  0.52380952  0.04761905  0.04761905  0.19047619  7.        ]
[ 0.          0.95238095  0.          0.04761905  0.          8.        ]
[ 0.          0.42857143  0.14285714  0.38095238  0.04761905  9.        ]
[  0.04761905   0.23809524   0.04761905   0.52380952   0.14285714  10.        ]
...

-------------------------------------------
Time: 2016-03-11 10:48:34
-------------------------------------------
[   0.23809524    0.19047619    0.            0.57142857    0.          101.        ]
[   0.            0.61904762   

In [12]:
# Stop the streaming context only: stopSparkContext=False leaves 'sc' usable
# for later cells, and stopGraceFully=True requests a graceful shutdown
# instead of an immediate one.
ssc.stop(stopSparkContext=False, stopGraceFully=True)

-------------------------------------------
Time: 2016-03-11 10:48:42
-------------------------------------------
[  2.85714286e-01   2.85714286e-01   4.76190476e-02   0.00000000e+00
   3.80952381e-01   9.01000000e+02]
[  0.00000000e+00   9.52380952e-01   0.00000000e+00   4.76190476e-02
   0.00000000e+00   9.02000000e+02]
[  8.57142857e-01   0.00000000e+00   0.00000000e+00   1.42857143e-01
   0.00000000e+00   9.03000000e+02]
[  4.76190476e-02   0.00000000e+00   9.52380952e-01   0.00000000e+00
   0.00000000e+00   9.04000000e+02]
[   1.    0.    0.    0.    0.  905.]
[  7.61904762e-01   1.42857143e-01   4.76190476e-02   4.76190476e-02
   0.00000000e+00   9.06000000e+02]
[  0.00000000e+00   8.09523810e-01   9.52380952e-02   9.52380952e-02
   0.00000000e+00   9.07000000e+02]
[  1.90476190e-01   4.76190476e-02   4.76190476e-01   2.85714286e-01
   0.00000000e+00   9.08000000e+02]
[  0.00000000e+00   9.52380952e-01   0.00000000e+00   4.76190476e-02
   0.00000000e+00   9.09000000e+02]
[  9.523