In [1]:
import numpy as np
import pandas as pd
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType

In [2]:
# Feature period used for scoring. These two values are meant to be the only
# ones edited between runs. Features are taken from two months before the
# campaign start: a campaign starting in December 2017 uses the graph
# features from October 2017.
year_for_graph_features, month_for_graph_features = 2018, 1

In [4]:
# Graph features for the configured year/month, restricted to the rows that
# are not flagged as training data (i.e. the rows to be scored).
graph_table = spark.read.table(
    "tests_es.cdr_graph_features_no_communities_ads"
).filter(
    (col("year") == year_for_graph_features)
    & (col("month") == month_for_graph_features)
    & (col("for_training") == False)  # noqa: E712 -- Column comparison, not a Python bool test
)

In [5]:
# Number of partitions Spark SQL uses when shuffling data (joins, aggregations,
# sorts) for the rest of this session.
spark.conf.set(key="spark.sql.shuffle.partitions", value=256)

In [6]:
from pyspark.ml import PipelineModel

# Load the fitted pipeline that scores the graph features. The file name
# suggests it was saved on 2017-12-21 -- update the path when a newer model
# is trained.
most_recent_model = PipelineModel.load(
    "hdfs:///user/jsotovi2/captacion/2017_12_21__modelo_captacion.sparkmodel"
)

In [7]:
# Display the loaded model's repr as a quick check that loading succeeded.
most_recent_model

PipelineModel_4c5092da55818b13742f

In [8]:
def _probability_at_index_one(probability):
    """Return element 1 of a model ``probability`` vector as a Python float.

    Element 1 is presumably the positive class of a binary classifier
    (confirm against the model's label indexing).

    The vector is indexed directly instead of going through ``tolist()``:
    ``DenseVector`` only exposes ``tolist`` by delegating unknown attributes
    to its NumPy array, and ``SparseVector`` does not have it at all.
    ``float()`` turns the NumPy scalar into the plain Python float that a
    ``DoubleType`` UDF has to return.
    """
    return float(probability[1])


get_score_udf = udf(_probability_at_index_one, DoubleType())

predictions = (
    most_recent_model.transform(graph_table)
    # Keep only rows whose "es_vodafone" flag is "no".
    .where(col("es_vodafone") == "no")
    # Scalar score extracted from the model's probability vector.
    .withColumn("raw_score", get_score_udf(col("probability")))
    .withColumnRenamed("id", "phone")
    # Highest scores first.
    .orderBy("raw_score", ascending=False)
)

In [9]:
# Preview the scored phones; show() prints only the first 20 rows.
predictions.select("phone", "raw_score").show()

+------------+--------------------+
|       phone|           raw_score|
+------------+--------------------+
|   973653474| 0.13950156433937558|
|   650761778|  0.0984621485909407|
| 35014195292| 0.08240872926947557|
|   673357489| 0.07597975865275887|
|   619850788| 0.06711827672474706|
| 35025123536| 0.06530321864550215|
|   682098409| 0.06105028209674771|
|   664115217|0.057608033632736165|
|   684268201|    0.05645249045128|
|   682151466|0.056178628412901395|
|   644637978|0.055715358415587245|
|   657132163|0.054779662122408763|
|   685260904|  0.0535736884682613|
|   647494026| 0.04907705452746219|
|   628875380| 0.04715951911512765|
|   627315998| 0.04711017090023518|
|   611756138|  0.0452279378103566|
|212795760386|0.045066851093954564|
|   921771816| 0.04322195986108093|
|   685559228| 0.04297645730518429|
+------------+--------------------+
only showing top 20 rows



In [10]:
# Persist the scored phones as a Parquet-backed table.
#
# The table suffix is the campaign month (YYYYMM), which is two months after
# the feature month: features from 2017-10 feed the campaign starting in
# 2017-12, and features from 2018-01 feed campaign 201803. Deriving it from the
# feature period avoids having to edit the table name by hand for every run.
#
# NOTE: saveAsTable uses the default save mode, so it fails if the table
# already exists.
# TODO: the phone identifiers still need to be de-anonymized before use.
_campaign_month_index = (
    year_for_graph_features * 12 + (month_for_graph_features - 1) + 2
)
predictions_table_name = (
    "tests_es.predictions_captation_{:04d}{:02d}_notprepared".format(
        _campaign_month_index // 12,
        _campaign_month_index % 12 + 1,
    )
)

(
    predictions
    .select("phone", "raw_score")
    .write
    .format("parquet")
    .saveAsTable(predictions_table_name)
)