In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f


spark = SparkSession.builder.appName('SparkByExamples.com')\
    .config('spark.driver.bindAddress','localhost')\
    .config("spark.ui.port","4051")\
    .config("spark.driver.memory","5g")\
    .getOrCreate()

23/02/18 00:24:05 WARN Utils: Your hostname, Chaturvedi_PC resolves to a loopback address: 127.0.1.1; using 172.28.61.13 instead (on interface eth0)
23/02/18 00:24:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/02/18 00:24:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
test_path = "/home/abhay/work/dream11/model_data/model_oututs/test"
train_path = "/home/abhay/work/dream11/model_data/model_oututs/train"

In [6]:
test_df = spark.read.parquet(test_path)
test_df.registerTempTable("test_preds")

                                                                                

In [27]:
rank_comparison_df = spark.sql("""
with actual_ranks as (
    select player_id, match_id, 
        fantasy_points, rank() over(partition by match_id order by fantasy_points desc) as actual_rank
    from test_preds
)
, predicted_ranks as  (
    select player_id, match_id, 
         prediction, rank() over(partition by match_id order by prediction desc) as predicted_rank
    from test_preds
)
select actual_ranks.player_id, actual_ranks.match_id, actual_ranks.fantasy_points, actual_ranks.actual_rank, 
    predicted_ranks.prediction, predicted_ranks.predicted_rank
from actual_ranks as actual_ranks 
inner join predicted_ranks as predicted_ranks
where actual_ranks.player_id = predicted_ranks.player_id 
and actual_ranks.match_id = predicted_ranks.match_id 
""")
rank_comparison_df.registerTempTable("compared_ranks")

In [28]:
# preview joined data
spark.sql("""
    select * from compared_ranks order by match_id, player_id, actual_rank
""").limit(100).toPandas()

                                                                                

Unnamed: 0,player_id,match_id,fantasy_points,actual_rank,prediction,predicted_rank
0,12b610c2,1001349,37,7,38.492119,7
1,14f96089,1001349,54,5,45.702792,6
2,32198ae0,1001349,23,11,23.892199,11
3,469ea22b,1001349,29,8,31.392281,8
4,5748e866,1001349,18,14,9.596771,14
...,...,...,...,...,...,...
95,1e182ae2,1003847,29,10,31.165446,7
96,27ca3a11,1003847,36,5,30.346233,10
97,2f873ad6,1003847,3,16,3.641405,15
98,42172c3c,1003847,102,1,81.368358,1


In [31]:
# compare probability of model giving correct top 11 players
spark.sql("""
    select
        (select count(*) from compared_ranks where actual_rank <12 and predicted_rank < 12)/(select count(*) from compared_ranks where actual_rank <12)
""").toPandas()

                                                                                ]

Unnamed: 0,(CAST(scalarsubquery() AS DOUBLE) / CAST(scalarsubquery() AS DOUBLE))
0,0.878581


In [32]:
# compare probability of model giving correct top 2 players, helpful in choosing captain and vice captain
spark.sql("""
    select
        (select count(*) from compared_ranks where actual_rank <3 and predicted_rank < 3)/(select count(*) from compared_ranks where actual_rank <3)
""").toPandas()

                                                                                ]

Unnamed: 0,(CAST(scalarsubquery() AS DOUBLE) / CAST(scalarsubquery() AS DOUBLE))
0,0.579978


In [34]:
spark.sql("""
with gain_matrix as (
    select
        match_id, player_id, 
        fantasy_points/log2(actual_rank + 1) as ideal_discounted_gain, 
        prediction/log2(predicted_rank + 1) as discounted_gain from compared_ranks
)
select sum(discounted_gain)/sum(ideal_discounted_gain) as ndcg from gain_matrix
""").toPandas()

                                                                                

Unnamed: 0,ndcg
0,0.923678
