In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

In [2]:
# Obtain the Spark session used by every later cell, labelled 'test-bq'.
# getOrCreate() hands back an already-running session when one exists; in that
# case Spark warns that only runtime SQL configurations set here take effect.
spark = (
    SparkSession.builder
    .appName('test-bq')
    .getOrCreate()
)

24/03/28 16:52:30 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Echo the session object as the cell output to confirm it was created.
spark

In [9]:
# Read the per-game player box-score table from BigQuery via the 'bigquery'
# data source.
box_df = (
    spark.read
    .format("bigquery")
    .option("table", "dez-nba-analytics.nba_database.player_boxscore_par_cl")
    .load()
)

# Expose the DataFrame to spark.sql() under the view name 'box_df'
# (replaces any existing temp view with that name).
box_df.createOrReplaceTempView("box_df")



In [10]:
# List the column names of the loaded table; the aggregation query below
# refers to these names directly.
box_df.columns

['game_id',
 'game_date',
 'season_type',
 'player_id',
 'player',
 'team',
 'home',
 'away',
 'mins_played',
 'PTS',
 'field_goal_made',
 'field_goal_attempt',
 'field_goal_pct',
 'three_pt_made',
 'three_pt_attempt',
 'three_pt_pct',
 'free_throw_made',
 'free_throw_attempt',
 'free_throw_pct',
 'OREB',
 'DREB',
 'REB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'plusminus',
 'win',
 'season_id']

In [11]:
# Aggregate the per-game box scores in the 'box_df' view into one row per
# (season_id, player_id, player).
#
# Each percentage divides by NULLIF(<total attempts>, 0): a player-season with
# zero attempts yields NULL instead of raising DIVIDE_BY_ZERO when
# spark.sql.ansi.enabled is true. With ANSI mode off, the result is the same
# NULL that a plain division by zero produces.
#
# Grouping uses explicit column names rather than ordinals so the query does
# not depend on select-list order or on spark.sql.groupByOrdinal.
season_stats = spark.sql("""
SELECT
        -- Basic info about the players
        season_id,
        player_id,
        player AS player_name,

        -- Summary of statistics
        SUM(PTS) AS total_points_scored,
        SUM(field_goal_made) AS total_fg_made,
        SUM(field_goal_attempt) AS total_fg_attempts,
        SUM(field_goal_made) / NULLIF(SUM(field_goal_attempt), 0) AS season_shots_percentage,
        SUM(three_pt_made) AS total_three_pt_made,
        SUM(three_pt_attempt) AS total_three_pt_attempts,
        SUM(three_pt_made) / NULLIF(SUM(three_pt_attempt), 0) AS season_three_pt_percentage,
        SUM(free_throw_made) AS total_free_throw_made,
        SUM(free_throw_attempt) AS total_free_throw_attempts,
        SUM(free_throw_made) / NULLIF(SUM(free_throw_attempt), 0) AS season_freethrow_percentage,
        SUM(REB) AS total_rebounds,
        SUM(AST) AS total_assists,
        SUM(STL) AS total_steals,
        SUM(TOV) AS total_turnovers,

        -- General averages
        AVG(PTS) AS avg_points_scored_per_match,
        AVG(field_goal_made) AS avg_shots_per_match,
        AVG(three_pt_made) AS avg_three_pt_shots_per_match
    FROM
        box_df
    GROUP BY
        season_id,
        player_id,
        player
""")

In [16]:
# Persist the season summary to BigQuery, passing the GCS bucket named in
# 'temporaryGcsBucket' to the connector.
#
# mode('overwrite') makes this cell safe to re-run: the DataFrameWriter default
# ('errorifexists') fails as soon as the destination table already exists.
# The table is fully derived from the query above, so replacing it loses nothing.
(
    season_stats.write
    .format('bigquery')
    .option('temporaryGcsBucket', 'dataproc-temp-us-central1-385360674362-ioatwhvx')
    .mode('overwrite')
    .save('dez-nba-analytics.nba_database.season_player_summary')
)

                                                                                