In [6]:
!pip install --upgrade pandas-gbq google-cloud-bigquery




In [7]:
# Colab-only step: authenticate the current notebook user.
# NOTE(review): presumably this supplies the credentials that the BigQuery
# client in the next cell picks up implicitly (no credentials are passed there) — confirm.
from google.colab import auth
auth.authenticate_user()


In [8]:
import pandas as pd
from google.cloud import bigquery

# BigQuery coordinates of the per-week aggregated tables.
project_id = "bigdatabowl2025"
dataset_id = "bdb_tables"

# The client is only told which project to use; no explicit credentials are passed.
client = bigquery.Client(project=project_id)

# Dataset-qualified names of the nine weekly tables (weeks 1 through 9).
week_tables = [
    f"{dataset_id}.aggregated_week{week_no}_df" for week_no in range(1, 10)
]

# Pull each weekly table in full and tag its rows with a "week_<n>" label,
# where <n> is the table's 1-based position in week_tables.
dataframes = []
for week_no, table_name in enumerate(week_tables, start=1):
    weekly_rows = client.query(
        f"SELECT * FROM `{project_id}.{table_name}`"
    ).to_dataframe()
    weekly_rows["week"] = f"week_{week_no}"
    dataframes.append(weekly_rows)

# Stack the weekly frames into a single frame with a freshly numbered index.
final_df = pd.concat(dataframes, ignore_index=True)

# Quick look at the result: first rows, then overall dimensions.
print(final_df.head())
print(f"Final DataFrame shape: {final_df.shape}")

       gameId  playId  target_rush   qb_depth  rb_depth  fb_depth  \
0  2022091100    4068            0  14.990002  9.990002       0.0   
1  2022091100    4131            0   1.129998  6.469998       0.0   
2  2022091101    4128            0   0.949999  4.139999       0.0   
3  2022091101    4174            0   1.040001  4.080001       0.0   
4  2022091103    5120            0   1.069999  3.699999       0.0   

   absoluteYardlineNumber  preSnapHomeTeamWinProbability  \
0                      70                       0.434159   
1                      77                       0.439419   
2                      59                       0.462624   
3                      50                       0.465353   
4                      75                       0.330262   

   preSnapVisitorTeamWinProbability offenseFormation  ... TE_snap_motion  \
0                          0.565841               NA  ...              0   
1                          0.560581               NA  ...              0

In [9]:
# Columns whose value distributions are printed below, grouped by theme.
columns_to_check = [
    # Offensive formation / alignment and defensive coverage labels
    "offenseFormation", "receiverAlignment", "pff_passCoverage", "pff_manZone",
    "target_rush",
    # *_motion columns
    "TE_snap_motion", "RB_motion", "WR_motion", "FB_motion", "TE_motion",
    # *_shift columns
    "RB_shift", "WR_shift", "FB_shift", "TE_shift",
]

# Print one frequency table per column. dropna=False keeps NaN as its own bucket.
# NOTE(review): the recorded output shows a literal "NA" label in some columns;
# if that is a string rather than a true missing value, dropna has no effect on it — confirm.
for column in columns_to_check:
    print(f"Value counts for {column}:")
    counts_with_missing = final_df[column].value_counts(dropna=False)
    print(counts_with_missing)
    print("\n")


Value counts for offenseFormation:
offenseFormation
SHOTGUN       8695
SINGLEBACK    3888
EMPTY         1321
I_FORM        1018
PISTOL         633
NA             187
JUMBO          124
WILDCAT         37
Name: count, dtype: int64


Value counts for receiverAlignment:
receiverAlignment
2x2    6411
3x1    5975
2x1    1787
3x2    1206
NA      187
1x1     154
4x1     121
2x0      44
3x0       9
1x0       8
3x3       1
Name: count, dtype: int64


Value counts for pff_passCoverage:
pff_passCoverage
Cover-3                 4895
Cover-1                 3263
Quarters                2035
Cover-2                 1827
Cover 6-Left             684
Cover-6 Right            680
Cover-3 Seam             631
Cover-0                  588
Red Zone                 527
NA                       191
2-Man                    185
Goal Line                141
Bracket                   75
Cover-1 Double            53
Prevent                   46
Cover-3 Cloud Right       30
Cover-3 Cloud Left        30
Miscellan

In [10]:
from pandas_gbq import to_gbq

# Destination for the combined frame, written as "<dataset>.<table>".
output_table = f"{dataset_id}.final_aggregated_data"

# Upload final_df to BigQuery. if_exists="replace" overwrites a table of the
# same name if one is already present.
to_gbq(
    dataframe=final_df,
    destination_table=output_table,
    project_id=project_id,
    if_exists="replace",
)

print(f"Final DataFrame saved to BigQuery table {output_table}")


100%|██████████| 1/1 [00:00<00:00, 801.82it/s]

Final DataFrame saved to BigQuery table bdb_tables.final_aggregated_data





In [11]:
# Write the combined frame to a CSV in the working directory, omitting the index.
file_name = "final_bdb_aggregated_data.csv"
final_df.to_csv(path_or_buf=file_name, index=False)

# Colab-only: request a download of the file that was just written.
from google.colab import files
files.download(file_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>