# Data Gathering and Pre-processing

Import the required libraries...

In [103]:
import csv
import pickle
import os
import pandas as pd
import numpy as np
import nfl_data_py as nfl

## Gathering

NFL play-by-play data can be found [here](https://github.com/nflverse/nflverse-data/releases/tag/pbp). In the next cell we read the season CSV files (expected under `./pbp_csv/`) and split each one into per-game DataFrames, so they're easier to work with. This part may take a while, as the CSV files are pretty big.

In [104]:
# all_seasons[s] is the list of per-game DataFrames parsed from the s-th season file.
all_seasons = []
# loop through the play-by-play csv files, one file per season
for i in range(1999, 2023):
    # open the csv file for the year (i is the year).
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are interpreted by the reader and not by open().
    with open(f'./pbp_csv/play_by_play_{i}.csv', 'r', newline='') as f:
        reader = csv.reader(f)

        all_games = []
        game = []
        # starting_row is the first row of the csv, used as the column names
        starting_row = next(reader)
        # look the 'desc' column up in the header rather than hard-coding its
        # position (it was assumed to be index 27, i.e. spreadsheet column AB)
        desc_idx = starting_row.index('desc')

        # loop through the remaining rows of the csv
        for row in reader:
            # csv.reader yields an empty list for a blank line; there is nothing to keep
            if not row:
                continue
            # skip a repeated header row
            if row[0] == 'play_id':
                continue

            desc = row[desc_idx]
            # rows whose description is exactly 'GAME' are not stored
            if desc == 'GAME':
                continue

            if desc == 'END GAME':
                # 'END GAME' closes the current game: turn the collected rows into a
                # DataFrame and start a fresh list. Skip the DataFrame when no rows
                # were collected, since an empty frame cannot take the header as columns.
                if game:
                    all_games.append(pd.DataFrame(game, columns=starting_row))
                game = []
            else:
                # add the row to the current game's list of plays
                game.append(row)

        # NOTE(review): rows collected after the last 'END GAME' of a file are not
        # turned into a game (same as before) - confirm that this is intended.
        all_seasons.append(all_games)

In [105]:
# Drive-level columns that are set aside per game before being removed from
# the game frames (they are also part of DROP_COLS below).
TARGET_COLS = [
    'fixed_drive',
    'fixed_drive_result',
    'drive_time_of_possession',
    'drive_play_count',
    'drive_first_downs',
    'drive_inside20',
    'drive_ended_with_score',
    'drive_game_clock_end',
    'drive_end_yard_line',
]

# Columns that are not needed and are removed from every game frame.
# Defined once here instead of being rebuilt for every game.
DROP_COLS = [
    'game_id',
    'old_game_id',
    'desc',
    'play_id',
    'game_date',
    'ydsnet',
    'no_score_prob',
    'opp_fg_prob',
    'opp_safety_prob',
    'opp_td_prob',
    'fg_prob',
    'safety_prob',
    'td_prob',
    'extra_point_prob',
    'two_point_conversion_prob',
    'ep',
    'epa',
    'total_home_epa',
    'total_away_epa',
    'total_home_rush_epa',
    'total_away_rush_epa',
    'total_home_pass_epa',
    'total_away_pass_epa',
    'air_epa',
    'yac_epa',
    'comp_air_epa',
    'comp_yac_epa',
    'total_home_comp_air_epa',
    'total_away_comp_air_epa',
    'total_home_comp_yac_epa',
    'total_away_comp_yac_epa',
    'total_home_raw_air_epa',
    'total_away_raw_air_epa',
    'total_home_raw_yac_epa',
    'total_away_raw_yac_epa',
    'wp',
    'def_wp',
    'home_wp',
    'away_wp',
    'wpa',
    'vegas_wpa',
    'vegas_home_wpa',
    'home_wp_post',
    'away_wp_post',
    'vegas_wp',
    'vegas_home_wp',
    'total_home_rush_wpa',
    'total_away_rush_wpa',
    'total_home_pass_wpa',
    'total_away_pass_wpa',
    'air_wpa',
    'yac_wpa',
    'comp_air_wpa',
    'comp_yac_wpa',
    'total_home_comp_air_wpa',
    'total_away_comp_air_wpa',
    'total_home_comp_yac_wpa',
    'total_away_comp_yac_wpa',
    'total_home_raw_air_wpa',
    'total_away_raw_air_wpa',
    'total_home_raw_yac_wpa',
    'total_away_raw_yac_wpa',
    'passer_player_id',
    'passer_player_name',
    'receiver_player_id',
    'receiver_player_name',
    'rusher_player_id',
    'rusher_player_name',
    'lateral_receiver_player_id',
    'lateral_receiver_player_name',
    'lateral_rusher_player_id',
    'lateral_rusher_player_name',
    'lateral_sack_player_id',
    'lateral_sack_player_name',
    'interception_player_id',
    'interception_player_name',
    'lateral_interception_player_id',
    'lateral_interception_player_name',
    'punt_returner_player_id',
    'punt_returner_player_name',
    'lateral_punt_returner_player_id',
    'lateral_punt_returner_player_name',
    'kickoff_returner_player_name',
    'kickoff_returner_player_id',
    'lateral_kickoff_returner_player_id',
    'lateral_kickoff_returner_player_name',
    'punter_player_id',
    'punter_player_name',
    'kicker_player_name',
    'kicker_player_id',
    'own_kickoff_recovery_player_id',
    'own_kickoff_recovery_player_name',
    'blocked_player_id',
    'blocked_player_name',
    'tackle_for_loss_1_player_id',
    'tackle_for_loss_1_player_name',
    'tackle_for_loss_2_player_id',
    'tackle_for_loss_2_player_name',
    'qb_hit_1_player_id',
    'qb_hit_1_player_name',
    'qb_hit_2_player_id',
    'qb_hit_2_player_name',
    'forced_fumble_player_1_player_id',
    'forced_fumble_player_1_player_name',
    'forced_fumble_player_2_player_id',
    'forced_fumble_player_2_player_name',
    'solo_tackle_1_player_id',
    'solo_tackle_1_player_name',
    'solo_tackle_2_player_id',
    'solo_tackle_2_player_name',
    'assist_tackle_1_player_id',
    'assist_tackle_1_player_name',
    'assist_tackle_2_player_id',
    'assist_tackle_2_player_name',
    'assist_tackle_3_player_id',
    'assist_tackle_3_player_name',
    'assist_tackle_4_player_id',
    'assist_tackle_4_player_name',
    'tackle_with_assist_1_player_id',
    'tackle_with_assist_1_player_name',
    'tackle_with_assist_2_player_id',
    'tackle_with_assist_2_player_name',
    'pass_defense_1_player_id',
    'pass_defense_1_player_name',
    'pass_defense_2_player_id',
    'pass_defense_2_player_name',
    'fumbled_1_player_id',
    'fumbled_1_player_name',
    'fumbled_2_player_id',
    'fumbled_2_player_name',
    'fumble_recovery_1_player_id',
    'fumble_recovery_1_player_name',
    'fumble_recovery_2_player_id',
    'fumble_recovery_2_player_name',
    'sack_player_id',
    'sack_player_name',
    'half_sack_1_player_id',
    'half_sack_1_player_name',
    'half_sack_2_player_id',
    'half_sack_2_player_name',
    'penalty_player_id',
    'penalty_player_name',
    'drive_quarter_end',
    'fixed_drive',
    'fixed_drive_result',
    'drive_time_of_possession',
    'drive_play_count',
    'drive_first_downs',
    'drive_inside20',
    'drive_ended_with_score',
    'drive_game_clock_end',
    'drive_end_yard_line',
    'drive_play_id_started',
    'drive_play_id_ended',
    'away_score',
    'home_score',
    'result',
    'total',
    'total_line',
    'spread_line',
    'passer',
    'passer_jersey_number',
    'rusher',
    'rusher_jersey_number',
    'receiver',
    'receiver_jersey_number',
    'passer_id',
    'rusher_id',
    'receiver_id',
    'name',
    'jersey_number',
    'id',
    'fantasy_player_id',
    'fantasy_player_name',
    'fantasy',
    'fantasy_id',
    'qb_epa',
    'xyac_epa',
    'pass_oe',
]

# all_targets mirrors all_seasons: all_targets[s][g] holds the TARGET_COLS
# columns of all_seasons[s][g]. Previously these values were copied into a
# temporary frame and then lost when the same columns were dropped.
#
# NOTE: the game frames are modified in place, so this cell can only run once
# per load. Running it again without re-running the loading cell raises a
# KeyError, because the columns are already gone.
all_targets = []
# loop through every game of every season
for season in all_seasons:
    season_targets = []
    for game in season:
        # copy the target columns out before they are removed from the game frame
        target_vals = game[TARGET_COLS].copy()
        season_targets.append(target_vals)
        # drop the columns that are not needed
        game.drop(columns=DROP_COLS, inplace=True)
    all_targets.append(season_targets)

Next, gather data on team ratings by season. (TODO: add the data source.)

In [106]:
# Write the string form of every game in the last entry of all_seasons to a
# txt file, one after another, so the result can be checked by eye.
with open('2022_df.txt', 'w') as dump:
    dump.writelines(str(frame) + '\n' for frame in all_seasons[-1])

In [107]:
# start the training
# X holds the entries of all_seasons, in the same order (a new list, same elements)
X = list(all_seasons)

# Y starts out empty
Y = []
