# Process

## Step 1: Get Player Stats from cfbFastR

In [None]:
# Libraries
library(cfbfastR)
library(dplyr)
library(readr)
library(dplyr)
library(readr)
library(stringr)

# Set API key for cfbfastR
Sys.setenv(CFBD_API_KEY = "Scram!")

In [7]:
start_year <- 2005
end_year <- 2005
player_stats_file <- "filtered_player_stats_full_2005.csv"

for (year in start_year:end_year) {
  cat("Processing data for year:", year, "\n")
  
  player_stats <- cfbd_stats_season_player(year = year) %>%
    filter(!is.na(player), player != "Team") %>%
    mutate(year = as.character(year)) %>%
    distinct()
  
  if (file.exists(player_stats_file)) {
    existing_data <- read_csv(player_stats_file) %>%
      mutate(across(everything(), as.character))
    
    player_stats <- player_stats %>%
      mutate(across(everything(), as.character))
    
    combined_data <- bind_rows(existing_data, player_stats) %>%
      distinct()
  } else {
    combined_data <- player_stats
  }
  
  write_csv(combined_data, player_stats_file)
  cat("Data for year", year, "appended to", player_stats_file, "\n")
}

cat("Batch processing complete. All player stats saved to", player_stats_file, "\n")


Processing data for year: 2005 
Data for year 2005 appended to filtered_player_stats_full_2005.csv 
Batch processing complete. All player stats saved to filtered_player_stats_full_2005.csv 


# Step 2: Bring in Combine Data and add athlete_id to it

In [214]:
combine_data <- read_csv("C:/Users/RaymondCarpenter/Documents/GitHub/nfl-draft-data/data/combine_data.csv")
player_stats <- read_csv("filtered_player_stats_full.csv")

# Step 1: Strip dots (.) from names in both datasets
combine_data <- combine_data %>%
  mutate(Name = str_replace_all(Name, "\\.", ""))  # Remove periods from Name

player_stats <- player_stats %>%
  mutate(player = str_replace_all(player, "\\.", ""))  # Remove periods from player

# Step 2: Flag potential duplicate names in player_stats
duplicate_names <- player_stats %>%
  group_by(player) %>%
  filter(n_distinct(athlete_id) > 1) %>%  # Identify names with multiple athlete_ids (ex AJ Green)
  pull(player) %>% 
  unique()

# Step 3: Filter out duplicate names from player_stats to prevent incorrect joins
filtered_player_stats <- player_stats %>%
  filter(!player %in% duplicate_names)

# Step 4: Join to map `athlete_id` to `combine_data` based on Name
updated_combine_data <- combine_data %>%
  left_join(
    filtered_player_stats %>% select(player, athlete_id), 
    by = c("Name" = "player")  # Match on Name
  ) %>%
  mutate(
    athlete_id = coalesce(athlete_id, Id)
  ) %>%
  select(-Id)

# Step 5: Save rows with unresolved athlete_id for manual review
unresolved_rows <- updated_combine_data %>%
  filter(is.na(athlete_id))

write_csv(unresolved_rows, "unresolved_combine_data.csv")
cat("Unresolved rows saved to unresolved_combine_data.csv for manual review.\n")

# Step 6: Save the updated combine data to a new CSV
write_csv(updated_combine_data, "combine_data_with_athlete_id_step1.csv")
cat("Updated combine data saved to combine_data_with_athlete_id_step1.csv\n")

[1mRows: [22m[34m7434[39m [1mColumns: [22m[34m17[39m
[36m--[39m [1mColumn specification[22m [36m--------------------------------------------------------[39m
[1mDelimiter:[22m ","
[31mchr[39m  (3): Name, College, POS
[32mdbl[39m (12): Year, Height (in), Weight (lbs), Hand Size (in), Arm Length (in), ...
[33mlgl[39m  (2): Id, 60Yd Shuttle

[36mi[39m Use `spec()` to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m102813[39m [1mColumns: [22m[34m59[39m
[36m--[39m [1mColumn specification[22m [36m--------------------------------------------------------[39m
[1mDelimiter:[22m ","
[31mchr[39m  (3): team, conference, player
[32mdbl[39m (56): year, athlete_id, passing_completions, passing_att, passing_pct, p...

[36mi[39m Use `spec()` to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set `show

Unresolved rows saved to unresolved_combine_data.csv for manual review.
Updated combine data saved to combine_data_with_athlete_id_step1.csv


In [215]:
# Load the Step 1 CSV
combine_data_step1 <- read_csv("combine_data_with_athlete_id_step1.csv")

# Filter the data to one row per athlete_id, except where athlete_id is NA
filtered_combine_data <- combine_data_step1 %>%
  group_by(athlete_id) %>%
  filter(is.na(athlete_id) | row_number() == 1) %>%  # Keep all NA rows; otherwise, keep the first row per athlete_id
  ungroup()

# Save the filtered data to a new CSV
write_csv(filtered_combine_data, "combine_data_unique_athlete_id_step2.csv")
cat("Filtered combine data saved to combine_data_unique_athlete_id_step2.csv\n")

[1mRows: [22m[34m15876[39m [1mColumns: [22m[34m17[39m
[36m--[39m [1mColumn specification[22m [36m--------------------------------------------------------[39m
[1mDelimiter:[22m ","
[31mchr[39m  (3): Name, College, POS
[32mdbl[39m (13): Year, Height (in), Weight (lbs), Hand Size (in), Arm Length (in), ...
[33mlgl[39m  (1): 60Yd Shuttle

[36mi[39m Use `spec()` to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Filtered combine data saved to combine_data_unique_athlete_id_step2.csv


In [216]:
# Load the datasets
combine_data <- read_csv("combine_data_unique_athlete_id_step2.csv")
player_stats <- read_csv("filtered_player_stats_full.csv")

# Step 1: Standardize and clean the names
combine_data <- combine_data %>%
  mutate(
    Name = str_replace_all(Name, "\\.", ""),  # Remove periods again from Name
    College = tolower(College)               # Standardize case for College
  )

player_stats <- player_stats %>%
  mutate(
    player = str_replace_all(player, "\\.", ""),  # Remove dots from player
    team = tolower(team)                         # Standardize case for team
  )

# Step 2: Perform the join and coalesce athlete_id
updated_combine_data <- combine_data %>%
  left_join(
    player_stats %>% select(player, team, athlete_id),  # Select relevant columns for matching
    by = c("Name" = "player", "College" = "team")       # Match on Name and College
  ) %>%
  mutate(
    athlete_id = coalesce(athlete_id.x, athlete_id.y),  # Resolve athlete_id.x and athlete_id.y
    athlete_id.x = NULL,                               # Drop unnecessary columns
    athlete_id.y = NULL
  )

# Debugging: Check the structure and content of the join result
cat("Structure of updated_combine_data after join and coalesce:\n")
str(updated_combine_data)

# Debugging: Count unresolved athlete_ids
unresolved_count <- updated_combine_data %>%
  filter(is.na(athlete_id)) %>%
  nrow()

cat("Number of rows with unresolved athlete_id:", unresolved_count, "\n")

# Step 3: Save rows with unresolved athlete_id for manual review
unresolved_rows <- updated_combine_data %>%
  filter(is.na(athlete_id))

write_csv(unresolved_rows, "unresolved_name_and_school.csv")
cat("Unresolved rows saved to unresolved_name_and_school.csv for manual review.\n")

# Step 4: Save the updated dataset
write_csv(updated_combine_data, "name_and_school_check_step3.csv")
cat("Updated data saved to name_and_school_check_step3.csv\n")


[1mRows: [22m[34m7407[39m [1mColumns: [22m[34m17[39m
[36m--[39m [1mColumn specification[22m [36m--------------------------------------------------------[39m
[1mDelimiter:[22m ","
[31mchr[39m  (3): Name, College, POS
[32mdbl[39m (13): Year, Height (in), Weight (lbs), Hand Size (in), Arm Length (in), ...
[33mlgl[39m  (1): 60Yd Shuttle

[36mi[39m Use `spec()` to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m102813[39m [1mColumns: [22m[34m59[39m
[36m--[39m [1mColumn specification[22m [36m--------------------------------------------------------[39m
[1mDelimiter:[22m ","
[31mchr[39m  (3): team, conference, player
[32mdbl[39m (56): year, athlete_id, passing_completions, passing_att, passing_pct, p...

[36mi[39m Use `spec()` to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set `show_col

Structure of updated_combine_data after join and coalesce:
tibble [14,279 x 17] (S3: tbl_df/tbl/data.frame)
 $ Year           : num [1:14279] 2024 2024 2024 2024 2024 ...
 $ Name           : chr [1:14279] "Kris Abrams-Draine" "Kris Abrams-Draine" "Kris Abrams-Draine" "Kris Abrams-Draine" ...
 $ College        : chr [1:14279] "missouri" "missouri" "missouri" "missouri" ...
 $ POS            : chr [1:14279] "CB" "CB" "CB" "CB" ...
 $ Height (in)    : num [1:14279] 71.4 71.4 71.4 71.4 76.2 ...
 $ Weight (lbs)   : num [1:14279] 179 179 179 179 315 206 252 235 235 235 ...
 $ Hand Size (in) : num [1:14279] 8.63 8.63 8.63 8.63 9 ...
 $ Arm Length (in): num [1:14279] 31 31 31 31 33.9 ...
 $ Wonderlic      : num [1:14279] NA NA NA NA NA NA NA NA NA NA ...
 $ 40 Yard        : num [1:14279] 4.44 4.44 4.44 4.44 5.22 NA NA NA NA NA ...
 $ Bench Press    : num [1:14279] NA NA NA NA 22 NA NA 26 26 26 ...
 $ Vert Leap (in) : num [1:14279] 33.5 33.5 33.5 33.5 24.5 NA NA 32 32 32 ...
 $ Broad Jump (in):

In [218]:
combine_data <- read_csv("name_and_school_check_step3.csv")

# Remove duplicates based on `athlete_id`, keeping all `NA` rows
unique_combine_data <- combine_data %>%
  filter(is.na(athlete_id) | !duplicated(athlete_id))  # Keep all NA and first occurrence of non-NA athlete_id

# Debugging: Count the rows before and after filtering
cat("Rows before filtering duplicates:", nrow(combine_data), "\n")
cat("Rows after filtering duplicates:", nrow(unique_combine_data), "\n")

# Save the updated dataset
write_csv(unique_combine_data, "combine_data_unique_athlete_id_step4.csv")
cat("Updated dataset saved to combine_data_unique_athlete_id_step4.csv\n")

[1mRows: [22m[34m14279[39m [1mColumns: [22m[34m17[39m
[36m--[39m [1mColumn specification[22m [36m--------------------------------------------------------[39m
[1mDelimiter:[22m ","
[31mchr[39m  (3): Name, College, POS
[32mdbl[39m (13): Year, Height (in), Weight (lbs), Hand Size (in), Arm Length (in), ...
[33mlgl[39m  (1): 60Yd Shuttle

[36mi[39m Use `spec()` to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Rows before filtering duplicates: 14279 
Rows after filtering duplicates: 7430 
Updated dataset saved to combine_data_unique_athlete_id_step4.csv


In [219]:
combine_data <- read_csv("combine_data_unique_athlete_id_step4.csv")

# Filter rows where athlete_id is NA
na_athlete_ids <- combine_data %>%
  filter(is.na(athlete_id))

# Debugging: Display the number of rows with NA athlete_id
cat("Number of rows with NA athlete_id:", nrow(na_athlete_ids), "\n")

# Save the rows with NA athlete_id to a separate CSV for review
write_csv(na_athlete_ids, "na_athlete_ids_inspection.csv")
cat("Rows with NA athlete_id saved to na_athlete_ids_inspection.csv\n")

# Print the first few rows to the console for quick review
print(head(na_athlete_ids))

[1mRows: [22m[34m7430[39m [1mColumns: [22m[34m17[39m
[36m--[39m [1mColumn specification[22m [36m--------------------------------------------------------[39m
[1mDelimiter:[22m ","
[31mchr[39m  (3): Name, College, POS
[32mdbl[39m (13): Year, Height (in), Weight (lbs), Hand Size (in), Arm Length (in), ...
[33mlgl[39m  (1): 60Yd Shuttle

[36mi[39m Use `spec()` to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Number of rows with NA athlete_id: 2461 
Rows with NA athlete_id saved to na_athlete_ids_inspection.csv
[90m# A tibble: 6 x 17[39m
   Year Name         College POS   `Height (in)` `Weight (lbs)` `Hand Size (in)`
  [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m        [3m[90m<chr>[39m[23m   [3m[90m<chr>[39m[23m         [3m[90m<dbl>[39m[23m          [3m[90m<dbl>[39m[23m            [3m[90m<dbl>[39m[23m
[90m1[39m  [4m2[24m024 Isaiah Adams illino~ OG             76.2            315             9   
[90m2[39m  [4m2[24m024 Kiran Amega~ yale    OT             77.4            323             9.63
[90m3[39m  [4m2[24m024 Gottlieb Ay~ maryla~ OG             76.1            308             9.88
[90m4[39m  [4m2[24m024 Karsen Barn~ michig~ OG             76.5            306             9.88
[90m5[39m  [4m2[24m024 Tanor Borto~ wiscon~ C              76.2            303            10   
[90m6[39m  [4m2[24m024 Millard Bra~ texas ~ FS             70.5     

### Manually Update a Few (Bucky Irving, Frank Gore Jr, etc.)