-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnba_stats_scrape2.py
83 lines (56 loc) · 2.18 KB
/
nba_stats_scrape2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd
from nba_api.stats.endpoints import boxscoreadvancedv2
from nba_api.stats.endpoints import boxscoretraditionalv2
from nba_api.stats.endpoints import leaguegamelog
# Wall-clock start; used for the elapsed-time report printed at the very end.
start_time = time.time()

# The single command-line argument is the year expressed as an offset from
# 2000 (e.g. "19" -> 2019). The raw text is also embedded in the output file
# names, so it is kept as a string alongside its integer value.
if len(sys.argv) < 2:
    sys.exit('Usage: ' + sys.argv[0] + ' <year offset from 2000, e.g. 19>')
year = sys.argv[1]
try:
    year_offset = int(year)
except ValueError:
    sys.exit('Year must be an integer offset from 2000, got ' + repr(year))

fails_name = 'fails' + year + '.csv'
result_name = 'result' + year + '.csv'
start_year = 2000 + year_offset
end_year = 2000 + year_offset
print('Processing year ', start_year)

# Gather the distinct GAME_ID values from the league game log of each
# requested season value (start_year..end_year inclusive; currently one year).
games = []
for season_id in range(start_year, end_year + 1):
    seasongames = leaguegamelog.LeagueGameLog(
        season=str(season_id), timeout=15
    ).get_data_frames()[0]
    games += list(seasongames['GAME_ID'].unique())
print('Successfully got game ids!')

game_count = len(games)
print('Game count ', game_count)

# Persist the id list so it can be inspected independently of the box scores.
df_games = pd.DataFrame(games, columns=['Games'])
df_games.to_csv('games.csv', index=False)

# For a quick trial run, truncate the list here, e.g.:
# games = games[0:101]

# Shared state read/updated by addGame():
max_attempts = 3    # total tries per game before it is recorded as failed
failed_games = []   # ids of games that exhausted every attempt
count = 0           # number of games fetched successfully so far
# Serialises updates to the global success counter: addGame() runs on several
# pool threads at once and `count += 1` is not an atomic operation.
_count_lock = threading.Lock()


def addGame(game_id, attempts=0):
    """Fetch the traditional box score for one game, retrying on failure.

    Args:
        game_id: Identifier passed to ``BoxScoreTraditionalV2``.
        attempts: Number of tries already made for this game; callers
            normally leave it at 0.

    Returns:
        The first data frame of the box-score response, or ``None`` once
        ``max_attempts`` tries have failed (the id is then appended to the
        module-level ``failed_games`` list).
    """
    global count
    while True:
        try:
            data_frame = boxscoretraditionalv2.BoxScoreTraditionalV2(
                game_id, timeout=10
            ).get_data_frames()[0]
        except Exception as err:
            # Broad catch is deliberate: any failure of the request or of
            # unpacking its response is treated as retryable.
            if attempts >= max_attempts - 1:
                print("Couldn't add game ", game_id, ' error: ', repr(err))
                failed_games.append(game_id)
                return None
            # Exponential back-off: 1s, 2s, 4s, ... between tries.
            time.sleep(2 ** attempts)
            attempts += 1
            continue

        # Take a private snapshot under the lock so the progress check below
        # sees exactly the value this thread produced.
        with _count_lock:
            count += 1
            successes = count
        if successes % 100 == 0:
            print('Successes ', successes, ' out of ', game_count)
        return data_frame
# Download every game's box score concurrently. The frames are gathered in a
# list and concatenated once at the end; concatenating inside the loop would
# recopy the whole accumulated table for every finished game.
max_workers = 16
frames = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    print('Max workers = ', max_workers)
    these_futures = [executor.submit(addGame, game) for game in games]
    # Results are consumed in completion order, so row order in the output
    # is not tied to the order of `games`.
    for future in as_completed(these_futures):
        result = future.result()
        if result is not None:
            frames.append(result)

# pd.concat() raises on an empty list, so fall back to an empty frame when
# no game could be fetched.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print('Finished processing')
df.to_csv(result_name, index=False)

# Record the ids that never succeeded so they can be retried separately.
df_fails = pd.DataFrame(failed_games, columns=['Failed Games'])
df_fails.to_csv(fails_name, index=False)
print("--- %s seconds ---" % (time.time() - start_time))