In [None]:
# Load data/nba_2017_br.csv into `nba` and display its summary statistics.
import pandas as pd
nba = pd.read_csv("data/nba_2017_br.csv")
# Last expression of the cell: rendered as the cell output.
nba.describe()

In [None]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline

In [None]:
# Load every 2017 NBA source CSV into its own DataFrame; all paths are
# relative to the notebook's working directory (../data).
attendance_df = pd.read_csv("../data/nba_2017_attendance.csv")
endorsement_df = pd.read_csv("../data/nba_2017_endorsements.csv")
valuations_df = pd.read_csv("../data/nba_2017_team_valuations.csv")
salary_df = pd.read_csv("../data/nba_2017_salary.csv")
pie_df = pd.read_csv("../data/nba_2017_pie.csv")
plus_minus_df = pd.read_csv("../data/nba_2017_real_plus_minus.csv")
br_stats_df = pd.read_csv("../data/nba_2017_br.csv")
elo_df = pd.read_csv("../data/nba_2017_elo.csv")

In [None]:
# Inner-join attendance and valuation rows that share the same TEAM value.
attendance_valuation_df = pd.merge(
    attendance_df,
    valuations_df,
    how="inner",
    on="TEAM",
)
attendance_valuation_df.head()

In [None]:
# `display`/`HTML` are public in IPython.display; the IPython.core.display
# import path is deprecated.
from IPython.display import HTML, display

# Widen the notebook container so the pair plot has room to render.
display(HTML("<style>.container{ width:100% !important; }</style>"))

# One scatter/histogram panel per pair of columns, coloured by team.
sns.pairplot(attendance_valuation_df, hue="TEAM");

In [None]:
# The frame carries the string TEAM column (it is the merge key and the
# pairplot hue), so restrict the correlation to numeric columns; pandas >= 2
# raises on non-numeric data instead of silently dropping it.
corr = attendance_valuation_df.corr(numeric_only=True)
sns.heatmap(
    corr,
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
);

In [None]:
# Create the figure and its single Axes together and draw on that Axes,
# instead of calling plt.axes() on top of the one plt.subplots() already made.
fig, ax = plt.subplots(figsize=(20, 15))
ax.set_title("NBA Team AVG Attendance vs Valuation in Millions: 2016-2017 Season")
# NOTE(review): `valuations` is not defined in any cell visible above this
# one - confirm where it is built (presumably a pivot of
# attendance_valuation_df) before running on a fresh kernel.
sns.heatmap(valuations, linewidths=.5, annot=True, fmt='g', ax=ax);

In [None]:
# Ordinary least squares: team valuation regressed on total attendance.
results = smf.ols(
    formula='VALUE_MILLIONS ~TOTAL_MILLIONS',
    data=attendance_valuation_df,
).fit()
print(results.summary())

In [None]:
# Residuals of VALUE_MILLIONS regressed on TOTAL_MILLIONS.
sns.residplot(
    data=attendance_valuation_df,
    x="TOTAL_MILLIONS",
    y="VALUE_MILLIONS",
)

In [None]:
# Import the submodule explicitly; a bare `import statsmodels` only exposes
# `statsmodels.tools.eval_measures` if something else already loaded it.
import statsmodels.tools.eval_measures

# Build the prediction frame here so this cell runs on a fresh kernel; it
# previously read `attendance_valuation_predictions_df` before any cell had
# created it.
attendance_valuation_predictions_df = attendance_valuation_df.assign(
    predicted=results.predict()
)

# Root-mean-squared error between fitted and observed valuations.
rmse = statsmodels.tools.eval_measures.rmse(
    attendance_valuation_predictions_df["predicted"],
    attendance_valuation_predictions_df["VALUE_MILLIONS"],
)
rmse

In [None]:
# New frame = attendance/valuation data plus the model's fitted values.
attendance_valuation_predictions_df = attendance_valuation_df.assign(
    predicted=results.predict()
)
# Predicted vs. actual valuation with a fitted regression line.
sns.lmplot(
    data=attendance_valuation_predictions_df,
    x="predicted",
    y="VALUE_MILLIONS",
)

In [None]:
# Load the combined team-level CSV used for the clustering cells below.
val_housing_win_df= pd.read_csv("../data/nba_2017_att_val_elo_win_housing.csv")
# Show the available column names (last expression is the cell output).
val_housing_win_df.columns

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Keep only the four numeric features that are scaled and clustered.
numerical_df = val_housing_win_df.loc[
    :,
    [
        "TOTAL_ATTENDANCE_MILLIONS",
        "ELO",
        "VALUE_MILLIONS",
        "MEDIAN_HOME_PRICE_COUNTY_MILLONS",
    ],
]

# Fit the min-max scaler on those features, then print the fitted scaler
# and the scaled matrix.
scaler = MinMaxScaler()
print(scaler.fit(numerical_df))
print(scaler.transform(numerical_df))

In [None]:
from sklearn.cluster import KMeans

# Fix the seed: k-means starts from random centroids, so without it the
# cluster ids written to the frame change from run to run.
k_means = KMeans(n_clusters=3, random_state=0)
kmeans = k_means.fit(scaler.transform(numerical_df))

# One label per row of numerical_df, stored alongside the team data.
val_housing_win_df['cluster'] = kmeans.labels_
val_housing_win_df.head()

In [None]:
# Persist the team frame (including the `cluster` column) for later use.
val_housing_win_df.to_csv("../data/nba_2017_att_val_elo_win_housing_cluster.csv")

In [None]:
# R code (not Python): load the plotting and CSV-reading packages, then read
# the clustered team file written by the Python cells.
# NOTE(review): both paths below are machine-specific absolute/home paths.
library("scatterplot3d",
        lib.loc = "/Library/Frameworks/R.framework/Versions/3.4/Resources/library")
# read_csv(), cols() and col_skip() come from readr.
library("readr")

team_cluster <- read_csv(
  "~/src/aibook/src/chapter7/data/nba_2017_att_val_elo_win_housing_cluster.csv",
  # Skip the column named X1.
  col_types = cols(X1 = col_skip())
)

In [None]:
# Flatten `column` with unlist() and coerce the result to a numeric vector.
cluster_to_numeric <- function(column) {
  converted_column <- as.numeric(unlist(column))
  return(converted_column)
}

In [None]:
"""
Example Route To Construct:
https://wikimedia.org/api/rest_v1/ +
metrics/pageviews/per-article/ +
en.wikipedia/all-access/user/ +
LeBron_James/daily/2015070100/2017070500 +
"""
import requests
import pandas as pd
import time
import wikipedia
# Common prefix of every per-article pageviews request; the handle, period,
# start and end segments are appended by construct_url().
BASE_URL = (
    "https://wikimedia.org/api/rest_v1/"
    "metrics/pageviews/per-article/en.wikipedia/all-access/user"
)

In [None]:
def construct_url(handle, period, start, end):
    """Build the pageviews URL for one article.

    Joins BASE_URL and the four arguments with "/", so the tail of the
    result looks like:
        /LeBron_James/daily/2015070100/2017070500
    """
    return '/'.join((BASE_URL, handle, period, start, end))
def query_wikipedia_pageviews(url, timeout=30):
    """GET `url` and return the decoded JSON body.

    Args:
        url: Fully constructed pageviews URL.
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without it a stalled connection blocks forever.

    Returns:
        Whatever `Response.json()` decodes from the response body.
    """
    res = requests.get(url, timeout=timeout)
    return res.json()
def wikipedia_pageviews(handle, period, start, end):
    """Build the URL for the given article/period/range and return its JSON."""
    return query_wikipedia_pageviews(
        url=construct_url(handle, period, start, end)
    )

In [None]:
def create_wikipedia_df(handles):
    """Build a long-format DataFrame of pageviews.

    `handles` maps a display name to a Wikipedia handle. Each handle is
    looked up with wikipedia_2016(); handles for which it returns None are
    skipped. Every entry of the response's 'items' list becomes one row with
    the columns names, wikipedia_handles, pageviews and timestamps.
    """
    columns = {
        "names": [],
        "wikipedia_handles": [],
        "pageviews": [],
        "timestamps": [],
    }
    for name, handle in handles.items():
        response = wikipedia_2016(handle)
        if response is None:
            continue
        for item in response['items']:
            columns["pageviews"].append(item['views'])
            columns["timestamps"].append(item['timestamp'])
            columns["names"].append(name)
            columns["wikipedia_handles"].append(handle)
    return pd.DataFrame(columns)

In [None]:
def create_wikipedia_handle(raw_handle):
    """Return `raw_handle` with every space replaced by an underscore."""
    return "_".join(raw_handle.split(" "))
def create_wikipedia_nba_handle(name):
    """Return `name` followed by a space and the "(basketball)" suffix."""
    return name + " " + "(basketball)"
def wikipedia_current_nba_roster():
    """Map every link title on the current-rosters page to its handle.

    Fetches the "List_of_current_NBA_team_rosters" page and returns a dict of
    link title -> create_wikipedia_handle(link title).
    """
    roster_page = wikipedia.page("List_of_current_NBA_team_rosters")
    return {title: create_wikipedia_handle(title) for title in roster_page.links}

In [None]:
def guess_wikipedia_nba_handle(data="data/nba_2017_br.csv"):
    """Attempt to get the correct wikipedia handle for every player.

    Args:
        data: Path to a CSV with a "Player" column.

    Returns:
        A ``(verified, guesses)`` tuple of dicts keyed by player name.
        ``verified`` holds handles found among the links of the current
        roster page; ``guesses`` holds handles derived from the player name
        for everyone who was not found there.
    """
    links = wikipedia_current_nba_roster()
    nba = pd.read_csv(data)
    count = 0
    verified = {}
    guesses = {}
    for player in nba["Player"].values:
        if player in links:
            print("Player: {player}, Link: {link} ".format(
                player=player, link=links[player]))
            print(count)
            count += 1
            verified[player] = links[player]  # add wikipedia link
        else:
            print("NO MATCH: {player}".format(player=player))
            guesses[player] = create_wikipedia_handle(player)
    # The caller unpacks two values; without this return it received None.
    return verified, guesses

In [None]:
def validate_wikipedia_guesses(guesses):
    """Validate guessed wikipedia accounts.

    For each ``name -> link`` guess the page is fetched by ``link``; if that
    raises a disambiguation or page error, the "<name> (basketball)" title is
    tried instead. A guess counts as verified when the resolved page's
    summary contains "NBA".

    Args:
        guesses: Dict of player name -> guessed wikipedia handle.

    Returns:
        A ``(verified, wrong)`` tuple of dicts keyed by player name.
        ``verified`` maps to the handle that actually resolved (the
        "(basketball)" handle when the fallback was needed); ``wrong`` maps
        to the original guess.
    """
    verified = {}
    wrong = {}
    for name, link in guesses.items():
        resolved_handle = link
        try:
            page = wikipedia.page(link)
        except (wikipedia.DisambiguationError,
                wikipedia.PageError) as error:
            # try basketball suffix
            nba_handle = create_wikipedia_nba_handle(name)
            try:
                page = wikipedia.page(nba_handle)
                print("Initial wikipedia URL Failed:{error}".format(error=error))
            except (wikipedia.DisambiguationError,
                    wikipedia.PageError) as fallback_error:
                print("Second Match Failure: {error}".format(
                    error=fallback_error))
                wrong[name] = link
                continue
            # The original guess just failed to resolve; record the handle
            # of the page that was actually found instead of the failed one.
            resolved_handle = create_wikipedia_handle(nba_handle)
        if "NBA" in page.summary:
            verified[name] = resolved_handle
        else:
            print("NO GUESS MATCH: {name}".format(name=name))
            wrong[name] = link
    return verified, wrong

In [None]:
def clean_wikipedia_handles(data="data/nba_2017_br.csv"):
    """Return the verified handles for the players listed in `data`.

    Combines the handles verified directly by guess_wikipedia_nba_handle()
    with the guesses that pass validate_wikipedia_guesses(); on a key clash
    the validated guess wins. Rejected guesses are only printed.
    """
    roster_verified, unresolved = guess_wikipedia_nba_handle(data=data)
    guess_verified, wrong = validate_wikipedia_guesses(unresolved)
    print("WRONG Matches: {wrong}".format(wrong=wrong))
    handles = dict(roster_verified)
    handles.update(guess_verified)
    return handles
def nba_wikipedia_dataframe(data="data/nba_2017_br.csv"):
    """Clean the handles for `data` and return their pageviews DataFrame."""
    return create_wikipedia_df(clean_wikipedia_handles(data=data))
def create_wikipedia_csv(data="data/nba_2017_br.csv"):
    """Write the pageviews DataFrame for `data` to data/wikipedia_nba.csv."""
    nba_wikipedia_dataframe(data=data).to_csv("data/wikipedia_nba.csv")
# An `if` with no body is a SyntaxError; run the CSV export as the entry point.
# NOTE(review): the guard's original body is not visible - confirm that
# exporting the CSV is the intended action.
if __name__ == "__main__":
    create_wikipedia_csv()

In [None]:
import time
import twitter
from . import config
import pandas as pd
import numpy as np
from twitter.error import TwitterError

In [None]:
def api_handler():
    """Create a twitter.Api client from the four credentials in `config`."""
    credentials = {
        "consumer_key": config.CONSUMER_KEY,
        "consumer_secret": config.CONSUMER_SECRET,
        "access_token_key": config.ACCESS_TOKEN_KEY,
        "access_token_secret": config.ACCESS_TOKEN_SECRET,
    }
    return twitter.Api(**credentials)
def tweets_by_user(api, user, count=200):
    """Return the user's timeline via `api`, asking for `count` tweets (default 200)."""
    return api.GetUserTimeline(screen_name=user, count=count)
def stats_to_df(tweets):
    """Turn an iterable of tweets into a DataFrame with one row per tweet.

    Each row holds the tweet's created_at, its user's screen_name, and its
    retweet_count and favorite_count.
    """
    rows = [
        {
            "created_at": tweet.created_at,
            "screen_name": tweet.user.screen_name,
            "retweet_count": tweet.retweet_count,
            "favorite_count": tweet.favorite_count,
        }
        for tweet in tweets
    ]
    return pd.DataFrame(data=rows)
def stats_df(user):
    """Fetch `user`'s tweets with a fresh API client and return them as a DataFrame."""
    return stats_to_df(tweets_by_user(api_handler(), user))

In [None]:
# Fetch the tweet stats for the "KingJames" account (calls the Twitter API
# through stats_df) and display summary statistics.
df = stats_df(user="KingJames")
df.describe()

In [None]:
def twitter_handles(sleep=.5, data="data/twitter_nba_combined.csv"):
    """Yield one stats DataFrame per entry in the CSV's "twitter_handle" column.

    Sleeps `sleep` seconds before every lookup. If stats_df() raises
    TwitterError for a handle, the error is printed and None is yielded
    for that handle.
    """
    roster = pd.read_csv(data)
    for screen_name in roster["twitter_handle"]:
        time.sleep(sleep)  # Avoid throttling in twitter api
        try:
            stats = stats_df(screen_name)
        except TwitterError as error:
            print("Error {handle} and error msg {error}".format(
                handle=screen_name, error=error))
            stats = None
        yield stats
def median_engagement(data="data/twitter_nba_combined.csv"):
    """Add median favorite/retweet counts per handle to the CSV's frame.

    Reads `data`, walks twitter_handles(data=data) and records, for each
    yielded stats frame, the median of its favorite_count and retweet_count
    columns. NaN is recorded when the record is None or a column is missing.
    Returns the frame with twitter_favorite_count and twitter_retweet_count
    columns added.
    """
    nba = pd.read_csv(data)
    favorite_medians, retweet_medians = [], []
    for stats in twitter_handles(data=data):
        print(stats)
        if stats is not None:
            try:
                favorite_medians.append(stats['favorite_count'].median())
                retweet_medians.append(stats["retweet_count"].median())
            except KeyError as error:
                print("No values found to append {error}".format(error=error))
                favorite_medians.append(np.nan)
                retweet_medians.append(np.nan)
        else:
            # None records stored as NaN value
            print("NO RECORD: {record}".format(record=stats))
            favorite_medians.append(np.nan)
            retweet_medians.append(np.nan)
    print("Creating DF")
    nba['twitter_favorite_count'] = favorite_medians
    nba['twitter_retweet_count'] = retweet_medians
    return nba

In [None]:
def create_twitter_csv(data="data/nba_2016_2017_wikipedia.csv"):
    """Write median_engagement(data) to data/nba_2016_2017_wikipedia_twitter.csv."""
    median_engagement(data).to_csv("data/nba_2016_2017_wikipedia_twitter.csv")

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
color = sns.color_palette()
from IPython.core.display import display, HTML
display(HTML("<style>.container\
{ width:100% !important; }</style>"))
%matplotlib inline
<IPython.core.display.HTML object>

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
color = sns.color_palette()
from IPython.core.display import display, HTML
display(HTML("<style>.container\{ width:100% !important; }</style>"))
%matplotlib inline
<IPython.core.display.HTML object>

In [None]:
# Player stats: rename to the upper-case column convention, drop the columns
# that are not carried forward, then attach the plus/minus stats per player.
# Built from br_stats_df without mutating it, so the cell can be re-run.
nba_players_df = (
    br_stats_df
    .rename(columns={'Player': 'PLAYER', 'Pos': 'POSITION', 'Tm': "TEAM",
                     'Age': 'AGE', "PS/G": "POINTS"})
    .drop(["G", "GS", "TEAM"], axis=1)
    .merge(plus_minus_df, how="inner", on="PLAYER")
)

# Attach only the PIE columns of interest.
pie_df_subset = pie_df[["PLAYER", "PIE", "PACE", "W"]].copy()
nba_players_df = nba_players_df.merge(pie_df_subset, how="inner", on="PLAYER")

# Salary: align the key column name, express salary in millions, and drop the
# columns that are no longer needed.
# NOTE: salary_df is modified in place, so this part only works once per
# freshly loaded salary_df.
salary_df.rename(columns={'NAME': 'PLAYER'}, inplace=True)
salary_df["SALARY_MILLIONS"] = round(salary_df["SALARY"] / 1000000, 2)
salary_df.drop(["POSITION", "TEAM", "SALARY"], inplace=True, axis=1)
salary_df.head()

In [None]:
# Players that appear in the stats frame but have no salary row.
diff = list(
    set(nba_players_df["PLAYER"].values.tolist())
    - set(salary_df["PLAYER"].values.tolist())
)
len(diff)

# Merge on the columns the two frames share.
nba_players_with_salary_df = nba_players_df.merge(salary_df)

In [None]:
# Inspect the merged frame's columns. Only the last expression of a cell is
# displayed, so the column count below is what is shown.
nba_players_with_salary_df.columns
len(nba_players_with_salary_df.columns)

In [None]:
wiki_df = pd.read_csv("../data/nba_2017_player_wikipedia.csv")
wiki_df.rename(columns={'names': 'PLAYER', "pageviews": "PAGEVIEWS"}, inplace=True)

# Median per player. numeric_only=True keeps the aggregation from failing on
# any text columns in the file.
median_wiki_df = wiki_df.groupby("PLAYER").median(numeric_only=True)

# Keep just the pageviews and turn the PLAYER index back into a column so
# the frame can be merged on it.
median_wiki_df_small = median_wiki_df[["PAGEVIEWS"]].reset_index(level=0)
median_wiki_df_small.head()

In [None]:
twitter_df = pd.read_csv("../data/nba_2017_twitter_players.csv")

# NOTE(review): no visible cell defines nba_players_with_salary_wiki_df.
# It is built here from the salary frame and the per-player median
# pageviews - confirm this is the intended join.
nba_players_with_salary_wiki_df = nba_players_with_salary_df.merge(
    median_wiki_df_small
)
nba_players_with_salary_wiki_twitter_df = nba_players_with_salary_wiki_df.merge(
    twitter_df
)

In [None]:
# Number of columns in the merged salary/wikipedia/twitter frame.
len(nba_players_with_salary_wiki_twitter_df.columns)

In [None]:
# Create the figure and its Axes together and draw on that Axes, instead of
# calling plt.axes() on top of the one plt.subplots() already made.
fig, ax = plt.subplots(figsize=(20, 15))
ax.set_title("NBA Player Correlation Heatmap")

# The frame contains text columns such as PLAYER; pandas >= 2 raises on
# them unless the correlation is restricted to numeric columns.
corr = nba_players_with_salary_wiki_twitter_df.corr(numeric_only=True)
sns.heatmap(
    corr,
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
    ax=ax,
);

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Numeric features used for clustering; rows with any missing value are
# dropped, so this frame can be shorter than the source frame.
numerical_df = nba_players_with_salary_wiki_twitter_df.loc[
    :,
    ["AGE", "TRB", "AST", "STL", "TOV", "BLK", "PF", "POINTS",
     "MPG", "WINS_RPM", "W", "SALARY_MILLIONS", "PAGEVIEWS",
     "TWITTER_FAVORITE_COUNT"],
].dropna()

# Fit the min-max scaler, then print the fitted scaler and the scaled matrix.
scaler = MinMaxScaler()
print(scaler.fit(numerical_df))
print(scaler.transform(numerical_df))

In [None]:
from sklearn.cluster import KMeans

# Fix the seed: k-means starts from random centroids, so without it the
# cluster ids written to the CSV change from run to run.
k_means = KMeans(n_clusters=5, random_state=0)
kmeans = k_means.fit(scaler.transform(numerical_df))

# numerical_df had rows removed by dropna(), so there can be fewer labels
# than rows in the full frame. Align the labels on numerical_df's index;
# rows that were dropped get NaN instead of raising a length mismatch.
nba_players_with_salary_wiki_twitter_df['cluster'] = pd.Series(
    kmeans.labels_, index=numerical_df.index
)
nba_players_with_salary_wiki_twitter_df.to_csv("../data/nba_2017_players_social_with_clusters.csv")

In [None]:
nba_players_with_salary_wiki_twitter_df.to_csv("../data/nba_2017_players_social_with_clusters.csv")
endorsements = pd.read_csv("../data/nba_2017_endorsement_full_stats.csv")

# Create the figure and its Axes together and draw on that Axes.
fig, ax = plt.subplots(figsize=(20, 15))
ax.set_title(
    "NBA Player Endorsement, Social Power, On-Court Performance, "
    "Team Valuation Correlation Heatmap: 2016-2017 Season"
)

# The file has a text PLAYER column; restrict the correlation to numeric
# columns (required on pandas >= 2).
corr = endorsements.corr(numeric_only=True)
sns.heatmap(
    corr,
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
    cmap="copper",
    ax=ax,
);

In [None]:
from matplotlib.colors import LogNorm

# Show floats with three decimals in pandas output.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Create the figure and its Axes together and draw on that Axes.
fig, ax = plt.subplots(figsize=(20, 15))
# Logarithmic colour scale for the heatmap cells.
norm = LogNorm()
# Numeric columns only; player names are supplied separately as row labels.
grid = endorsements.select_dtypes([np.number])
ax.set_title(
    "NBA Player Endorsement, Social Power, On-Court Performance, "
    "Team Valuation Heatmap: 2016-2017 Season"
)
sns.heatmap(
    grid,
    annot=True,
    yticklabels=endorsements["PLAYER"],
    fmt='g',
    cmap="Accent",
    cbar=False,
    norm=norm,
    ax=ax,
);