In [None]:
import pandas as pd
import numpy as np
import requests
import re
import urllib.parse
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import time
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import pandas.plotting as pd_plotting
import plotly.express as px

In [None]:
# Load the board-game dataset from the CSV that sits next to this notebook.
boardGames = pd.read_csv(filepath_or_buffer="boardgamesData.csv")

In [None]:
# Ordered levels for each categorical column, listed from lowest to highest.
# Values that are not in a column's level list become NaN after conversion.
ordered_levels = {
    "GroupSize": ["Individual", "Small", "Large", "Massive"],
    "Time Category": ["Quick", "Short", "Moderate", "Long", "Very Long", "Marathon"],
    "AgeRating": ["Young", "PreTeen", "Teen", "Adult", "Any"],
}
for column, levels in ordered_levels.items():
    boardGames[column] = boardGames[column].astype(
        pd.CategoricalDtype(categories=levels, ordered=True)
    )



In [None]:
# Render the frame as this cell's output so the loaded columns can be inspected.
boardGames

In [None]:
# Scatter of rating count against average price, using every row of the raw frame.
sns.scatterplot(
    x="Average USD Price",
    y="Number of Ratings",
    data=boardGames,
)

Create data frames without outliers for better visualization

In [None]:

# Each frame below is a copy of its parent with a fixed set of row labels removed.
# The labels are hard-coded; presumably they were picked by inspecting the plots
# and index lookups elsewhere in this notebook -- TODO confirm if the CSV changes.

# General-purpose frame: four rows dropped from the raw data.
boardgamesnew = boardGames.drop(labels=[156, 51, 93, 200], axis="index")

# Frame for player-count plots: a different four rows dropped from the raw data.
boardgamesplayers = boardGames.drop(labels=[131, 164, 473, 823], axis="index")

# Frame for year plots: starts from boardgamesnew and drops six more rows.
boardgamesyears = boardgamesnew.drop(labels=[49, 441, 690, 816, 916, 949], axis="index")

In [None]:
# Alternative plot kept for reference (disabled):
#sns.scatterplot(data=boardGames, x = "Bayes Rating", y = "Average USD Price")
#plt.savefig("EDAPlots/PriceByRating.png")

# Number of ratings against publication year, drawn from the year-filtered frame
# and written to disk.
ratings_by_year_ax = sns.scatterplot(
    x="Year Published",
    y="Number of Ratings",
    data=boardgamesyears,
)
ratings_by_year_ax.set_title("Number of Ratings by Year Published")
plt.savefig("EDAPlots/PopularitybyYear.png")

In [None]:
# Price distribution per play-time category, using the frame with the
# hard-coded rows removed; the figure is saved without an explicit extension.
price_by_time_ax = sns.violinplot(
    x="Time Category",
    y="Average USD Price",
    data=boardgamesnew,
)
price_by_time_ax.set_title("Distributions of Price by Length of Time")
plt.savefig("EDAPlots/PriceDistbyTime")

In [None]:
# Rows still priced above 1200 USD after the first round of row removal.
boardgamesnew[boardgamesnew["Average USD Price"].gt(1200)]

In [None]:
# Show the row with the largest "Age (Years)" value.
# The label comes straight from idxmax() instead of a hard-coded index, so the
# lookup stays correct if the CSV is regenerated, and the computed label is no
# longer thrown away (only a cell's last expression is displayed).
oldest_game_label = boardGames["Age (Years)"].idxmax()
boardGames.loc[oldest_game_label]

In [None]:
# Figure-level histogram of how many ratings each game has received.
sns.displot(
    x="Number of Ratings",
    data=boardGames,
)

In [None]:
# Annotated correlation heatmap over the numeric columns of the full data set.
sns.set(font_scale=1.0)
corr_all = boardGames.corr(numeric_only=True)
heatmap_all_ax = sns.heatmap(
    corr_all,
    cmap="YlGnBu",
    annot=True,
    annot_kws={"size": 8},
    square=True,
)
heatmap_all_ax.set(title="Correlation Heatmap")
# Vertical, small x tick labels.
plt.xticks(rotation=90, fontsize=8)
plt.savefig("EDAPlots/correlationmatrix.png")

In [None]:
# Same annotated correlation heatmap, computed on boardgamesnew (the frame
# with four hard-coded rows removed) and saved under a separate file name.
sns.set(font_scale=1.0)
corr_new = boardgamesnew.corr(numeric_only=True)
heatmap_new_ax = sns.heatmap(
    corr_new,
    cmap="YlGnBu",
    annot=True,
    annot_kws={"size": 8},
    square=True,
)
heatmap_new_ax.set(title="Correlation Heatmap")
# Vertical, small x tick labels.
plt.xticks(rotation=90, fontsize=8)
plt.savefig("EDAPlots/correlationmatrixnew.png")

In [None]:
# 2x2 grid of violin plots of "Number of Ratings": one overall panel and one
# panel per categorical grouping. Each entry is
# (target axes, grouping column or None, palette, panel title).
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(9, 6))
violin_panels = [
    (axes[0, 0], None, "Greens",
     "Distributions of Number of Ratings of Top 1000 Board Games"),
    (axes[1, 0], "Time Category", "Blues",
     "Distributions of Number of Ratings by Length of Time"),
    (axes[0, 1], "GroupSize", "Reds",
     "Distributions of Number of Ratings by Size of Group"),
    (axes[1, 1], "AgeRating", "Oranges",
     "Distributions of Number of Ratings by Minimum Age"),
]
for panel_ax, group_col, colours, heading in violin_panels:
    sns.violinplot(
        data=boardGames,
        x=group_col,
        y="Number of Ratings",
        palette=colours,
        ax=panel_ax,
    )
    panel_ax.set_title(heading)

# Tighten spacing so titles and tick labels do not overlap, then show and save.
plt.tight_layout()
plt.show()
fig.savefig("EDAPlots/ViolinPlots.png")

In [None]:
# Bayes rating distribution for each group-size category; the figure is saved
# without an explicit file extension.
rating_by_size_ax = sns.violinplot(
    x="GroupSize",
    y="Bayes Rating",
    palette="Reds",
    data=boardGames,
)
rating_by_size_ax.set_title("Distributions of Rating by Size of Group")
plt.savefig("EDAPlots/RatingDistbySize")

In [None]:
# Side-by-side count plots for the three ordered categorical columns.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 6))

sns.histplot(data=boardGames, x="GroupSize", ax=axes[0])
axes[0].set_title("Counts of Each Player Group Size Category", fontsize=11)

sns.histplot(data=boardGames, x="Time Category", ax=axes[1])
# Rotate the tick labels that the plot already produced instead of overwriting
# them with a hand-typed copy of the category list: set_xticklabels without
# fixed tick positions triggers a matplotlib warning and can silently mislabel
# bars if the categories ever change.
axes[1].tick_params(axis="x", labelrotation=45)
axes[1].set_title("Counts of Each Time Length Category", fontsize=11)

sns.histplot(data=boardGames, x="AgeRating", ax=axes[2])
axes[2].set_title("Counts of Each Minimum Age Category", fontsize=11)

# Tighten spacing so titles and tick labels do not overlap, then show and save.
plt.tight_layout()
plt.show()
fig.savefig("EDAPlots/CatHists.png")

In [None]:
# Explicit left-to-right order for the group-size categories on the x axis.
size_order = ["Individual", "Small", "Large", "Massive"]

# Distribution of the "Standard Deviation" column per group size, drawn from
# boardgamesnew; the figure is saved without an explicit file extension.
stddev_by_size_ax = sns.violinplot(
    x="GroupSize",
    y="Standard Deviation",
    order=size_order,
    palette="Reds",
    data=boardgamesnew,
)
stddev_by_size_ax.set_title("Distributions of Standard Deviation by Size of Group")
plt.savefig("EDAPlots/RatingVariancebySize")

In [None]:
# Build a separate frame whose missing prices are replaced by the mean price.
# .copy() is required: a plain assignment only creates a second name for the
# same object, so the imputation would also overwrite boardgamesnew and change
# every plot or statistic computed from it afterwards (and on re-runs).
# NOTE(review): cells that read boardgamesnew now keep seeing the original
# NaN prices -- confirm that is what each of them expects.
boardgamesnew_filled = boardgamesnew.copy()
boardgamesnew_filled["Average USD Price"] = boardgamesnew["Average USD Price"].fillna(
    boardgamesnew["Average USD Price"].mean()
)

In [None]:
# Three histograms in one row. Each entry pairs a column with the frame it is
# drawn from; the year and max-player panels use the frames that had
# hard-coded rows removed, the min-player panel uses the raw frame.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 6))
hist_specs = [
    (boardgamesyears, "Year Published"),
    (boardGames, "Min Players"),
    (boardgamesplayers, "Max Players"),
]
for panel_ax, (frame, column) in zip(axes, hist_specs):
    sns.histplot(data=frame, x=column, ax=panel_ax)
    panel_ax.set_title(f"Histogram of {column}", fontsize=11)

# Tighten spacing so titles and tick labels do not overlap, then show and save.
plt.tight_layout()
plt.show()
fig.savefig("EDAPlots/ContHistplots1.png")

In [None]:
# Median of the "Age (Years)" column across all games.
boardGames.loc[:, "Age (Years)"].median()

In [None]:
# Three histograms in one row, all drawn from the raw frame.
# Each entry is (column to plot, panel title).
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 6))
hist_specs = [
    ("Playing Time", "Histogram of Playing Time"),
    ("Age Minimum", "Histogram of Min Age"),
    ("Number of Accessories", "Histogram of Number of Accessories"),
]
for panel_ax, (column, heading) in zip(axes, hist_specs):
    sns.histplot(data=boardGames, x=column, ax=panel_ax)
    panel_ax.set_title(heading, fontsize=11)

# Tighten spacing so titles and tick labels do not overlap, then show and save.
plt.tight_layout()
plt.show()
fig.savefig("EDAPlots/ContHistplots2.png")

In [None]:
# Medians of playing time and minimum age, shown together.
# Computing them as two separate bare expressions displays only the last one,
# so both columns are selected and summarised in a single expression.
boardGames[["Playing Time", "Age Minimum"]].median()

In [None]:
# 2x2 grid of histograms, filled row by row. Each entry is
# (source frame, column to plot, panel title); the price panel is the only one
# drawn from boardgamesnew rather than the raw frame.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 6))
hist_specs = [
    (boardGames, "Number of Ratings", "Histogram of Number of Ratings"),
    (boardGames, "Average Rating", "Histogram of Average Rating"),
    (boardGames, "Bayes Rating", "Histogram of Bayes Rating"),
    (boardgamesnew, "Average USD Price", "Histogram of Average price (USD)"),
]
for panel_ax, (frame, column, heading) in zip(axes.flat, hist_specs):
    sns.histplot(data=frame, x=column, ax=panel_ax)
    panel_ax.set_title(heading, fontsize=11)

# Tighten spacing so titles and tick labels do not overlap, then show and save.
plt.tight_layout()
plt.show()
fig.savefig("EDAPlots/ContHistplots3.png")

In [None]:
# Ordinary least squares fit with an intercept.
# Columns are selected by position, so the model silently changes if the CSV
# column order changes -- NOTE(review): confirm which columns 2, 4-7, 11
# (predictors) and 9 (response) actually are.
predictor_positions = [2, 4, 5, 6, 7, 11]
X1 = boardgamesnew_filled.iloc[:, predictor_positions]
# The response is read from the same frame as the predictors.
Y = boardgamesnew_filled.iloc[:, 9]
# statsmodels does not add an intercept on its own; prepend a constant column.
X = sm.add_constant(X1)
model = sm.OLS(endog=Y, exog=X).fit()
print(model.summary())

In [None]:
# Row labels of games whose publication year is before 1950.
boardGames[boardGames["Year Published"].lt(1950)].index

In [None]:
# Row labels of games that list more than 80 maximum players.
boardGames[boardGames["Max Players"].gt(80)].index

In [None]:
# Group the games by publication year; the grouped object is only stored here,
# no aggregation is applied in this cell.
test = boardGames.groupby(by="Year Published")
