In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import numpy as np

# Load data
# Directory holding the CSV exports; kept in one place so the notebook can be
# pointed at another copy of the dataset by editing a single line.
DATA_DIR = './Dataset'

players = pd.read_csv(f'{DATA_DIR}/players.csv')
teams = pd.read_csv(f'{DATA_DIR}/teams.csv')
players_teams = pd.read_csv(f'{DATA_DIR}/players_teams.csv')
awards_players = pd.read_csv(f'{DATA_DIR}/awards_players.csv')
coaches = pd.read_csv(f'{DATA_DIR}/coaches.csv')
series_post = pd.read_csv(f'{DATA_DIR}/series_post.csv')
teams_post = pd.read_csv(f'{DATA_DIR}/teams_post.csv')


In [None]:
# Overview of the players table: info() prints the column dtypes and non-null
# counts; describe() is the cell's last expression, so its summary statistics
# are rendered as the cell output.
print("PLAYERS:")
players.info()
players.describe()

In [None]:

# Overview of the teams table: info() prints the column dtypes and non-null
# counts; describe() is the cell's last expression, so its summary statistics
# are rendered as the cell output.
print("TEAMS:")
teams.info()
teams.describe()

In [None]:
# Overview of the players_teams table: info() prints the column dtypes and
# non-null counts; describe() is the cell's last expression, so its summary
# statistics are rendered as the cell output.
print("PLAYERS_TEAMS:")
players_teams.info()
players_teams.describe()

In [None]:
# Overview of the awards_players table, preceded by the number of distinct
# values found in its 'award' column.
print("AWARDS_PLAYERS:")
# NOTE(review): despite its name, this variable holds the number of distinct
# values in the 'award' column, not a count of team ids.
number_of_team_ids = awards_players['award'].nunique()
print(number_of_team_ids)
# info() prints dtypes and non-null counts; describe() is the last expression,
# so it is rendered as the cell output.
awards_players.info()
awards_players.describe()

In [None]:
# Overview of the coaches table: info() prints the column dtypes and non-null
# counts; describe() is the cell's last expression, so its summary statistics
# are rendered as the cell output.
print("COACHES:")
coaches.info()
coaches.describe()

In [None]:
# Overview of the series_post table: info() prints the column dtypes and
# non-null counts; describe() is the cell's last expression, so its summary
# statistics are rendered as the cell output.
print("SERIES_POST:")
series_post.info()
series_post.describe()

In [None]:
# Overview of the teams_post table: info() prints the column dtypes and
# non-null counts; describe() is the cell's last expression, so its summary
# statistics are rendered as the cell output.
print("TEAMS_POST:")
teams_post.info()
teams_post.describe()

## Outliers


In [None]:

# Boxplots for the two player attributes at column positions 4 and 5.
fig, axs = plt.subplots(ncols=2, figsize=(10,5))
# zip pairs every attribute with its own axes and stops at the shorter of the
# two sequences, so an axes index can never run past the axes that exist.
for ax, attribute in zip(axs, players.columns[4:6]):
    sns.boxplot(x=players[attribute], ax=ax)

fig.suptitle("Boxplots for Player Attributes", fontsize=12)

# One boxplot per players_teams column from position 8 up to (not including) 45.
stat_columns = players_teams.columns[8:45]
n_grid_cols = 6
# Ceiling division: enough rows for every selected column, and at least one
# row so plt.subplots always receives a valid grid.
n_grid_rows = max(1, -(-len(stat_columns) // n_grid_cols))
fig, axs = plt.subplots(
    nrows=n_grid_rows,
    ncols=n_grid_cols,
    figsize=(23, 5 * n_grid_rows),  # 5 units of height per row (30 for 6 rows)
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
grid_axes = axs.ravel()
for ax, attribute in zip(grid_axes, stat_columns):
    sns.boxplot(x=players_teams[attribute], ax=ax)

# Hide the grid cells that received no boxplot instead of showing empty frames.
for ax in grid_axes[len(stat_columns):]:
    ax.set_visible(False)

plt.show()



---

auxiliary functions:

In [None]:
def plot_two_histograms(df,first_attr,sec_attr):
    """Plot histograms of one or two columns of ``df`` with Sturges-rule bins.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    first_attr : name of the first column.
    sec_attr : name of the second column, or ``None`` to plot only
        ``first_attr``.

    When ``sec_attr`` is ``None`` the histogram is drawn through
    ``sns.histplot`` without an explicit axes and the function returns without
    calling ``plt.show()``. Otherwise a new 1x2 figure is created, both
    histograms are drawn side by side and the figure is shown.
    """

    def sturges_bins(attr):
        # Sturges' rule: ceil(1 + log2(n)), with n the number of observations.
        # Missing values are excluded from n, and an empty column falls back to
        # a single bin because log2(0) is -inf and cannot be converted to int.
        n_obs = int(df[attr].count())
        if n_obs == 0:
            return 1
        return int(np.ceil(1 + np.log2(n_obs)))

    if sec_attr is None:
        sns.histplot(data=df, x=first_attr, bins=sturges_bins(first_attr))
        return

    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 5))

    sns.histplot(data=df, x=first_attr, ax=ax1, bins=sturges_bins(first_attr))
    sns.histplot(data=df, x=sec_attr, ax=ax2, bins=sturges_bins(sec_attr))
    plt.show()

def plot_three_barplots(data, first_attr, sec_attr, thr_attr):
    """Show three side-by-side bar charts of value frequencies.

    Each of the three named columns of ``data`` gets its own panel, titled with
    the column name, whose bars are that column's ``value_counts()``.
    """
    fig, panels = plt.subplots(ncols=3, figsize=(12,8))
    for panel, attr in zip(panels, (first_attr, sec_attr, thr_attr)):
        # Count once per column and reuse for both bar positions and heights.
        frequencies = data[attr].value_counts()
        panel.bar(frequencies.index, frequencies.values)
        panel.set_title(attr)
    plt.show()

## Missing Values

#### Players

Players whose height is 0 also have their other features set to 0 or null.
The column "lastseason" only contains zero values.

In [None]:

# Rows of players whose recorded height is exactly 0.
null_heights = players.loc[players['height'].eq(0)]
print(null_heights)

# Share of cells equal to 0 in every column, expressed as a percentage.
zero_percentages = players.eq(0).mean().mul(100)

# Plot the percentages of zeros
plt.figure(figsize=(10, 6))
zero_axes = zero_percentages.plot.bar(color='skyblue')
zero_axes.set_title('Percentage of Zeros in Players')
zero_axes.set_ylabel('Percentage of Zeros')
plt.show()


In [None]:
# missingno bar chart over the columns of players, used here to inspect
# missing values per column.
msno.bar(players)
# plt.title applies to whichever axes is current after msno.bar returns.
plt.title('Missing Values in Players', fontsize=20)
plt.show()

In [None]:


# Sort the DataFrame by player and year so that each player's rows are
# contiguous and chronological; the single pass below relies on that order.
players_teams = players_teams.sort_values(by=['playerID', 'year', 'tmID'])

# State of the stint currently being followed
current_player = ""
current_team = ""
start_year = 0
team_durations = []

# Walk the rows once. Iterating the three columns directly avoids building a
# Series per row, which is what iterrows() does.
# NOTE(review): a duration is only recorded when a player moves to another
# team, so each player's final stint (and every player who never changed team)
# is absent from team_durations - confirm that this is intended.
for player, team, year in zip(players_teams['playerID'],
                              players_teams['tmID'],
                              players_teams['year']):
    if player != current_player:
        # New player, reset variables
        current_player = player
        current_team = team
        start_year = year
    elif team != current_team:
        # Player changed teams: record the years elapsed since the stint
        # started, then start following the new team
        duration = year - start_year
        team_durations.append(duration)
        current_team = team
        start_year = year

# Create a histogram to visualize the distribution of years on one team
if team_durations:
    shortest = min(team_durations)
    longest = max(team_durations)

    plt.figure(figsize=(12, 6))
    # Bin edges run up to longest + 1 so that every integer duration gets its
    # own bin (edges stopping at `longest` would merge the two largest values
    # and leave no bin at all when every duration is equal). align='left'
    # centres each bar on the integer it counts, matching the x ticks.
    plt.hist(team_durations, bins=range(shortest, longest + 2), align='left', edgecolor='k')
    plt.xlabel('Years on One Team')
    plt.ylabel('Number of Players')
    plt.title('Years players spent on a team')
    plt.xticks(range(shortest, longest + 1))
    plt.show()
else:
    # min()/max() raise on an empty list, so report instead of plotting.
    print("No team changes found; nothing to plot.")


#### Teams

In [None]:

# missingno bar chart over the columns of teams.
msno.bar(teams)
plt.title('Missing Values in Teams', fontsize=20)
plt.show()

# Percentage of cells equal to 0 per column, smallest share first.
zero_percentages = teams.eq(0).mean().mul(100).sort_values()

# Plot the percentages of zeros
plt.figure(figsize=(10, 6))
zero_axes = zero_percentages.plot.bar(color='skyblue')
zero_axes.set_title('Percentage of Zeros in Teams')
zero_axes.set_ylabel('Percentage of Zeros')
plt.show()


### Other tables


In [None]:
# Check for missing values: print the per-column null count of each table
print("---Missing values---")

for label, table in (("AWARDS_PLAYERS:", awards_players), ("COACHES:", coaches)):
    print(label)
    print(table.isna().sum())

In [None]:
# Per-column null counts of players_teams, printed under its label.
print("PLAYERS_TEAMS:", players_teams.isna().sum(), sep="\n")

In [None]:
# Per-column null counts of series_post, printed under its label.
print("SERIES_POST:", series_post.isna().sum(), sep="\n")

### Distribution of Numeric Variables

#### Players

In [None]:
# Function to plot 2 consecutive count plots
def two_countplots(df,first_attr,sec_attr):
    """Draw count plots of two columns side by side, labelling each bar.

    Every bar is annotated just above its top with its height expressed as a
    percentage of the number of rows in ``df``; the figure is then shown.
    """
    fig, (left_panel, right_panel) = plt.subplots(1, 2, figsize=(12, 8))

    sns.countplot(x=first_attr, data=df, ax=left_panel)
    sns.countplot(x=sec_attr, data=df, ax=right_panel)

    for panel in (left_panel, right_panel):
        for bar in panel.patches:
            bar_height = bar.get_height()
            bar_center = bar.get_x() + bar.get_width() / 2
            # Place the label 1% above the bar top so it does not overlap it.
            panel.text(bar_center,
                       bar_height*1.01,
                       f"{bar_height / len(df) * 100:.1f}%",
                       ha="center")

    plt.show()

# Season columns as count plots, then height and weight as histograms.
two_countplots(df=players, first_attr="firstseason", sec_attr="lastseason")
plot_two_histograms(df=players, first_attr="height", sec_attr="weight")


#### Coaches

In [None]:
# Histograms of the coaches' won/lost columns, then post_wins/post_losses.
plot_two_histograms(df=coaches, first_attr="won", sec_attr="lost")
plot_two_histograms(df=coaches, first_attr="post_wins", sec_attr="post_losses")

### Distribution of Categorical Variables

Players

In [None]:
# Count plot of the 'pos' column of players.
pos_axes = sns.countplot(x="pos", data=players)
pos_axes.set_title('Distribution of pos')
plt.show()

# 'college' is only summarised by its number of distinct values.
unique_values_count = players['college'].nunique()
print(f'Number of unique values in college column: {unique_values_count}')

Teams

In [None]:
# Vertical count plots, one figure each: (column, title).
for column, title in (
    ("confID", 'Distribution of confID'),
    ("franchID", 'Distribution of franchID'),
):
    sns.countplot(x=column, data=teams)
    plt.title(title)
    plt.show()

# Horizontal count plots on a larger figure: (column, title, y-axis label).
for column, title, y_label in (
    ("name", "Distribution of name teams", "Name"),
    ("arena", "Distribution of arena", "Arena"),
):
    plt.figure(figsize=(12, 8))
    sns.countplot(y=column, data=teams, palette="colorblind")
    plt.title(title)
    plt.xlabel("Count")
    plt.ylabel(y_label)
    plt.show()

In [None]:
# Horizontal count plot of the 'award' column of awards_players.
plt.figure(figsize=(12, 8))
awards_axes = sns.countplot(y="award", data=awards_players, palette="viridis")
# Labels are set after plotting so they replace the ones seaborn assigns.
awards_axes.set_title("Distribution of Awards")
awards_axes.set_xlabel("Count")
awards_axes.set_ylabel("Award")
plt.show()

#### Correlation Matrix

Correlation matrix to observe the relationship between the target and the continuous features

In [None]:
# Columns entering the correlation matrix: the season, the target ('playoff')
# and the team statistics.
correlation_columns = [
    'year', 'playoff', 'rank',
    'o_fgm', 'o_fga', 'o_ftm', 'o_fta', 'o_3pm', 'o_3pa', 'o_oreb', 'o_dreb',
    'o_reb', 'o_asts', 'o_pf', 'o_stl', 'o_to', 'o_blk', 'o_pts',
    'd_fgm', 'd_fga', 'd_ftm', 'd_fta', 'd_3pm', 'd_3pa', 'd_oreb', 'd_dreb',
    'd_reb', 'd_asts', 'd_pf', 'd_stl', 'd_to', 'd_blk', 'd_pts',
    'tmORB', 'tmDRB', 'tmTRB', 'opptmORB', 'opptmDRB', 'opptmTRB',
    'won', 'lost', 'GP', 'homeW', 'homeL', 'awayW', 'awayL', 'confW', 'confL',
    'min', 'attend',
]

# Encode the target as 0/1. assign() returns a new frame instead of writing
# into a slice of `teams` (which triggers SettingWithCopyWarning), and map()
# produces a numeric column so 'playoff' takes part in the correlation rather
# than staying as object dtype. Any value other than 'N'/'Y' becomes NaN.
df = teams[correlation_columns].assign(
    playoff=teams['playoff'].map({'N': 0, 'Y': 1})
)

# Calculate correlation matrix
correlation_matrix = df.corr()

# Display correlation matrix
plt.figure(figsize=(35, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.3)
plt.show()

## Target Distribution

The dataset is slightly unbalanced

In [None]:
# Count plot of the 'playoff' column of teams (the target).
target_axes = sns.countplot(x='playoff', data=teams)
target_axes.set_title('Target Distribution')

plt.show()

In [None]:
# Team-seasons that reached the playoffs
playoff_teams = teams[teams['playoff'] == 'Y']

# Group by team and count the number of playoff appearances and finals wins
playoff_counts = playoff_teams.groupby('name')['playoff'].count().reset_index(name='playoff_appearances')
finals_wins = playoff_teams[playoff_teams['finals'] == 'W'].groupby('name')['finals'].count().reset_index(name='finals_won')

# Merge the two counts into a single DataFrame; the left join keeps every
# playoff team, including those without a finals win
merge_counts = pd.merge(playoff_counts, finals_wins, on='name', how='left')

# Fill NaN values with 0 for teams that didn't win finals (they have no row in
# finals_wins, so the left join leaves NaN), and restore the integer dtype
merge_counts['finals_won'] = merge_counts['finals_won'].fillna(0).astype(int)

# Plot the data: both series share the same x positions, so the finals wins
# are drawn over the playoff appearances
plt.bar(merge_counts['name'], merge_counts['playoff_appearances'], label='Playoff Appearances')
plt.bar(merge_counts['name'], merge_counts['finals_won'], label='Finals Wins')

plt.xlabel('Team')
plt.ylabel('Count')
plt.title('Playoff Appearances vs Finals Wins')
plt.legend()
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better visibility
plt.tight_layout()

# Show the plot
plt.show()

