In [None]:
# -----------------------------
# Import Libraries
# -----------------------------
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Optional: display plots inline
%matplotlib inline

In [None]:
# -----------------------------
# Load NBA Player Stats & Salaries Dataset
# -----------------------------
# Resolve the CSV location from the NBA_DATA_PATH environment variable when it
# is set, so the notebook is not tied to one machine; the original absolute
# path remains the fallback so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    'NBA_DATA_PATH',
    '/Users/yusufshire/Downloads/NBA Player Stats and Salaries_2010-2025 2.csv',
)
df = pd.read_csv(DATA_PATH)

# Preview first 5 rows
print("First 5 rows of data:")
display(df.head())

# Count missing values in the columns the later cleaning step filters on
print("\nMissing values in key columns:")
print(df[['Salary', 'PTS', 'AST', 'ORB', 'DRB', 'Pos', 'Team', 'Age']].isnull().sum())

In [None]:
# -----------------------------
# Data Cleaning: Remove rows with missing key stats
# -----------------------------
# A row is kept only when every one of these columns is present
required_cols = ['Salary', 'PTS', 'AST', 'ORB', 'DRB', 'Pos', 'Team', 'Age']
df = df.dropna(subset=required_cols)

# Report the null count of every column, not just the required ones
print("Remaining nulls per column:")
null_counts = df.isna().sum()
print(null_counts)

In [None]:
# -----------------------------
# Feature Engineering: One-Hot Encode Categorical Columns
# -----------------------------
categorical_cols = ['Team', 'Pos']

# drop='first' omits one indicator per feature (avoiding perfectly collinear
# dummy columns); sparse_output=False returns a dense array
encoder = OneHotEncoder(drop='first', sparse_output=False)

# Encode categorical columns. The frame reuses df's index so each encoded row
# stays label-aligned with its source row: df may have gaps in its index after
# the earlier dropna, and a default RangeIndex would misalign on any
# index-based join/concat.
encoded_array = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(
    encoded_array,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

# Drop original categorical columns + optional 'Player' and 'Season'
cols_to_drop = categorical_cols + [
    col for col in ('Player', 'Season') if col in df.columns
]

# NOTE(review): despite its name, df_numeric keeps every column that is not
# dropped here -- confirm none of the remaining columns are non-numeric.
df_numeric = df.drop(columns=cols_to_drop)

In [None]:
# -----------------------------
# Prepare Features (X) and Target (y)
# -----------------------------
# Stitch numeric and encoded columns together by row position: both frames get
# a fresh 0..n-1 index before the column-wise concat
df_final = pd.concat(
    [part.reset_index(drop=True) for part in (df_numeric, encoded_df)],
    axis=1,
)

# Salary is the target; everything else is a feature. Any NaN left in either
# is replaced with 0.
y = df_final['Salary'].fillna(0)
X = df_final.drop(columns=['Salary']).fillna(0)

# Train/Test Split: 20% held out, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# -----------------------------
# Train Linear Regression Model
# -----------------------------
# fit() returns the estimator itself, so construction and training chain
model = LinearRegression().fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

In [None]:
# -----------------------------
# Model Evaluation: Predicted vs Actual Salary
# -----------------------------
# Quantify the fit on the held-out set; the scatter alone gives no number to
# compare runs by. RMSE is in the same units as Salary.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)
print(f"Test RMSE: {rmse:,.2f}")
print(f"Test R^2: {r2:.3f}")

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.6)

# Dashed red identity line: points on it are perfect predictions
salary_range = [y_test.min(), y_test.max()]
ax.plot(salary_range, salary_range, 'r--')

ax.set_xlabel("Actual Salary")
ax.set_ylabel("Predicted Salary")
ax.set_title("NBA Player Salary Prediction")
ax.grid(True)
plt.show()

In [None]:
# -----------------------------
# Create Experience Level Column
# -----------------------------
# Age is the only input here: 23 or younger is labelled 'Rookie', everything
# else 'Veteran'
is_rookie = df['Age'].le(23)
df['ExperienceLevel'] = is_rookie.map({True: 'Rookie', False: 'Veteran'})

# Average Salary by Experience Level
salary_by_level = df.groupby('ExperienceLevel')['Salary']
avg_salary_by_exp = salary_by_level.mean()
print("Average Salary by Experience Level:")
print(avg_salary_by_exp)

In [None]:
# -----------------------------
# Visualize Average Salary: Rookie vs Veteran
# -----------------------------
# One bar per experience level, drawn from the grouped means
ax = avg_salary_by_exp.plot.bar(
    color=['skyblue', 'salmon'],
    title='Average Salary: Rookie vs Veteran',
)
ax.set_ylabel("Average Salary")
plt.show()