In [1]:
pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Firstly, Loading the dataset directly from the file
url = "https://raw.githubusercontent.com/dsrscientist/Data-Science-ML-Capstone-Projects/master/baseball.csv"
df = pd.read_csv(url)

# Will display the first few rows of the datasets
print("Dataset preview:")
print(df.head())

# Selecting the relevant features (as an input) and target variable (as an output)
features = df.iloc[:, 1:-1]  # Excluding the first column (Team) and the last column (Wins)
target = df["W"]

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

# Display the evaluation metrics
print("\nModel Evaluation:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

# Example: Predict the number of wins for a new set of input features
new_input = pd.DataFrame({
    "R": [800],
    "AB": [5500],
    "H": [1500],
    "2B": [300],
    "3B": [20],
    "HR": [200],
    "BB": [600],
    "SO": [1200],
    "SB": [100],
    "RA": [700],
    "ER": [650],
    "ERA": [3.5],
    "CG": [5],
    "SHO": [3],
    "SV": [40],
    "E": [70]
})

# Ensure that the column order of new_input matches that of the training data
new_input = new_input[X_train.columns]

predicted_wins = model.predict(new_input)
print("\nPredicted Wins for the New Input:")
print(predicted_wins[0])

Dataset preview:
    W    R    AB     H   2B  3B   HR   BB    SO   SB   RA   ER   ERA  CG  SHO  \
0  95  724  5575  1497  300  42  139  383   973  104  641  601  3.73   2    8   
1  83  696  5467  1349  277  44  156  439  1264   70  700  653  4.07   2   12   
2  81  669  5439  1395  303  29  141  533  1157   86  640  584  3.67  11   10   
3  76  622  5533  1381  260  27  136  404  1231   68  701  643  3.98   7    9   
4  74  689  5605  1515  289  49  151  455  1259   83  803  746  4.64   7   12   

   SV    E  
0  56   88  
1  45   86  
2  38   79  
3  37  101  
4  35   86  

Model Evaluation:
Mean Absolute Error (MAE): 3.8768471222721033
Mean Squared Error (MSE): 28.1250072805436
Root Mean Squared Error (RMSE): 5.303301545315295
R-squared (R2): 0.7862570694322208

Predicted Wins for the New Input:
115.22047610449962
