In [None]:
                                        #baseball project

In [35]:
import pandas as pd

# Location of the baseball CSV (a raw GitHub URL, fetched over HTTP).
url = "https://raw.githubusercontent.com/FlipRoboTechnologies/ML_-Datasets/main/Baseball/baseball.csv"

# Read the whole file into a DataFrame; the following cells work on `baseball_data`.
baseball_data = pd.read_csv(url)

# Preview: a heading line followed by the first five rows.
print("First few rows of the dataset:", baseball_data.head(), sep="\n")

First few rows of the dataset:
    W    R    AB     H   2B  3B   HR   BB    SO   SB   RA   ER   ERA  CG  SHO  \
0  95  724  5575  1497  300  42  139  383   973  104  641  601  3.73   2    8   
1  83  696  5467  1349  277  44  156  439  1264   70  700  653  4.07   2   12   
2  81  669  5439  1395  303  29  141  533  1157   86  640  584  3.67  11   10   
3  76  622  5533  1381  260  27  136  404  1231   68  701  643  3.98   7    9   
4  74  689  5605  1515  289  49  151  455  1259   83  803  746  4.64   7   12   

   SV    E  
0  56   88  
1  45   86  
2  38   79  
3  37  101  
4  35   86  


In [36]:
# Count the null entries in each column and print the counts under a heading.
null_counts = baseball_data.isna().sum()
print("\nMissing values in the dataset:", null_counts, sep="\n")


Missing values in the dataset:
W      0
R      0
AB     0
H      0
2B     0
3B     0
HR     0
BB     0
SO     0
SB     0
RA     0
ER     0
ERA    0
CG     0
SHO    0
SV     0
E      0
dtype: int64


In [37]:
# describe() reports count, mean, std, min, quartiles and max per numeric column.
summary_stats = baseball_data.describe()
print("\nSummary statistics of numerical features:", summary_stats, sep="\n")


Summary statistics of numerical features:
                W           R           AB            H          2B  \
count   30.000000   30.000000    30.000000    30.000000   30.000000   
mean    80.966667  688.233333  5516.266667  1403.533333  274.733333   
std     10.453455   58.761754    70.467372    57.140923   18.095405   
min     63.000000  573.000000  5385.000000  1324.000000  236.000000   
25%     74.000000  651.250000  5464.000000  1363.000000  262.250000   
50%     81.000000  689.000000  5510.000000  1382.500000  275.500000   
75%     87.750000  718.250000  5570.000000  1451.500000  288.750000   
max    100.000000  891.000000  5649.000000  1515.000000  308.000000   

              3B          HR          BB          SO          SB          RA  \
count  30.000000   30.000000   30.000000    30.00000   30.000000   30.000000   
mean   31.300000  163.633333  469.100000  1248.20000   83.500000  688.233333   
std    10.452355   31.823309   57.053725   103.75947   22.815225   72.108005 

In [38]:
# Import necessary libraries for modeling

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error

# Target is the 'W' column; the features are every remaining column.
y = baseball_data['W']
X = baseball_data.drop('W', axis=1)

# Hold out 20% of the rows for testing. The fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [39]:
# Baseline: a seeded random forest with otherwise default settings, trained on
# the training split (fit() returns the estimator itself, so it can be chained).
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and report the mean squared error against y_test.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("\nMean Squared Error on the test set:", mse)


Mean Squared Error on the test set: 51.967916666666675


In [None]:
                                            #hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the random forest: three candidate values for each of four
# hyperparameters, i.e. 3 * 3 * 3 * 3 = 81 combinations.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)


In [40]:
# Base estimator for the search; the fixed seed makes every candidate fit repeatable.
model = RandomForestRegressor(random_state=42)

# Exhaustive search over param_grid using 5-fold cross-validation on the
# training split. With scoring='neg_mean_squared_error' the candidate with the
# lowest cross-validated MSE is selected. refit=True (the default, spelled out
# here) retrains that winning configuration on all of X_train / y_train once
# the search is done.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    refit=True,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination.
best_params = grid_search.best_params_
print("\nBest Hyperparameters:", best_params)

# best_estimator_ is already fitted on the full training split because of
# refit=True, so it is used directly; fitting it a second time would only
# repeat the same training run.
best_model = grid_search.best_estimator_

# Predict on the test set using the final model
y_pred_final = best_model.predict(X_test)




Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


In [41]:
# Mean squared error of the tuned model's predictions on the held-out rows.
mse_final = mean_squared_error(y_true=y_test, y_pred=y_pred_final)
print("\nMean Squared Error on the test set (final model):", mse_final)



Mean Squared Error on the test set (final model): 58.03666666666668


In [None]:
                                    #cross validation

In [43]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Candidate regressors keyed by the name shown in the report. The tree-based
# models draw random numbers while fitting, so they are seeded (42, the seed
# used throughout this notebook) to make the reported scores repeatable.
# NOTE(review): SVR and KNN are fit on unscaled features here; consider
# wrapping them in a scaling pipeline before comparing them with the others.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor()
}

# 5-fold cross-validation on the training split for each candidate.
# The loop variable is deliberately not named `model`, so the estimator bound
# to that name by the earlier cells is not overwritten as a side effect.
for model_name, estimator in models.items():
    neg_mse_scores = cross_val_score(estimator, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    cv_scores = -neg_mse_scores  # the scorer returns negated MSE; flip the sign back
    mean_cv_score = cv_scores.mean()
    std_cv_score = cv_scores.std()
    print(f"{model_name} - Cross-Validation Mean Squared Error: {mean_cv_score}, Standard Deviation: {std_cv_score}")


Linear Regression - Cross-Validation Mean Squared Error: 620.717061139371, Standard Deviation: 1029.9813695143064
Decision Tree - Cross-Validation Mean Squared Error: 124.75999999999999, Standard Deviation: 79.50792664885684
Random Forest - Cross-Validation Mean Squared Error: 46.74389699999998, Standard Deviation: 26.355675604097023
SVR - Cross-Validation Mean Squared Error: 106.40779048455019, Standard Deviation: 41.337556302963556
KNN - Cross-Validation Mean Squared Error: 61.40480000000002, Standard Deviation: 31.669756161991533
