# Step 4: Model Training - BaselineModel

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Load data
yelp_df_business = pd.read_csv('../data/processed_data/train_data_business.csv')
yelp_df_business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,Fast Food,BusinessAcceptsCreditCards,BikeParking,RestaurantsPriceRange2,BusinessParking,garage,street,validated,lot,valet
0,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,...,0,True,True,2,"{'garage': False, 'street': False, 'validated'...",False,False,False,True,False
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,0,False,True,1,"{'garage': False, 'street': True, 'validated':...",False,True,False,False,False
2,n_0UpQx1hsNbnPUSlodU8w,Famous Footwear,"8522 Eager Road, Dierbergs Brentwood Point",Brentwood,MO,63144,38.627695,-90.340465,2.5,13,...,0,True,True,2,"{'garage': False, 'street': False, 'validated'...",False,False,False,True,False
3,UJsufbvfyfONHeWdvAHKjA,Marshalls,21705 Village Lakes Sc Dr,Land O' Lakes,FL,34639,28.190459,-82.45738,3.5,6,...,0,True,True,2,"{'garage': False, 'street': False, 'validated'...",False,False,False,False,False
4,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,...,0,True,True,1,"{'garage': False, 'street': False, 'validated'...",False,False,False,True,False


In [3]:
# Print all column names to check what's available
print(yelp_df_business.columns)


Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'composite_rating',
       'operational_hours', 'Restaurants', 'Food', 'Shopping', 'Beauty & Spas',
       'Home Services', 'Nightlife', 'Bars', 'Health & Medical',
       'Local Services', 'Event Planning & Services', 'Automotive',
       'Sandwiches', 'American (Traditional)', 'Coffee & Tea', 'Pizza',
       'Fast Food', 'BusinessAcceptsCreditCards', 'BikeParking',
       'RestaurantsPriceRange2', 'BusinessParking', 'garage', 'street',
       'validated', 'lot', 'valet'],
      dtype='object')


In [26]:
# Selecting all numerical columns as predictors
X = yelp_df_business.select_dtypes(include=['float64', 'int64'])

In [27]:
columns_to_drop = ["stars", "composite_rating", "review_count"]


In [28]:
# Removing the 'stars' column from the predictors
X = X.drop(columns=columns_to_drop)
y = yelp_df_business.composite_rating

In [29]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [30]:
# Creating the linear regression model
model = LinearRegression()

# Training the model on the training data
model.fit(X_train, y_train)

# Making predictions on the test data
y_pred = model.predict(X_test)

In [31]:
# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 21.83006440832407
R^2 Score: 0.30372394591879626


In [14]:
# Ridge Regression
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
y_ridge_pred = ridge.predict(X_test)

mse_ridge = mean_squared_error(y_test, y_ridge_pred)
r2_ridge = r2_score(y_test, y_ridge_pred)

print("\nRidge Regression:")
print(f"Mean Squared Error: {mse_ridge}")
print(f"R^2 Score: {r2_ridge}")



Ridge Regression:
Mean Squared Error: 0.5331043522368498
R^2 Score: 0.1787272138397682


In [15]:
# Lasso Regression
lasso = Lasso(alpha=1.0)
lasso.fit(X_train, y_train)
y_lasso_pred = lasso.predict(X_test)

mse_lasso = mean_squared_error(y_test, y_lasso_pred)
r2_lasso = r2_score(y_test, y_lasso_pred)

print("\nLasso Regression:")
print(f"Mean Squared Error: {mse_lasso}")
print(f"R^2 Score: {r2_lasso}")



Lasso Regression:
Mean Squared Error: 0.5859817802804005
R^2 Score: 0.09726700352240769


In [16]:
# ElasticNet
elastic_net = ElasticNet(alpha=1.0, l1_ratio=0.5)
elastic_net.fit(X_train, y_train)
y_elastic_pred = elastic_net.predict(X_test)

mse_elastic = mean_squared_error(y_test, y_elastic_pred)
r2_elastic = r2_score(y_test, y_elastic_pred)

print("\nElasticNet:")
print(f"Mean Squared Error: {mse_elastic}")
print(f"R^2 Score: {r2_elastic}")



ElasticNet:
Mean Squared Error: 0.5848522525412345
R^2 Score: 0.09900709510015926


In [17]:
# Support Vector Regression
svr = SVR(kernel='rbf')
svr.fit(X_train, y_train)
y_svr_pred = svr.predict(X_test)

mse_svr = mean_squared_error(y_test, y_svr_pred)
r2_svr = r2_score(y_test, y_svr_pred)

print("\nSupport Vector Regression:")
print(f"Mean Squared Error: {mse_svr}")
print(f"R^2 Score: {r2_svr}")


Support Vector Regression:
Mean Squared Error: 0.5095049267756051
R^2 Score: 0.2150832589911802
