In [51]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

# Read merged_booking_final.csv into a DataFrame.
df = pd.read_csv("merged_booking_final.csv")

# First look at the data. A notebook cell only renders the value of its last
# expression, so the schema summary (which prints by itself) goes first and
# the row preview is left as the final expression so that it is displayed.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45400 entries, 0 to 45399
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   hotel_name                    45385 non-null  object 
 1   star_rating                   45011 non-null  float64
 2   rating_score                  45374 non-null  float64
 3   location_score                26850 non-null  float64
 4   review_amount                 45346 non-null  float64
 5   price                         45387 non-null  object 
 6   breakfast_included            45400 non-null  bool   
 7   free_cancellation             45400 non-null  bool   
 8   no_prepayment_needed          45400 non-null  bool   
 9   centrally_located             45400 non-null  bool   
 10  sustainability_certification  45400 non-null  bool   
 11  checkin                       45400 non-null  object 
 12  checkout                      45400 non-null  object 
 13  g

In [52]:
# Count how many rows have no star_rating value.
star_rating_is_missing = df["star_rating"].isnull()
print("Missing in star_rating:", star_rating_is_missing.sum())

Missing in star_rating: 389


In [53]:
# Column holding the regression target.
target_col = "price_numeric"

# Keep only the rows in which the target is present.
df = df.dropna(subset=[target_col])

# Candidate numeric feature columns.
numeric_cols = [
    "star_rating",
    "rating_score",
    "location_score",
    "review_amount",
    "distance_score",
]

# Boolean feature columns.
bool_cols = [
    "breakfast_included",
    "free_cancellation",
    "no_prepayment_needed",
    "centrally_located",
    "sustainability_certification",
]

# Columns that must not end up in X:
#   target_col          - it is the target itself
#   hotel_name          - free text, not used for modelling
#   price               - the raw version of the price
#   price_clean         - helper column
#   checkin / checkout, group_index
# Only the ones that actually exist in df are kept in the list.
drop_cols = [
    col
    for col in (
        target_col,
        "hotel_name",
        "price",
        "price_clean",
        "checkin",
        "checkout",
        "group_index",
    )
    if col in df.columns
]

# Target vector and feature frame.
y = df[target_col]
X = df.drop(columns=drop_cols, errors="ignore")


In [54]:
# Hold out 30% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [55]:
from sklearn.impute import SimpleImputer

# Numeric features: replace NaN with the column mean, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Boolean features are forwarded unchanged (assumed to already behave as 0/1).
bool_transformer = "passthrough"

# Route each column group to its transformer. Columns that belong to neither
# group are dropped, which is ColumnTransformer's default, spelled out here.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_cols),
        ("bool", bool_transformer, bool_cols),
    ],
    remainder="drop",
)


In [56]:
def evaluate_model(model_name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print its test-split metrics.

    Args:
        model_name: Label printed in the report header.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            (re)fitted by this function.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        None. MSE, RMSE, MAE and R^2 on the test split are printed, followed
        by a 40-character dashed separator line.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # RMSE is derived from the MSE, so the MSE is computed once up front.
    mse = mean_squared_error(y_test, predictions)
    report_lines = [
        f"Model: {model_name}",
        f"  MSE:  {mse:.2f}",
        f"  RMSE: {np.sqrt(mse):.2f}",
        f"  MAE:  {mean_absolute_error(y_test, predictions):.2f}",
        f"  R^2:  {r2_score(y_test, predictions):.3f}",
        "-" * 40,
    ]
    print("\n".join(report_lines))

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline

# Each pipeline gets its own unfitted copy of the preprocessing recipe.
# Pipeline stores the step objects it is given without copying them, so
# passing `preprocessor` itself to both pipelines would make them share a
# single transformer that is refit by whichever pipeline is trained last.

# Pipeline for LinearRegression
pipe_lr = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("regressor", LinearRegression())
])

# Pipeline for DecisionTree (fixed seed for reproducible trees)
pipe_dt = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("regressor", DecisionTreeRegressor(random_state=42))
])

# Fit each model on the training split and report its test-split metrics.
for name, pipeline in [
    ("LinearRegression", pipe_lr),
    ("DecisionTree", pipe_dt)
]:
    evaluate_model(name, pipeline, X_train, X_test, y_train, y_test)


Model: LinearRegression
  MSE:  7088654.85
  RMSE: 2662.45
  MAE:  1836.05
  R^2:  0.065
----------------------------------------
Model: DecisionTree
  MSE:  6496531.48
  RMSE: 2548.83
  MAE:  1762.84
  R^2:  0.143
----------------------------------------


In [57]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the tree; the "regressor__" prefix addresses the
# pipeline step named "regressor".
param_grid_dt = dict(
    regressor__max_depth=[3, 5, 10, None],
    regressor__min_samples_split=[2, 5, 10],
)

# 3-fold cross-validated search over the grid, scored by negative MSE,
# with n_jobs=-1 to run the fits in parallel.
pipe_dt_gs = GridSearchCV(
    estimator=pipe_dt,
    param_grid=param_grid_dt,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
)
pipe_dt_gs.fit(X_train, y_train)

print("Best params for DecisionTree:", pipe_dt_gs.best_params_)

# Evaluate the winning pipeline on the test split
# (evaluate_model fits the estimator on the training split again before scoring).
best_dt = pipe_dt_gs.best_estimator_
evaluate_model("Best DecisionTree", best_dt, X_train, X_test, y_train, y_test)


Best params for DecisionTree: {'regressor__max_depth': 10, 'regressor__min_samples_split': 5}
Model: Best DecisionTree
  MSE:  6440102.35
  RMSE: 2537.74
  MAE:  1766.17
  R^2:  0.151
----------------------------------------
