In [1]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, mean_absolute_error, mean_squared_error, r2_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE


In [2]:
# Mount Google Drive so files stored there are reachable under /content/drive
# (Colab-specific; `drive` comes from google.colab).
drive.mount('/content/drive')

# Load dataset: read the cleaned Airbnb CSV from the mounted Drive into `df`.
# NOTE(review): the path is hard-coded to one user's Drive layout — adjust before re-running elsewhere.
file_path = '/content/drive/My Drive/Colab Notebooks/Project 4/data/airbnb_cleaned.csv'
df = pd.read_csv(file_path)

Mounted at /content/drive


In [3]:
# Work on an independent copy so the frame loaded into `df` stays untouched
df_class = df.copy(deep=True)

In [4]:
# Convert Numeric Ratings to Categorical Labels
def categorize_reviews(rating):
    """Map a numeric review rating to a letter grade.

    Grades are assigned by the highest floor the rating reaches:
    >= 4.5 -> "A", >= 4.0 -> "B", >= 3.5 -> "C", >= 3.0 -> "D".
    Anything that reaches no floor (including NaN, for which every
    comparison is False) is graded "F".
    """
    # Ordered from the highest floor down, so the first match wins.
    grade_floors = ((4.5, "A"), (4.0, "B"), (3.5, "C"), (3.0, "D"))
    for floor, grade in grade_floors:
        if rating >= floor:
            return grade
    return "F"

# Grade every listing: run the numeric rating column through categorize_reviews
review_scores = df_class['review rate number']
df_class['review_category'] = review_scores.apply(categorize_reviews)

In [5]:
# Keep only the listings whose grade is something other than 'F'
not_failing = df_class['review_category'].ne('F')
df_filtered = df_class.loc[not_failing].copy()

# Sanity check: how many rows remain in each grade
category_counts = df_filtered['review_category'].value_counts()
print(category_counts)

review_category
A    22921
B    22879
D    22842
Name: count, dtype: int64


In [6]:
# Drop high-price outliers: anything above the 95th percentile of 'price'.
# NOTE: df_filtered is reassigned from itself, so re-running this cell on its
# own trims a further 5% each time — re-run the previous cell first.
high_price_threshold = np.percentile(df_filtered['price'], 95)
within_threshold = df_filtered['price'] <= high_price_threshold
df_filtered = df_filtered.loc[within_threshold].copy()

# Summary statistics of the remaining prices
price_summary = df_filtered['price'].describe()
print(price_summary)

count    65277.000000
mean       596.554606
std        314.228309
min         50.000000
25%        326.000000
50%        595.000000
75%        870.000000
max       1143.000000
Name: price, dtype: float64


In [7]:
# Split the frame into model inputs (X) and the label to predict (y).
# The numeric rating is removed together with the label because the label
# is derived directly from it.
target_column = 'review_category'
excluded_columns = ['review rate number', target_column]
X = df_filtered.drop(excluded_columns, axis=1)  # Features
y = df_filtered[target_column]  # Target variable


In [8]:
# One-hot encode the categorical feature columns; drop_first=True omits the
# first level of each encoded column.
X = pd.get_dummies(data=X, drop_first=True)


In [9]:
# Encode the letter grades as consecutive integer codes (LabelEncoder assigns
# codes in sorted class order); keep the fitted encoder to map codes back.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_numeric = label_encoder.transform(y)

# Stratified 80/20 train-test split with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y_numeric)
X_train, X_test, y_train, y_test = train_test_split(X, y_numeric, **split_options)

In [10]:
# Initialize and Train Linear Regression Model
# Baseline only: this regresses on the integer class codes as if they were a
# continuous quantity, then snaps predictions back to the nearest class.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions (continuous values, not class codes)
y_pred_lr = lr.predict(X_test)

# Convert continuous predictions to the closest valid integer category.
# A linear model is unbounded, so a rounded prediction can fall below 0 or
# above the largest class code; inverse_transform raises ValueError on such
# unseen codes. Clip to the encoder's valid range before decoding.
max_class_code = len(label_encoder.classes_) - 1
y_pred_lr_rounded = np.clip(np.round(y_pred_lr), 0, max_class_code).astype(int)
y_pred_lr_categorized = label_encoder.inverse_transform(y_pred_lr_rounded)

# Evaluate Model Performance (regression metrics use the raw, unrounded output)
print("\nLinear Regression Results:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred_lr))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred_lr))
print("R-Squared Score:", r2_score(y_test, y_pred_lr))

# Show classification report on the decoded letter grades
print("\nClassification Report for Linear Regression:")
print(classification_report(label_encoder.inverse_transform(y_test), y_pred_lr_categorized))


Linear Regression Results:
Mean Absolute Error (MAE): 0.6800624720634336
Mean Squared Error (MSE): 0.6696478644454751
R-Squared Score: -0.0025271540844735707

Classification Report for Linear Regression:
              precision    recall  f1-score   support

           A       0.67      0.00      0.00      4380
           B       0.33      1.00      0.50      4335
           D       0.53      0.00      0.00      4341

    accuracy                           0.33     13056
   macro avg       0.51      0.33      0.17     13056
weighted avg       0.51      0.33      0.17     13056



In [11]:
# ---- LightGBM Model ----
# Gradient-boosted classifier fitted on the training split and scored on the
# held-out test split.
lgbm_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 10,
    'random_state': 42,
}
lgbm = LGBMClassifier(**lgbm_params)
lgbm.fit(X_train, y_train)
y_pred_lgbm = lgbm.predict(X_test)

lgbm_accuracy = accuracy_score(y_test, y_pred_lgbm)
lgbm_report = classification_report(y_test, y_pred_lgbm)
print("LightGBM Accuracy:", lgbm_accuracy)
print("\nLightGBM Classification Report:\n", lgbm_report)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004648 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2237
[LightGBM] [Info] Number of data points in the train set: 52221, number of used features: 168
[LightGBM] [Info] Start training from score -1.092142
[LightGBM] [Info] Start training from score -1.102469
[LightGBM] [Info] Start training from score -1.101258
LightGBM Accuracy: 0.46300551470588236

LightGBM Classification Report:
               precision    recall  f1-score   support

           0       0.46      0.49      0.47      4380
           1       0.46      0.46      0.46      4335
           2       0.47      0.45      0.46      4341

    accuracy                           0.46     13056
   macro avg       0.46      0.46      0.46     13056
weighted avg       0.46      0.46      0.46     13056



In [12]:
# ---- XGBoost Model ----

# y_train / y_test already hold the integer class codes produced when the
# target was encoded before the train-test split, so they can be passed to
# XGBoost as-is. `label_encoder` is deliberately NOT re-created here: fitting
# a new encoder on the integer codes would be an identity mapping and would
# overwrite the letter <-> code encoder that other cells rely on.
xgb = XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=10, random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Print results (labels are the integer class codes, same as the other models)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("\nXGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))




XGBoost Accuracy: 0.5366881127450981

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.54      0.54      4380
           1       0.54      0.52      0.53      4335
           2       0.54      0.55      0.54      4341

    accuracy                           0.54     13056
   macro avg       0.54      0.54      0.54     13056
weighted avg       0.54      0.54      0.54     13056



In [13]:
# ---- Random Forest Model ----
# 300-tree forest with otherwise default settings, fitted on the full feature set.
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("\nRandom Forest Accuracy:", rf_accuracy)
print("\nRandom Forest Classification Report:\n", rf_report)


Random Forest Accuracy: 0.6207873774509803

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.64      0.63      4380
           1       0.62      0.61      0.61      4335
           2       0.63      0.62      0.62      4341

    accuracy                           0.62     13056
   macro avg       0.62      0.62      0.62     13056
weighted avg       0.62      0.62      0.62     13056



In [14]:
# ---- Optimized Random Forest Model ----
# Same forest size as before, but with tree depth capped and a larger
# minimum split size.
rf_optimized_params = {
    'n_estimators': 300,
    'max_depth': 15,
    'min_samples_split': 5,
    'random_state': 42,
}
rf_optimized = RandomForestClassifier(**rf_optimized_params)
rf_optimized.fit(X_train, y_train)
y_pred_rf_opt = rf_optimized.predict(X_test)

rf_opt_accuracy = accuracy_score(y_test, y_pred_rf_opt)
rf_opt_report = classification_report(y_test, y_pred_rf_opt)
print("Optimized Random Forest Accuracy:", rf_opt_accuracy)
print("\nOptimized Random Forest Classification Report:\n", rf_opt_report)


Optimized Random Forest Accuracy: 0.45358455882352944

Optimized Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.58      0.49      4380
           1       0.47      0.39      0.43      4335
           2       0.49      0.39      0.43      4341

    accuracy                           0.45     13056
   macro avg       0.46      0.45      0.45     13056
weighted avg       0.46      0.45      0.45     13056



In [15]:
# ---- Feature Importance Analysis ----
# Pair each feature column with the importance reported by the fitted `rf`
# model, ranked from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': rf.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

In [16]:
# Keep only the features whose importance reaches the 0.02 cut-off
is_important = feature_importance_df['Importance'] >= 0.02
important_features = feature_importance_df.loc[is_important, 'Feature'].tolist()
X_optimized = X[important_features]

In [17]:
# Retrain with selected features
# Same split settings (test_size, seed, stratification) as the full-feature split.
X_train_opt, X_test_opt, y_train_opt, y_test_opt = train_test_split(X_optimized, y, test_size=0.2, random_state=42, stratify=y)

# Fit a separate estimator instead of refitting `rf` in place. Refitting `rf`
# on the reduced feature set would change the length of rf.feature_importances_
# so it no longer lines up with X.columns, breaking the feature-importance cell
# on re-run. Hyperparameters match `rf`, so the results are the same as a refit.
rf_selected = RandomForestClassifier(n_estimators=300, random_state=42)
rf_selected.fit(X_train_opt, y_train_opt)
y_pred_opt = rf_selected.predict(X_test_opt)
print("\nNew Optimized Feature Set Accuracy:", accuracy_score(y_test_opt, y_pred_opt))
print("\nOptimized Feature Set Classification Report:\n", classification_report(y_test_opt, y_pred_opt))


New Optimized Feature Set Accuracy: 0.6500459558823529

Optimized Feature Set Classification Report:
               precision    recall  f1-score   support

           A       0.64      0.66      0.65      4380
           B       0.65      0.64      0.64      4335
           D       0.66      0.65      0.66      4341

    accuracy                           0.65     13056
   macro avg       0.65      0.65      0.65     13056
weighted avg       0.65      0.65      0.65     13056



In [18]:
# ---- SMOTE for Handling Class Imbalance ----
# Resample the training split only; X_test / y_test are not passed in, so the
# evaluation data is left as it was.
# NOTE(review): the value_counts printed after filtering shows near-equal class
# sizes, so resampling may change very little here — confirm it is needed.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [19]:
# Fit a 100-tree Random Forest on the SMOTE-resampled training data and score
# it on the original (non-resampled) test split.
rf_smote = RandomForestClassifier(n_estimators=100, random_state=42)
rf_smote.fit(X_resampled, y_resampled)
y_pred_smote = rf_smote.predict(X_test)

smote_accuracy = accuracy_score(y_test, y_pred_smote)
smote_report = classification_report(y_test, y_pred_smote)
print("\nBalanced Data Accuracy:", smote_accuracy)
print("\nBalanced Data Classification Report:\n", smote_report)


Balanced Data Accuracy: 0.6109068627450981

Balanced Data Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.63      0.61      4380
           1       0.62      0.59      0.60      4335
           2       0.62      0.61      0.62      4341

    accuracy                           0.61     13056
   macro avg       0.61      0.61      0.61     13056
weighted avg       0.61      0.61      0.61     13056

