In [1]:
import sys

# Put the directory four levels up on the import path (presumably so the local
# `pricer` package imported below can be found -- TODO confirm the layout).
# The membership check keeps repeated runs of this cell from stacking up
# duplicate entries in sys.path.
if "../../../../" not in sys.path:
    sys.path.append("../../../../")

In [2]:
!pip install -q xgboost

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pricer.evaluate import evaluate
from pricer.items import Item
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.notebook import tqdm

In [4]:
# Configuration: LITE_MODE selects the "items_lite" dataset instead of "items_full".
LITE_MODE = False
username = "ed-donner"

if LITE_MODE:
    dataset = f"{username}/items_lite"
else:
    dataset = f"{username}/items_full"

# Item.get_from_hub yields the three splits unpacked here.
train, val, test = Item.get_from_hub(dataset)
print(f"Loaded {len(train):,} training items, {len(val):,} validation items, {len(test):,} test items")

Loaded 800,000 training items, 10,000 validation items, 10,000 test items


In [5]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Every tree model below is fed the same TF-IDF text features. Keeping the
# settings in one place stops the four pipelines from drifting apart.
TFIDF_PARAMS = {
    "max_features": 2000,
    "stop_words": "english",
    "ngram_range": (1, 2),
    "sublinear_tf": True,
}

# One seed shared by all estimators so runs are repeatable.
RANDOM_STATE = 42


def _build_text_pipeline(model):
    """Return a Pipeline that TF-IDF-vectorizes raw text and passes it to `model`.

    Args:
        model: an unfitted regressor used as the final "model" step.

    Returns:
        A two-step sklearn Pipeline with steps named "tfidf" and "model".
        A new TfidfVectorizer is created on every call, so no two pipelines
        share a vectorizer instance.
    """
    return Pipeline(steps=[
        ("tfidf", TfidfVectorizer(**TFIDF_PARAMS)),
        ("model", model),
    ])


# Decision Tree pipeline
decision_tree_pipeline = _build_text_pipeline(
    DecisionTreeRegressor(max_depth=None, random_state=RANDOM_STATE)
)

# Random Forest pipeline
random_forest_pipeline = _build_text_pipeline(
    RandomForestRegressor(n_estimators=100, max_depth=None, random_state=RANDOM_STATE)
)

# Gradient Boosting pipeline
gradient_boosting_pipeline = _build_text_pipeline(
    GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=RANDOM_STATE,
    )
)

# XGBoost pipeline
xgboost_pipeline = _build_text_pipeline(
    XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=RANDOM_STATE,
        objective='reg:squarederror',
    )
)

In [6]:
def get_dataframe(train, test):
    """Convert two collections of items into (train, test) DataFrames.

    Each item contributes one row: its `summary` under the "text" column and
    its `price`, cast to float, under the "price" column. Iteration over each
    collection is wrapped in tqdm so a progress bar is shown per split.

    Returns:
        A tuple `(train_frame, test_frame)` of pandas DataFrames.
    """
    def to_frame(items):
        # Collect plain dict records first, then build the frame in one go.
        records = []
        for item in tqdm(items):
            records.append({"text": item.summary, "price": float(item.price)})
        return pd.DataFrame(records)

    return to_frame(train), to_frame(test)

In [7]:
# Build the "text"/"price" frames for the training and test splits.
train_df, test_df = get_dataframe(train, test)

  0%|          | 0/800000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [None]:
# Fit all four pipelines on the training summaries and prices.
# The XGBoost pipeline has to be fitted here too: it is pickled, reloaded and
# evaluated further down, and predicting with an unfitted pipeline raises.
for pipeline in (
    decision_tree_pipeline,
    random_forest_pipeline,
    gradient_boosting_pipeline,
    xgboost_pipeline,
):
    pipeline.fit(train_df["text"], train_df["price"])

In [8]:
def predict_dt(item):
    """Return the decision-tree pipeline's prediction for a single item's summary."""
    # The pipeline expects a batch, so wrap the one summary in a list.
    predictions = decision_tree_pipeline.predict([item.summary])
    return predictions[0]

def predict_rf(item):
    """Return the random-forest pipeline's prediction for a single item's summary."""
    # The pipeline expects a batch, so wrap the one summary in a list.
    predictions = random_forest_pipeline.predict([item.summary])
    return predictions[0]

def predict_gb(item):
    """Return the gradient-boosting pipeline's prediction for a single item's summary."""
    # The pipeline expects a batch, so wrap the one summary in a list.
    predictions = gradient_boosting_pipeline.predict([item.summary])
    return predictions[0]

def predict_xgb(item):
    """Return the XGBoost pipeline's prediction for a single item's summary."""
    # The pipeline expects a batch, so wrap the one summary in a list.
    predictions = xgboost_pipeline.predict([item.summary])
    return predictions[0]

In [9]:
import pickle
import os

In [None]:
# Registry of the pipelines to persist, keyed by the name used in the file path.
pipelines = {
    "decision_tree": decision_tree_pipeline,
    "random_forest": random_forest_pipeline,
    "gradient_boosting": gradient_boosting_pipeline,
    "xgboost": xgboost_pipeline
}

# Make sure the output directory is there before writing into it.
os.makedirs("models", exist_ok=True)

# This cell only serialises the pipeline objects as they currently are;
# any fitting has to have happened before it runs.
for name in pipelines:
    pipeline = pipelines[name]

    with open(f"models/{name}_pipeline.pkl", "wb") as f:
        pickle.dump(pipeline, f)

    print(f"{name} pipeline saved successfully!")

In [10]:
def _load_pipeline(path):
    """Unpickle and return the object stored at `path`.

    pickle.load can execute arbitrary code while deserialising, so only use
    this on files written by this notebook or another trusted source.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


# Rebind the module-level pipeline names to the versions stored on disk.
decision_tree_pipeline = _load_pipeline("models/decision_tree_pipeline.pkl")
random_forest_pipeline = _load_pipeline("models/random_forest_pipeline.pkl")
gradient_boosting_pipeline = _load_pipeline("models/gradient_boosting_pipeline.pkl")
xgboost_pipeline = _load_pipeline("models/xgboost_pipeline.pkl")



In [11]:
def predict_dt(item):
    """Return the decision-tree pipeline's prediction for a single item's summary.

    The global `decision_tree_pipeline` is looked up when the function is
    called, so whichever object the name is bound to at that moment is used.
    """
    batch = [item.summary]
    predictions = decision_tree_pipeline.predict(batch)
    return predictions[0]

def predict_rf(item):
    """Return the random-forest pipeline's prediction for a single item's summary.

    The global `random_forest_pipeline` is looked up when the function is
    called, so whichever object the name is bound to at that moment is used.
    """
    batch = [item.summary]
    predictions = random_forest_pipeline.predict(batch)
    return predictions[0]

def predict_gb(item):
    """Return the gradient-boosting pipeline's prediction for a single item's summary.

    The global `gradient_boosting_pipeline` is looked up when the function is
    called, so whichever object the name is bound to at that moment is used.
    """
    batch = [item.summary]
    predictions = gradient_boosting_pipeline.predict(batch)
    return predictions[0]

def predict_xgb(item):
    """Return the XGBoost pipeline's prediction for a single item's summary.

    The global `xgboost_pipeline` is looked up when the function is called,
    so whichever object the name is bound to at that moment is used.
    """
    batch = [item.summary]
    predictions = xgboost_pipeline.predict(batch)
    return predictions[0]

In [None]:
# Run the project's `evaluate` helper with the decision-tree predictor on the test split.
evaluate(predict_dt, test)

In [None]:
# Run the project's `evaluate` helper with the random-forest predictor on the test split.
evaluate(predict_rf, test)

In [None]:
# Run the project's `evaluate` helper with the gradient-boosting predictor on the test split.
evaluate(predict_gb, test)

In [None]:
# Run the project's `evaluate` helper with the XGBoost predictor on the test split.
evaluate(predict_xgb, test)

## Random Forest was the best of these tree-based models at $182 off; for comparison, Ridge Regression has so far got us to $169 off.