In [1]:
import sys
sys.path.append("../../../../")

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pricer.evaluate import evaluate
from pricer.items import Item
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge , Lasso , LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer,StandardScaler
from tqdm.notebook import tqdm

In [3]:
# Dataset selection: set LITE_MODE to True to load "items_lite" instead of "items_full".
# (An earlier revision of this cell loaded "ujalaarshad17/items_transformed_full".)
LITE_MODE = False
username = "ed-donner"
dataset_variant = "items_lite" if LITE_MODE else "items_full"
dataset = f"{username}/{dataset_variant}"

# Fetch the train / validation / test splits in a single call.
train, val, test = Item.get_from_hub(dataset)

split_sizes = f"{len(train):,} training items, {len(val):,} validation items, {len(test):,} test items"
print(f"Loaded {split_sizes}")

Loaded 800,000 training items, 10,000 validation items, 10,000 test items


In [13]:
def _char_len(texts):
    """Return an (n, 1) array holding the character count of each text.

    Defined as a named module-level function instead of a lambda so that the
    pipeline remains picklable: scikit-learn's FunctionTransformer cannot be
    pickled when it wraps a lambda.
    """
    return np.array([len(t) for t in texts]).reshape(-1, 1)


# Linear regression on the raw summary text.
# Features: TF-IDF over uni- and bi-grams, plus the text's character length.
pipeline = Pipeline(steps=[
    (
        "features",
        FeatureUnion([
            (
                "tfidf",
                TfidfVectorizer(
                    max_features=2000,
                    stop_words="english",
                    ngram_range=(1, 2),
                    sublinear_tf=True
                )
            ),
            (
                "char_len",
                FunctionTransformer(
                    _char_len,
                    validate=False
                )
            )
        ])
    ),
    (
        "normalize",
        # with_mean=False: scale only, no centering, so the sparse TF-IDF
        # matrix does not have to be densified.
        StandardScaler(with_mean=False)
    ),
    (
        "model",
        LinearRegression()
    )
])


In [4]:
def get_dataframe(train, test):
    """Turn the train and test items into DataFrames of simple numeric features.

    Each row has four columns:
      * ``text``           - the *length* of ``item.summary`` (not the text itself)
      * ``weight``         - ``item.weight`` as-is
      * ``weight_unknown`` - 1 when ``item.weight == 0``, otherwise 0
      * ``price``          - ``item.price`` converted to float

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames.
    """

    def build_frame(items):
        # One dict per item; tqdm shows progress over the collection.
        rows = []
        for item in tqdm(items):
            rows.append({
                "text": len(item.summary),
                "weight": item.weight,
                "weight_unknown": 1 if item.weight == 0 else 0,
                "price": float(item.price),
            })
        return pd.DataFrame(rows)

    return build_frame(train), build_frame(test)


In [5]:
# Materialise the numeric-feature frames for the baseline model.
train_df, test_df = get_dataframe(train, test)

  0%|          | 0/800000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [6]:
# Inspect the (rows, columns) of the numeric training frame.
train_df.shape

(800000, 4)

In [7]:
# Baseline: ordinary least squares on the three numeric columns.
features = ["text", "weight", "weight_unknown"]
X_train = train_df[features]
y_train = train_df["price"]

lr = LinearRegression()
lr.fit(X_train, y_train)

In [8]:
def predict_lr(item):
    """Predict a single item's price with the numeric-feature model ``lr``.

    Builds a one-row DataFrame with the same three columns the model was
    fitted on (summary length, weight, weight-unknown flag) and returns the
    first (only) prediction.

    Note: this name is defined again later in the notebook with a different
    implementation.
    """
    weight = item.weight
    row = {
        "text": len(item.summary),
        "weight": weight,
        "weight_unknown": 1 if item.weight == 0 else 0,
    }
    single_row_frame = pd.DataFrame([row])
    return lr.predict(single_row_frame)[0]

In [9]:
# Spot-check: baseline prediction for the first test item.
predict_lr(test[0])

163.62758186604358

In [10]:
# Run the project's evaluate helper on the numeric-feature baseline predictor.
evaluate(predict_lr, test)

  0%|          | 0/200 [00:00<?, ?it/s]

[93m$55 [92m$29 [91m$87 [91m$90 [91m$92 [93m$60 [92m$1 [93m$78 [91m$104 [91m$192 [91m$556 [91m$228 [91m$141 [91m$81 [93m$50 [91m$124 [93m$69 [91m$83 [93m$50 [92m$8 [92m$10 [92m$9 [93m$61 [92m$2 [91m$175 [91m$298 [91m$345 [91m$114 [92m$34 [93m$55 [91m$100 [93m$77 [92m$10 [92m$38 [92m$24 [91m$676 [93m$78 [91m$94 [93m$50 [91m$90 [93m$48 [93m$55 [91m$83 [93m$102 [93m$82 [91m$119 [91m$100 [91m$106 [92m$2 [93m$52 [91m$106 [92m$9 [91m$353 [92m$28 [91m$92 [92m$26 [91m$136 [91m$110 [92m$37 [91m$121 [91m$97 [93m$69 [93m$43 [92m$17 [91m$460 [93m$41 [93m$83 [91m$288 [92m$4 [91m$86 [91m$109 [91m$101 [91m$139 [91m$113 [91m$99 [91m$103 [92m$8 [91m$122 [91m$122 [91m$111 [92m$25 [91m$107 [91m$98 [92m$10 [91m$108 [93m$63 [93m$60 [91m$164 [92m$33 [91m$82 [91m$109 [93m$41 [91m$101 [91m$94 [91m$114 [91m$104 [91m$97 [92m$9 [91m$151 [91m$426 [92m$37 [92m$11 [91m$135 [92m$5 [91m$100 [92m$25 [91m$107 [91

In [11]:
def get_dataframe(train, test):
    """Build text/price DataFrames from the train and test items.

    Each row holds the item's summary text under ``text`` and its price,
    converted to float, under ``price``.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames.
    """

    def to_frame(items):
        # tqdm shows progress while the records are collected.
        records = [
            {"text": item.summary, "price": float(item.price)}
            for item in tqdm(items)
        ]
        return pd.DataFrame(records)

    return to_frame(train), to_frame(test)

# Rebuild the frames with raw summary text; this overwrites the earlier numeric train_df / test_df.
train_df, test_df = get_dataframe(train, test)


  0%|          | 0/800000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [14]:
# Fit the text pipeline: summary text as input, price as target.
pipeline.fit(train_df['text'], train_df['price'])

In [15]:
# Show the raw summary text of the first test item.
test[0].summary

'Title: Excess V2 Distortion/Modulation Pedal  \nCategory: Music Pedals  \nBrand: Old Blood Noise  \nDescription: A versatile pedal offering distortion and three modulation modes—delay, chorus, and harmonized fifths—with full control over signal routing and expression.  \nDetails: Features include separate gain, tone, and volume controls; time, depth, and volume per modulation; order switching, soft‑touch bypass, and expression jack for dynamic control.'

In [16]:
# Spot-check: predict() takes a list of texts, so wrap the single summary and unwrap the result.
pipeline.predict([test[0].summary])[0]

156.74209098736836

In [17]:
def predict_lr(item):
    """Predict a single item's price from its summary text via ``pipeline``.

    The pipeline expects a sequence of documents, so the summary is wrapped
    in a one-element list and the first prediction is returned.
    """
    predictions = pipeline.predict([item.summary])
    return predictions[0]

In [18]:
# Collect every test item's summary so the pipeline can predict them in one batch.
texts = [item.summary for item in test]

In [19]:
# Batch-predict all collected test summaries in a single call.
pipeline.predict(texts)

array([156.74209099, 237.56898016,  -8.07069967, ..., 303.75693473,
       224.11170403, 147.81299834])

In [20]:
# Run the project's evaluate helper; predict_lr now refers to the text-pipeline version defined above.
evaluate(predict_lr, test)

  0%|          | 0/200 [00:00<?, ?it/s]

[93m$62 [91m$122 [93m$63 [92m$17 [93m$44 [91m$189 [92m$29 [92m$21 [92m$38 [92m$57 [91m$539 [91m$158 [91m$125 [91m$166 [92m$28 [91m$87 [93m$45 [92m$38 [93m$50 [92m$29 [93m$42 [92m$24 [92m$22 [92m$10 [91m$221 [91m$242 [93m$121 [92m$3 [92m$36 [93m$76 [93m$53 [91m$122 [92m$28 [92m$20 [91m$99 [91m$463 [92m$18 [93m$64 [91m$169 [93m$58 [91m$119 [93m$76 [93m$62 [92m$38 [91m$94 [91m$85 [92m$24 [93m$53 [92m$28 [92m$6 [92m$24 [91m$103 [93m$140 [92m$4 [93m$72 [91m$83 [93m$44 [91m$184 [92m$11 [92m$7 [92m$35 [92m$16 [92m$0 [93m$52 [91m$328 [91m$100 [92m$26 [91m$264 [92m$14 [91m$207 [92m$20 [92m$20 [92m$11 [91m$152 [93m$43 [92m$29 [91m$167 [92m$12 [92m$24 [91m$150 [91m$86 [93m$44 [92m$23 [93m$41 [92m$3 [91m$94 [93m$57 [93m$73 [93m$65 [91m$230 [92m$22 [91m$126 [91m$96 [92m$22 [92m$14 [93m$84 [92m$10 [92m$32 [91m$109 [93m$167 [93m$59 [93m$61 [92m$20 [92m$22 [92m$6 [92m$13 [93m$53 [91m$219 [92m$3

### With 5000 selected features we are off by 177, with 10000 features by 197, and with 2000 features by 187 (all in $).

### So we performed well compared to our mean baseline, but there is still a long way to go. Now moving on to Ridge and Lasso regression.

## Now moving on to Lasso and Ridge

In [21]:
def _summary_lengths(texts):
    """Return an (n, 1) array holding the character count of each text.

    A named module-level function is used instead of a lambda so that the
    resulting pipelines stay picklable (a FunctionTransformer wrapping a
    lambda cannot be pickled).
    """
    return np.array([len(t) for t in texts]).reshape(-1, 1)


def _make_text_pipeline(model):
    """Build the shared text-regression pipeline around ``model``.

    Steps (names kept stable so they can be addressed via ``named_steps``):
      * ``features``  - TF-IDF over uni-/bi-grams unioned with character length
      * ``normalize`` - scaling without centering, so sparse input stays sparse
      * ``model``     - the regressor passed in

    Args:
        model: an unfitted scikit-learn regressor used as the final step.

    Returns:
        An unfitted ``Pipeline``.
    """
    return Pipeline(steps=[
        (
            "features",
            FeatureUnion([
                (
                    "tfidf",
                    TfidfVectorizer(
                        max_features=2000,
                        stop_words="english",
                        ngram_range=(1, 2),
                        sublinear_tf=True
                    )
                ),
                (
                    "char_len",
                    FunctionTransformer(_summary_lengths, validate=False)
                )
            ])
        ),
        (
            "normalize",
            StandardScaler(with_mean=False)
        ),
        (
            "model",
            model
        )
    ])


# Identical feature extraction for both; only the final regressor differs.
lasso_pipeline = _make_text_pipeline(Lasso())

# Ridge pipeline
ridge_pipeline = _make_text_pipeline(Ridge())


In [22]:
def predict_lasso(item):
    """Predict a single item's price from its summary text via ``lasso_pipeline``."""
    predictions = lasso_pipeline.predict([item.summary])
    return predictions[0]


# Train on summary text -> price, then run the project's evaluate helper.
lasso_pipeline.fit(train_df["text"], train_df["price"])
evaluate(predict_lasso, test)

  0%|          | 0/200 [00:00<?, ?it/s]

[93m$52 [91m$82 [92m$17 [92m$24 [91m$87 [91m$157 [93m$43 [92m$15 [93m$58 [93m$93 [91m$546 [91m$186 [91m$94 [91m$157 [92m$21 [91m$92 [93m$49 [92m$39 [92m$38 [93m$48 [93m$48 [93m$61 [92m$14 [92m$21 [91m$210 [91m$276 [93m$110 [92m$33 [93m$61 [93m$50 [93m$49 [91m$152 [92m$13 [93m$54 [93m$44 [91m$485 [93m$59 [91m$83 [91m$162 [93m$78 [91m$123 [91m$96 [92m$20 [92m$7 [91m$116 [93m$70 [93m$55 [93m$65 [92m$27 [92m$18 [93m$49 [91m$90 [91m$194 [92m$1 [91m$105 [93m$57 [93m$58 [91m$190 [92m$13 [92m$3 [93m$75 [93m$46 [92m$13 [92m$24 [91m$407 [91m$106 [92m$9 [91m$270 [93m$53 [91m$166 [92m$21 [93m$40 [92m$37 [91m$148 [93m$61 [91m$104 [91m$157 [92m$12 [92m$23 [91m$115 [91m$101 [93m$66 [92m$23 [92m$35 [92m$18 [93m$43 [92m$32 [93m$88 [93m$54 [91m$203 [92m$11 [91m$94 [93m$57 [92m$24 [92m$27 [91m$103 [92m$21 [92m$15 [91m$133 [93m$194 [93m$54 [93m$46 [92m$26 [92m$24 [92m$14 [92m$33 [93m$53 [91m$199 [9

In [23]:
def predict_ridge(item):
    """Predict a single item's price from its summary text via ``ridge_pipeline``."""
    predictions = ridge_pipeline.predict([item.summary])
    return predictions[0]


# Train on summary text -> price, then run the project's evaluate helper.
ridge_pipeline.fit(train_df["text"], train_df["price"])
evaluate(predict_ridge, test)

  0%|          | 0/200 [00:00<?, ?it/s]

[93m$62 [91m$122 [93m$63 [92m$17 [93m$44 [91m$189 [92m$29 [92m$21 [92m$38 [92m$57 [91m$539 [91m$158 [91m$125 [91m$166 [92m$28 [91m$87 [93m$45 [92m$38 [93m$50 [92m$29 [93m$42 [92m$24 [92m$22 [92m$9 [91m$221 [91m$242 [93m$121 [92m$3 [92m$36 [93m$76 [93m$53 [91m$122 [92m$28 [92m$20 [91m$99 [91m$463 [92m$18 [93m$64 [91m$169 [93m$58 [91m$119 [93m$76 [93m$62 [92m$38 [91m$94 [91m$85 [92m$24 [93m$53 [92m$28 [92m$6 [92m$24 [91m$102 [93m$140 [92m$4 [93m$72 [91m$83 [93m$44 [91m$185 [92m$11 [92m$7 [92m$35 [92m$16 [92m$0 [93m$52 [91m$329 [91m$100 [92m$26 [91m$264 [92m$14 [91m$207 [92m$20 [92m$20 [92m$11 [91m$152 [93m$43 [92m$29 [91m$167 [92m$12 [92m$24 [91m$150 [91m$86 [93m$44 [92m$23 [93m$41 [92m$3 [91m$94 [93m$57 [93m$73 [93m$65 [91m$229 [92m$22 [91m$126 [91m$96 [92m$22 [92m$14 [93m$84 [92m$10 [92m$32 [91m$109 [93m$167 [93m$59 [93m$61 [92m$20 [92m$22 [92m$6 [92m$13 [93m$53 [91m$219 [92m$35

# Final remarks: Linear Regression and Ridge Regression performed almost the same, which gives us our best value so far: we are off by 75$. Next up: SVM.