Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from data_cleaning import clean_tabular

Clean dataset using clean_tabular.py - removing unnecessary characters, decapitalisation, lemmatisation, removing stop words

In [2]:
clean = clean_tabular.CleanTabular('./data/Products.csv')
clean.clean()

Checking the shape

In [3]:
clean.df.shape

(7132, 10)

Setting my X varibles and target

In [4]:
X = clean.df[['product_name', 'product_description', 'location']]
y = clean.df['price']

Creating a transformer for each text column to put in the pipeline

In [5]:
transformer = ColumnTransformer(
    [('vect1', TfidfVectorizer(), 'product_name'),
     ('vect2', TfidfVectorizer(), 'product_description'),
     ('vect3', TfidfVectorizer(), 'location')],
    remainder='passthrough'
)

Creating pipeline with transformer and linear regression model

In [6]:
pipeline = Pipeline(
    [
        ("colt", transformer),
        ("lr", LinearRegression())
    ]
)

Setting parameters for grid search

In [7]:
parameters = {
    'colt__vect1__ngram_range': ((1, 1), (1, 2)),
    'colt__vect1__min_df': (1, 2),
    'colt__vect1__norm': ['l1', 'l2'],
    'colt__vect2__ngram_range': ((1, 2,), (1, 3)),
    'colt__vect2__min_df': (1, 2),
    'colt__vect2__norm': ['l1', 'l2'],
    'colt__vect3__norm': ['l1', 'l2']
}

Creating grid search

In [8]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

Train test split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [10]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5)

In [11]:
X_train.shape

(4992, 3)

In [12]:
y_train.shape

(4992,)

Run grid search

In [13]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


Took 71m for:
<br>parameters = {
<br>    'colt__vect1__ngram_range': ((1, 1), (1, 2)),
<br>    'colt__vect1__min_df': (1, 2, 3, 5),
<br>    'colt__vect1__norm': ['l1', 'l2'],
<br>    'colt__vect2__ngram_range': ((1, 1), (1, 2,), (1, 3)),
<br>    'colt__vect2__min_df': (1, 2, 3, 5),
<br>    'colt__vect2__norm': ['l1', 'l2'],
<br>    'colt__vect3__norm': ['l1', 'l2']
<br>}
<br>param search so will narrow down as ran into MemoryError.
<br>Best params result was:
<br>{'colt__vect1__min_df': 1,
<br> 'colt__vect1__ngram_range': (1, 2),
<br> 'colt__vect1__norm': 'l1',
<br> 'colt__vect2__min_df': 2,
<br> 'colt__vect2__ngram_range': (1, 3),
<br> 'colt__vect2__norm': 'l1',
<br> 'colt__vect3__norm': 'l2'}
<br>Now trying:
<br>parameters = {
<br>    'colt__vect1__ngram_range': ((1, 1), (1, 2)),
<br>    'colt__vect1__min_df': (1, 2),
<br>    'colt__vect1__norm': ['l1', 'l2'],
<br>    'colt__vect2__ngram_range': ((1, 2,), (1, 3)),
<br>    'colt__vect2__min_df': (1, 2),
<br>    'colt__vect2__norm': ['l1', 'l2'],
<br>    'colt__vect3__norm': ['l1', 'l2']
<br>}

In [None]:
grid_search.best_score_

-56.58382233916626

The best score is worse than before (-6) so I will use the previous best params

In [None]:
grid_search.best_params_

{'colt__vect1__min_df': 2,
 'colt__vect1__ngram_range': (1, 2),
 'colt__vect1__norm': 'l1',
 'colt__vect2__min_df': 1,
 'colt__vect2__ngram_range': (1, 3),
 'colt__vect2__norm': 'l2',
 'colt__vect3__norm': 'l1'}

Previous best params:
<br>{'colt__vect1__min_df': 1,
<br> 'colt__vect1__ngram_range': (1, 2),
<br> 'colt__vect1__norm': 'l1',
<br> 'colt__vect2__min_df': 2,
<br> 'colt__vect2__ngram_range': (1, 3),
<br> 'colt__vect2__norm': 'l1',
<br> 'colt__vect3__norm': 'l2'}

In [None]:
grid_search.best_estimator_

In [3]:
v = TfidfVectorizer()
x = v.fit_transform(clean.df['product_name'])
df_v_pn = pd.DataFrame(x.toarray(), columns=v.get_feature_names_out())
v = TfidfVectorizer()
x = v.fit_transform(clean.df['product_description'])
df_v_pd = pd.DataFrame(x.toarray(), columns=v.get_feature_names_out())
v = TfidfVectorizer()
x = v.fit_transform(clean.df['location'])
df_v_l = pd.DataFrame(x.toarray(), columns=v.get_feature_names_out())
clean.df = pd.concat([clean.df.reset_index(drop=True), df_v_pn.reset_index(drop=True), df_v_pd.reset_index(drop=True), df_v_l.reset_index(drop=True)], axis=1)

Using default params due to memoryerrors

In [4]:
X = clean.df.drop(clean.df.columns[0], axis=1)
X = X.drop(['id', 'product_name', 'category', 'product_description',
       'price', 'location', 'url', 'page_id', 'create_time'], axis=1)
y = clean.df['price']

In [5]:
reg = LinearRegression()

Transformed the text columns for tfidf vectorization using the best params from grid search, created X and y and a Linear Regression model instance

Train test split for linear regression model

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [7]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5)

Fit the model

In [8]:
reg.fit(X_train, y_train)

In [9]:
reg.score(X, y)

0.6295578788928321

Okay score!

Intercept:

In [10]:
reg.intercept_

array([-1.08310036e+04,  9.51153635e-04])

Coefficient:

In [11]:
reg.coef_

array([[ 1.07536077e+04,  6.48925010e+04, -5.24615483e+02, ...,
         8.88496082e+03,  1.57828153e+05, -3.32395141e+04],
       [-1.45756440e-03, -1.90868210e-03,  2.97067954e-06, ...,
         3.31979312e-04, -1.10926664e-03, -9.43482102e-05]])

Make prediction using test data:

In [12]:
y_pred = reg.predict(X_test)

Evaluation

In [13]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', metrics.r2_score(y_test, y_pred))

Mean Absolute Error: 36507.716160664975
Mean Squared Error: 10345259847.045334
Root Mean Squared Error: -14116.922303486475
