In [6]:
import numpy as np
import pandas as pd
import transformers
import torch
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [7]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [8]:
test = pd.read_csv("test.csv")

In [9]:
test.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID
0,604373,Manuel d'Héliogravure Et de Photogravure En Re...,,,6142
1,1729783,DCGARING Microfiber Throw Blanket Warm Fuzzy P...,[QUALITY GUARANTEED: Luxury cozy plush polyest...,<b>DCGARING Throw Blanket</b><br><br> <b>Size ...,1622
2,1871949,I-Match Auto Parts Front License Plate Bracket...,"[Front License Plate Bracket Made Of Plastic,D...",Replacement for The Following Vehicles:2020 LE...,7540
3,1107571,PinMart Gold Plated Excellence in Service 1 Ye...,[Available as a single item or bulk packed. Se...,Our Excellence in Service Lapel Pins feature a...,12442
4,624253,"Visual Mathematics, Illustrated by the TI-92 a...",,,6318


In [10]:
train = pd.read_csv("train.csv")

In [11]:
train.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [12]:
train.describe()


Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID,PRODUCT_LENGTH
count,2249698.0,2249698.0,2249698.0
mean,1499795.0,4000.456,4071.839
std,866194.4,3966.146,1351685.0
min,1.0,0.0,1.0
25%,749479.5,230.0,511.811
50%,1499558.0,2916.0,663.0
75%,2250664.0,6403.0,1062.992
max,2999999.0,13420.0,1885801000.0


In [13]:
test.describe()

Unnamed: 0,PRODUCT_ID,PRODUCT_TYPE_ID
count,734736.0,734736.0
mean,1493725.0,4001.628103
std,866977.3,3965.893339
min,0.0,0.0
25%,739673.5,228.0
50%,1492776.0,2916.0
75%,2242406.0,6396.0
max,2999998.0,13420.0


In [14]:
train.shape

(2249698, 6)

In [15]:
test.shape

(734736, 5)

In [16]:
train = train.sample(n=10000, random_state=123)

In [17]:
train = train[train['PRODUCT_LENGTH'] >= 0]
train['PRODUCT_LENGTH'] = np.log1p(train['PRODUCT_LENGTH'])

In [18]:
train_set, val_set = train_test_split(train, test_size=0.2, random_state=42)

In [19]:
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(train_set['TITLE'].fillna('') + ' ' + train_set['DESCRIPTION'].fillna('') + ' ' + train_set['BULLET_POINTS'].fillna(''))
val_features = vectorizer.transform(val_set['TITLE'].fillna('') + ' ' + val_set['DESCRIPTION'].fillna('') + ' ' + val_set['BULLET_POINTS'].fillna(''))

In [20]:
dtr = DecisionTreeRegressor(random_state=42)

In [27]:
dtr.fit(train_features, train_set['PRODUCT_LENGTH'].fillna(0))

In [28]:
val_predictions = dtr.predict(val_features)

In [29]:
mse = mean_squared_error(val_set['PRODUCT_LENGTH'], val_predictions)
print("Mean Squared Error:", mse)

Mean Squared Error: 1.4825076278679103


In [30]:
test_features = vectorizer.transform(test['TITLE'].fillna('') + ' ' + test['DESCRIPTION'].fillna('') + ' ' + test['BULLET_POINTS'].fillna(''))
test_predictions = dtr.predict(test_features)
submission_df = pd.DataFrame({'PRODUCT_ID': test['PRODUCT_ID'], 'PRODUCT_LENGTH': test_predictions})

In [31]:
submission_df.to_csv("submissions_dtr.csv", index=False)

In [32]:
submission_df.shape

(734736, 2)

In [33]:
import sklearn.metrics as metrics
score = max( 0 , 100*(1-metrics.mean_absolute_percentage_error(val_set['PRODUCT_LENGTH'],val_predictions)))

In [34]:
print(score)

86.99658603977981
