In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from textblob import TextBlob
import joblib


import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the libraries used below; consider narrowing the filter to specific categories.
warnings.filterwarnings(action="ignore")

In [22]:
# Load the thrift-clothing price dataset from the Kaggle input mount.
train_df = pd.read_csv(
    '/kaggle/input/thrifting-clothes-dataset-with-mrp/thrifting_clothes_dataset_with_mrp.csv'
)

In [23]:
# Preview the first rows to sanity-check that the file parsed as expected.
train_df.head()

Unnamed: 0,Category,Style,Brand,Size,Fit Type,Material,Gender,Condition,Wear and Tear,Age,MRP,Second Hand Price
0,Bottoms,Formal,Nike,XL,Oversized,Silk,Women's,Like New,Fading,less than 1,292.76,174.5
1,Tops,Festival,Ralph Lauren,XL,Regular Fit,Wool,Men's,Good,Fading,3+ yr,194.99,90.66
2,Bottoms,Casual,Nike,XL,Regular Fit,Wool,Women's,Good,Fading,less than 1,328.73,154.59
3,Bottoms,Formal,Ralph Lauren,S,Oversized,Cotton,Unisex,Fair,Fading,1-3 yr,164.19,86.0
4,Tops,Formal,H&M,L,Slim Fit,Silk,Men's,Good,Stains,less than 1,362.83,210.91


In [24]:
# List the column labels available for modelling.
train_df.columns

Index(['Category', 'Style', 'Brand', 'Size', 'Fit Type', 'Material', 'Gender',
       'Condition', 'Wear and Tear', 'Age', 'MRP', 'Second Hand Price'],
      dtype='object')

In [25]:
# (rows, columns) of the loaded frame.
train_df.shape

(5000, 12)

In [26]:
# Summary statistics; before encoding, describe() reports only the numeric columns.
train_df.describe()

Unnamed: 0,MRP,Second Hand Price
count,5000.0,5000.0
mean,273.252198,142.530886
std,130.173195,70.01152
min,50.04,16.13
25%,159.4125,83.5425
50%,272.065,140.955
75%,386.2425,198.385
max,499.98,351.9


In [27]:
# Dtypes and non-null counts per column; use this to spot columns with missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Category           5000 non-null   object 
 1   Style              5000 non-null   object 
 2   Brand              5000 non-null   object 
 3   Size               5000 non-null   object 
 4   Fit Type           5000 non-null   object 
 5   Material           5000 non-null   object 
 6   Gender             5000 non-null   object 
 7   Condition          5000 non-null   object 
 8   Wear and Tear      3731 non-null   object 
 9   Age                5000 non-null   object 
 10  MRP                5000 non-null   float64
 11  Second Hand Price  5000 non-null   float64
dtypes: float64(2), object(10)
memory usage: 468.9+ KB


In [28]:
# Columns holding string categories that must become integer codes before modelling.
categorical_features = [
    'Category',
    'Style',
    'Brand',
    'Condition',
    'Gender',
    'Size',
    'Fit Type',
    'Material',
    'Wear and Tear',
    'Age',
]

# Fitted encoder per column name, filled in by the encoding loop.
label_encoders = dict()

In [29]:
# Replace each categorical column with integer codes and keep the fitted
# encoder so the same string -> code mapping can be reapplied to new data.

# Explicit stand-in for missing entries (train_df.info() reports nulls in
# 'Wear and Tear'). NOTE(review): chosen to sort after the labels seen in the
# recorded output so existing codes should be unchanged — confirm against the data.
MISSING_LABEL = 'Unknown'

for feature in categorical_features:
    column = train_df[feature]
    if pd.api.types.is_numeric_dtype(column):
        # Already encoded by an earlier run of this cell; refitting on the
        # integer codes would overwrite the stored string -> code mapping.
        continue
    le = LabelEncoder()
    # Give missing values their own string class instead of handing a mixed
    # str/NaN column to LabelEncoder, so classes_ contains only strings.
    train_df[feature] = le.fit_transform(column.fillna(MISSING_LABEL))
    label_encoders[feature] = le

In [30]:
# Re-check the summary now that the categorical columns hold integer codes.
train_df.describe()

Unnamed: 0,Category,Style,Brand,Size,Fit Type,Material,Gender,Condition,Wear and Tear,Age,MRP,Second Hand Price
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,0.9964,1.5112,2.0366,2.004,1.0094,1.5116,0.9888,0.9904,1.5262,1.0112,273.252198,142.530886
std,0.815917,1.122197,1.407003,1.420418,0.816483,1.126286,0.825597,0.822218,1.110838,0.818785,130.173195,70.01152
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.04,16.13
25%,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,159.4125,83.5425
50%,1.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,272.065,140.955
75%,2.0,3.0,3.0,3.0,2.0,3.0,2.0,2.0,3.0,2.0,386.2425,198.385
max,2.0,3.0,4.0,4.0,2.0,3.0,2.0,2.0,3.0,2.0,499.98,351.9


In [31]:
# Inspect the first rows after label encoding.
train_df.head()

Unnamed: 0,Category,Style,Brand,Size,Fit Type,Material,Gender,Condition,Wear and Tear,Age,MRP,Second Hand Price
0,0,2,2,3,0,2,2,2,0,2,292.76,174.5
1,2,1,3,3,1,3,0,1,0,1,194.99,90.66
2,0,0,2,3,1,3,2,1,0,2,328.73,154.59
3,0,2,3,2,0,0,1,0,0,0,164.19,86.0
4,2,2,0,0,2,2,0,1,2,2,362.83,210.91


In [32]:
# Verify the encoded columns are now integer-typed alongside the float price columns.
train_df.dtypes

Category               int64
Style                  int64
Brand                  int64
Size                   int64
Fit Type               int64
Material               int64
Gender                 int64
Condition              int64
Wear and Tear          int64
Age                    int64
MRP                  float64
Second Hand Price    float64
dtype: object

In [33]:
# Target: the resale price. Predictors: every remaining column.
y = train_df['Second Hand Price']
X = train_df.drop(columns=['Second Hand Price'])

# Standardise all predictor columns; X becomes a plain array from here on.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split that follows.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [34]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [35]:
# Fit a gradient-boosted regressor using the library's default hyper-parameters.
model = XGBRegressor().fit(X_train, y_train)

# Predict prices for the held-out rows.
y_pred = model.predict(X_test)

In [36]:
# Goodness of fit on the held-out rows.
r2 = r2_score(y_test, y_pred)

# Error expressed in the target's own units: RMSE is the square root of the MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

In [37]:
# Display the held-out RMSE and R^2 together.
rmse, r2

(7.76696656733423, 0.9882036688209116)

In [38]:
# Save the model and the fitted preprocessing objects for future use.
# New rows must be label-encoded and scaled exactly as the training data was
# before the model can score them, so the encoders and scaler are persisted too.
joblib.dump(label_encoders, 'label_encoders.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')
# Kept as the last expression so the cell still displays the model's output path.
joblib.dump(model, 'price_prediction_model.pkl')

['price_prediction_model.pkl']