In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

Loading dataset

In [2]:
df=pd.read_csv('new_data.csv')
df

Unnamed: 0,gender,dress_type,age_range,ad_category
0,male,modern,(15-20),casual wear
1,female,modern,(38-43),luxury
2,male,modern,(50+),electronics
3,male,casual,(44-50),sportswear
4,male,formal,(33-37),formal wear
...,...,...,...,...
522,female,casual,(21-24),casual wear
523,male,modern,(33-37),electronics
524,female,casual,(15-20),casual wear
525,male,sportswear,(38-43),sportswear


In [3]:
for i in ["gender","dress_type","age_range","ad_category"]:
    print(df[i].value_counts())

gender
female    270
male      257
Name: count, dtype: int64
dress_type
casual        153
formal        133
modern        127
sportswear    114
Name: count, dtype: int64
age_range
(44-50)    87
(50+)      79
(38-43)    77
(15-20)    75
(25-32)    73
(21-24)    71
(33-37)    65
Name: count, dtype: int64
ad_category
casual wear           243
sportswear            126
formal wear            43
electronics            39
luxury                 38
travel and leisure     38
Name: count, dtype: int64


Checking for missing values

In [4]:
df.isna().sum()

gender         0
dress_type     0
age_range      0
ad_category    0
dtype: int64

Checking the data type of each feature

In [5]:
df.dtypes

gender         object
dress_type     object
age_range      object
ad_category    object
dtype: object

Converting object type features into numeric values

In [6]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for i in ["gender","dress_type","age_range","ad_category"]:
    df[i]=le.fit_transform(df[i])

In [7]:
for i in ["gender","dress_type","age_range","ad_category"]:
    print(df[i].value_counts())

gender
0    270
1    257
Name: count, dtype: int64
dress_type
0    153
1    133
2    127
3    114
Name: count, dtype: int64
age_range
5    87
6    79
4    77
0    75
2    73
1    71
3    65
Name: count, dtype: int64
ad_category
0    243
4    126
2     43
1     39
3     38
5     38
Name: count, dtype: int64


In [8]:
df.corr()

Unnamed: 0,gender,dress_type,age_range,ad_category
gender,1.0,0.04242,0.025465,0.350936
dress_type,0.04242,1.0,0.050768,-0.011381
age_range,0.025465,0.050768,1.0,0.29703
ad_category,0.350936,-0.011381,0.29703,1.0


Spliting features from labels

In [9]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

Scaling features

In [10]:
scaler=MinMaxScaler()
X_scaled=scaler.fit_transform(X)


Splitting data for training and testing

Building model

Evaluating model performance

In [11]:

rf=RandomForestClassifier(random_state=1)

rf.fit(X_scaled,y)
y_pred=rf.predict(X_scaled)
print(rf)
print(classification_report(y,y_pred))


RandomForestClassifier(random_state=1)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       243
           1       1.00      1.00      1.00        39
           2       1.00      1.00      1.00        43
           3       1.00      1.00      1.00        38
           4       1.00      1.00      1.00       126
           5       1.00      1.00      1.00        38

    accuracy                           1.00       527
   macro avg       1.00      1.00      1.00       527
weighted avg       1.00      1.00      1.00       527



In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy with Random Forest: {accuracy:.2f}")


Validation Accuracy with Random Forest: 1.00


In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=3,
                           scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best accuracy: {grid_search.best_score_:.2f}")

Best parameters: {'max_depth': 10, 'n_estimators': 50}
Best accuracy: 1.00


In [14]:
grid_search.predict(scaler.transform([[0,2,4]]))



array([3])

In [15]:
pred=grid_search.predict(scaler.transform([[0,0,2]]))
pred[0]



0

In [16]:
import joblib

# Save the model
joblib.dump(grid_search, 'random_forest_model.pkl')

# Save the scaler
joblib.dump(scaler, 'rf_scaler.pkl')

['rf_scaler.pkl']