In [3]:
pip install kaggle

Collecting kaggle
  Downloading kaggle-1.6.17.tar.gz (82 kB)
     ---------------------------------------- 0.0/82.7 kB ? eta -:--:--
     ---- ----------------------------------- 10.2/82.7 kB ? eta -:--:--
     -------------- ----------------------- 30.7/82.7 kB 325.1 kB/s eta 0:00:01
     ---------------------------- --------- 61.4/82.7 kB 465.5 kB/s eta 0:00:01
     -------------------------------------- 82.7/82.7 kB 513.8 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py): started
  Building wheel for kaggle (setup.py): finished with status 'done'
  Created wheel for kaggle: filename=kaggle-1.6.17-py3-none-any.whl size=105795 sha256=acba921b273a531760d659b0459ff26159dae3f580e0245b5d8c66faaa09bd51
  Stored in directory: c:\users\win\appdata\local\pip\cache\wheels\46\d2\26\84d0a1acdb9c6baccf7d28cf06962ec80529fe1ad938489983
Successful

In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
DATASET_PATH = r"D:\artificial\styles_synthetic.csv"
df = pd.read_csv(DATASET_PATH)

# Display dataset sample
print("\n Dataset Sample:")
print(df.head())

# Handle missing values
df.dropna(inplace=True)

# Encode categorical columns
label_encoders = {}  # Store encoders for inverse transformation if needed
categorical_cols = ['gender', 'subCategory', 'articleType', 'baseColour', 'season', 'usage']

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  # Convert text to numbers
    label_encoders[col] = le  # Store encoder for future decoding

# Encode target variable
df['masterCategory_encoded'] = LabelEncoder().fit_transform(df['masterCategory'])

# Define features and target variable
X = df.drop(columns=['id', 'masterCategory', 'productDisplayName'])
y = df['masterCategory_encoded']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model with controlled accuracy
model = RandomForestClassifier(
    n_estimators=5,        # Fewer trees
    max_depth=1,           # Shallow trees to prevent overfitting
    min_samples_split=50,  # Require more samples to split
    min_samples_leaf=20,   # Increase minimum samples per leaf
    max_features="log2",   # Randomly limit the number of features per tree
    bootstrap=True,        # Enable bootstrapping
    max_samples=0.5,       # Use 50% of dataset per tree
    class_weight="balanced",  # Prevent dominant classes from overfitting
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"\n Model Accuracy: {accuracy:.2f}")



 Dataset Sample:
   id  gender masterCategory subCategory        articleType baseColour  \
0   1   Women        Apparel     Topwear            Jackets       Blue   
1   2     Men       Footwear     Sandals    Leather Sandals       Grey   
2   3   Women       Footwear  Flip Flops  Casual Flip Flops       Grey   
3   4  Unisex       Footwear       Shoes              Boots     Purple   
4   5  Unisex       Footwear       Shoes              Boots      White   

   season  year   usage                productDisplayName  
0  Winter  2023   Party              Zara Jackets in Blue  
1  Winter  2011  Formal      Puma Leather Sandals in Grey  
2  Winter  2012  Formal  Adidas Casual Flip Flops in Grey  
3  Winter  2010  Sports              Puma Boots in Purple  
4    Fall  2013  Sports               Levi Boots in White  

 Model Accuracy: 0.85
