In [None]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Flatten
from tensorflow.keras.applications import EfficientNetB0
from sklearn.preprocessing import LabelEncoder


In [None]:
# Write the in-memory frame `df` to categories.csv, omitting the index column.
# NOTE(review): `df` is not defined in any cell visible here; presumably it was
# loaded from a Parquet file in a cell that is not shown — confirm, otherwise
# this cell raises NameError on a fresh kernel.
df.to_csv('categories.csv', index=False)

print("Parquet file has been converted to CSV.")

In [None]:
# Ordered attribute names per product category. The attribute at 0-based
# position k of a category's list is read from column `attr_{k+1}` for the rows
# of that category, so the order inside each list is significant.
category_attributes = {
    "Men Tshirts": ["color", "neck", "pattern", "print_or_pattern_type", "sleeve_length"],
    "Sarees": ["blouse_pattern", "border", "border_width", "color", "occasion", "ornamentation", "pallu_details", "pattern", "print_or_pattern_type", "transparency"],
    "Kurtis": ["color", "fit_shape", "length", "occasion", "ornamentation", "pattern", "print_or_pattern_type", "sleeve_length", "sleeve_styling"],
    "Women Tshirts": ["color", "fit_shape", "length", "pattern", "print_or_pattern_type", "sleeve_length", "sleeve_styling", "surface_styling"],
    "Women Tops & Tunics": ["color", "fit_shape", "length", "neck_collar", "occasion", "pattern", "print_or_pattern_type", "sleeve_length", "sleeve_styling", "surface_styling"]
}

train_df = pd.read_csv('train.meesho.csv')
# Strip stray whitespace from the header names so column lookups are exact.
train_df.columns = train_df.columns.str.strip()

# One (initially empty) list of observed values per distinct attribute name.
# The keys are derived from category_attributes instead of being listed a second
# time by hand, so the two can never drift apart. Dict insertion order is the
# order of first appearance across the categories above, which is the same
# order the hand-written list used; the key order matters because the model
# heads are created by iterating this dict.
attribute_classes = {
    attribute: []
    for attributes in category_attributes.values()
    for attribute in attributes
}


# Collect, for every attribute, the distinct non-missing values observed in the
# training rows, in order of first appearance. This is done column-wise rather
# than with a Python loop over every row.
category_of_row = train_df['Category'].to_numpy()

for attribute in attribute_classes:
    # Row-aligned values of this attribute: each category stores it in a
    # different attr_N column, so stitch the relevant column slices together.
    values_by_row = np.full(len(train_df), None, dtype=object)

    for category, attributes in category_attributes.items():
        if attribute not in attributes:
            continue
        attr_column = f'attr_{attributes.index(attribute) + 1}'
        if attr_column not in train_df.columns:
            continue
        in_category = category_of_row == category
        values_by_row[in_category] = train_df[attr_column].to_numpy()[in_category]

    # dropna() removes both the None placeholders and genuine NaN cells;
    # unique() keeps first-appearance order.
    observed = pd.Series(values_by_row).dropna().unique().tolist()

    # Append in place (skipping values already recorded) so re-running the cell
    # does not duplicate entries.
    known_values = attribute_classes[attribute]
    known_values.extend(value for value in observed if value not in known_values)


In [None]:
def encode_labels(df, category_attributes):
    """Integer-encode the per-category attribute columns of ``df``.

    The meaning of column ``attr_N`` depends on the row's ``Category``: it holds
    the N-th attribute of ``category_attributes[category]``. One ``LabelEncoder``
    is therefore fitted per *attribute name*, using only the cells that really
    belong to that attribute (across every category that carries it), and each
    cell is encoded with the encoder of the attribute it represents.

    Args:
        df: Frame with a ``Category`` column and ``attr_1 .. attr_N`` columns.
            It is not modified.
        category_attributes: Mapping of category name to its ordered list of
            attribute names.

    Returns:
        ``(encoded, label_encoders)`` where ``encoded`` is a copy of ``df`` whose
        attribute columns hold integer class ids, and ``label_encoders`` maps
        attribute name to its fitted encoder. Cells that are missing, or whose
        row category is not in ``category_attributes``, are encoded as ``-1``.
        Attributes with no observed value get no encoder.
    """
    encoded = df.copy()
    label_encoders = {}

    # attribute name -> [(row mask, column name), ...]: every place where this
    # attribute's values live, one entry per category that carries it.
    locations = {}
    for category, attributes in category_attributes.items():
        in_category = (df['Category'] == category).to_numpy()
        if not in_category.any():
            continue  # category absent from df: nothing to fit or encode
        for position, attr in enumerate(attributes, start=1):
            attr_column = f'attr_{position}'
            if attr_column in df.columns:
                locations.setdefault(attr, []).append((in_category, attr_column))
            else:
                print(f"Warning: {attr_column} not found in the DataFrame for {category}")

    # Reset every column that is about to be encoded so that cells which never
    # receive a class id (missing value / unknown category) read as -1 instead
    # of keeping their raw string.
    for attr_column in {column for spots in locations.values() for _, column in spots}:
        encoded[attr_column] = -1

    for attr, spots in locations.items():
        # Narrow each location to the cells that actually hold a value.
        filled = [
            (in_category & df[attr_column].notna().to_numpy(), attr_column)
            for in_category, attr_column in spots
        ]
        observed = pd.concat(
            [df.loc[has_value, attr_column].astype(str) for has_value, attr_column in filled]
        )
        if observed.empty:
            continue

        encoder = LabelEncoder()
        encoder.fit(observed)
        label_encoders[attr] = encoder

        for has_value, attr_column in filled:
            if has_value.any():
                encoded.loc[has_value, attr_column] = encoder.transform(
                    df.loc[has_value, attr_column].astype(str)
                )

    return encoded, label_encoders

# Use the function to encode labels
# `label_encoders` (attribute name -> fitted encoder) is reused later by
# format_submission to turn predicted class ids back into label strings.
train_encoded, label_encoders = encode_labels(train_df, category_attributes)

# Check the result
print(train_encoded.head())

# NOTE(review): `datagen` is not referenced by any other cell visible here —
# confirm whether it is still needed.
datagen = ImageDataGenerator(rescale=1.0/255.0)

In [None]:
# Define model
# Shared EfficientNetB0 backbone (ImageNet weights, classifier head removed)
# feeding one softmax classification head per attribute.
base_model = EfficientNetB0(input_shape=(224, 224, 3), include_top=False, weights='imagenet')
x = base_model.output
# Global average pooling already produces a flat feature vector per image, so
# no additional Flatten layer is required.
x = GlobalAveragePooling2D()(x)

# Each Dense layer has to be *called* on the pooled features to produce an
# output tensor; passing bare, unconnected layer objects to Model() is an error.
# Heads are created in attribute_classes key order, which is the order the
# label lists and the prediction decoding rely on.
outputs = [
    Dense(len(attribute_classes[attr]), activation='softmax', name=attr)(x)
    for attr in attribute_classes
]

# Create model
model = Model(inputs=base_model.input, outputs=outputs)
# A single loss/metric specification is applied to every output head.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Define generator for training data
def image_generator(df, batch_size=32):
    """Endlessly yield ``(images, labels)`` training batches for ``model.fit``.

    Args:
        df: Frame with an ``id`` column; the image for a row is
            ``<id zero-padded to 6 digits>.jpg`` inside ``train_image_dir``.
        batch_size: Maximum number of rows per batch (the last batch of each
            pass may be smaller).

    Yields:
        ``(images, labels)`` where ``images`` is an array of 224x224 images as
        produced by ``img_to_array`` (pixel values are not rescaled here) and
        ``labels`` is a list with one array per attribute, in
        ``attribute_classes`` key order.

    Raises:
        ValueError: if ``df`` is empty (the loop would otherwise spin forever
            without yielding).
    """
    if len(df) == 0:
        raise ValueError("image_generator needs a non-empty DataFrame")

    attribute_names = list(attribute_classes.keys())

    while True:
        for start in range(0, len(df), batch_size):
            end = min(start + batch_size, len(df))
            batch_df = df.iloc[start:end]
            images = []
            labels = []
            # `position` is the row's 0-based position within df.
            for position, (_, row) in enumerate(batch_df.iterrows(), start=start):
                image_id = str(row['id']).zfill(6) + '.jpg'
                img = tf.keras.preprocessing.image.load_img(os.path.join(train_image_dir, image_id), target_size=(224, 224))
                img = tf.keras.preprocessing.image.img_to_array(img)
                images.append(img)

                # Prepare the labels: one value per attribute for THIS row.
                # Previously the attribute's own position was used as the index,
                # so every image received the same labels.
                # NOTE(review): assumes train_labels[attr] is a sequence aligned
                # positionally with df — train_labels is not defined in the
                # cells visible here; confirm.
                labels_batch = [train_labels[attr][position] for attr in attribute_names]
                labels.append(labels_batch)

            # Transpose from per-row label lists to one array per attribute.
            yield np.array(images), [np.array(label) for label in zip(*labels)]

# Build the endless batch generator that model.fit consumes.
# NOTE(review): this passes the raw train_df, not train_encoded — confirm which
# frame the generator is meant to read.
train_generator = image_generator(train_df, batch_size=32)

In [None]:
# Train the model
# The generator never terminates, so steps_per_epoch defines an epoch:
# len(train_df) // 32 batches. The 32 mirrors the batch_size given to
# image_generator when train_generator was created.
model.fit(train_generator, steps_per_epoch=len(train_df) // 32, epochs=10)

# Predicting on test data
def predict_on_test(df, batch_size=32):
    """Run the model over every image referenced by ``df``.

    Args:
        df: Frame with an ``id`` column; the image for a row is
            ``<id zero-padded to 6 digits>.jpg`` inside ``test_image_dir``.
        batch_size: Number of images sent to ``model.predict`` at once.

    Returns:
        An object array of shape ``(n_images, n_heads)``. Cell ``[i, j]`` holds
        the 1-D class-probability vector of head ``j`` for image ``i``. An
        object array is built explicitly because the heads have different
        numbers of classes, which a plain ``np.array(...)`` cannot represent.
    """
    image_ids = df['id'].tolist()
    per_image = []

    for start in range(0, len(image_ids), batch_size):
        batch = []
        for raw_id in image_ids[start:start + batch_size]:
            image_id = str(raw_id).zfill(6) + '.jpg'  # Correct image filename
            img = tf.keras.preprocessing.image.load_img(os.path.join(test_image_dir, image_id), target_size=(224, 224))
            # Keep raw pixel values: the training generator feeds unscaled
            # images and Keras' EfficientNet rescales its input internally, so
            # dividing by 255 here would make test inputs differ from training.
            batch.append(tf.keras.preprocessing.image.img_to_array(img))

        # One forward pass per batch instead of one per image.
        head_outputs = model.predict(np.stack(batch), verbose=0)
        if not isinstance(head_outputs, (list, tuple)):
            head_outputs = [head_outputs]  # single-output model

        for offset in range(len(batch)):
            per_image.append([np.asarray(head[offset]) for head in head_outputs])

    n_heads = len(per_image[0]) if per_image else 0
    predictions = np.empty((len(per_image), n_heads), dtype=object)
    for row_idx, heads in enumerate(per_image):
        for head_idx, probabilities in enumerate(heads):
            predictions[row_idx, head_idx] = probabilities

    return predictions

# Per-image, per-head predictions for the test set.
# NOTE(review): `test_df` is not defined in any cell visible here — confirm it
# is loaded before this cell runs.
test_predictions = predict_on_test(test_df)

In [None]:
# Format predictions and write to submission file
def format_submission(predictions, df, output_path='/kaggle/working/submission.csv'):
    """Decode model predictions into the submission table and write it as CSV.

    For each test row only the attributes of that row's category are emitted,
    in the order given by ``category_attributes`` (so the k-th attribute lands
    in column ``attr_{k+1}``). Remaining slots up to ``attr_10`` are filled with
    ``'dummy_value'``. Previously every model head was emitted for every row,
    which produced more values than the declared columns.

    Args:
        predictions: Indexable per image; ``predictions[i][j]`` is the
            class-probability vector of head ``j`` (heads are ordered like the
            keys of ``attribute_classes``).
        df: Test frame with ``id`` and ``Category`` columns, row-aligned with
            ``predictions``.
        output_path: Destination CSV file.

    Returns:
        The submission DataFrame that was written.
    """
    n_slots = 10
    attr_columns = [f'attr_{i+1}' for i in range(n_slots)]
    # Position of each attribute's head in the model output list.
    head_index = {attr: j for j, attr in enumerate(attribute_classes.keys())}

    submission_data = []
    for i, (pred, category) in enumerate(zip(predictions, df['Category'])):
        attributes = []
        # Unknown categories have no attributes and yield only padding.
        for attr in category_attributes.get(category, []):
            class_idx = np.argmax(pred[head_index[attr]])
            attribute_value = label_encoders[attr].inverse_transform([class_idx])[0]
            attributes.append(attribute_value)

        # Pad the unused attribute slots.
        attributes.extend(['dummy_value'] * (n_slots - len(attributes)))

        # NOTE(review): 'len' is taken after padding (as before), so it is
        # always the padded length — confirm whether the expected value is the
        # number of real attributes of the category instead.
        row = [df['id'].iloc[i], category, len(attributes)] + attributes
        submission_data.append(row)

    submission_df = pd.DataFrame(submission_data, columns=['id', 'Category', 'len'] + attr_columns)
    submission_df.to_csv(output_path, index=False)
    return submission_df


format_submission(test_predictions, test_df)