In [3]:
# Step 1: Load and merge the datasets

In [3]:
import os

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

# Read every input table from its parquet file, in the order listed.
df_return_reasons, df_reviews, df_returns, df_products, df_test = map(
    pd.read_parquet,
    (
        'return_reasons.parquet',
        'reviews.parquet',
        'returns.parquet',
        'products.parquet',
        'test.parquet',
    ),
)

# Left-join reviews onto returns (matching on order item, product and customer),
# then left-join product metadata onto the result by product id.
df = df_returns.merge(
    df_reviews,
    on=['order_item_id', 'product_id', 'customer_id'],
    how='left',
)
df = df.merge(df_products, on='product_id', how='left')

# Last expression of the cell: preview the merged frame.
df.head()

Unnamed: 0,id,product_id,cause,comment,date_created_x,order_item_id,customer_id,purchase_price,review_text,shop_id,rating,date_created_y,category_id,category_title,product_description
0,69138,6a1a7601fac958ee967c73fe19315db8f6cdc3f1cd8370...,DEFECTED,брак,2023-01-02 05:37:33.846,588140,b4465ede5691891836ccc432bb8c49e1537b1d0a74f721...,106000,,,,NaT,a073f12ea698964c47150c2b8f5fa937c639bdc54e223a...,"Yogʻochni kuydirib naqsh solish, yogʻochdan qi...",
1,69148,7cab221310edf5f3c75fc38259bcb7640d080b4b05d5bb...,PHOTO_MISMATCH,думала больше,2023-01-02 05:44:34.432,773695,9bf74458174dd9c039ee6317fd48b356e8fc146f23c60b...,23000,,,,NaT,593db514530578cd1b1c5b0986d8fd36543975cfa0d038...,Кошельки,"b'{""ru"":""\\u041a\\u043e\\u0448\\u0435\\u043b\\..."
2,69154,728611508a21a9214f2c8cc076d21e30046ec5c59bf359...,DEFECTED,брак,2023-01-02 05:45:31.277,695067,0a185871d03ee346b71b657d3fbaebfc35823fec2861f7...,390000,,,,NaT,27a827d1bd879e7e131791bc9b7e8df227a9082fac9f2b...,Аксессуары для маникюра и педикюра,"b'{""ru"":""\\u041c\\u043d\\u043e\\u0433\\u043e\\..."
3,69161,f4f4031321f9b7cf1175fc6d363769297334ddd76aa2eb...,WRONG_ITEM,не тот товар,2023-01-02 05:57:35.652,635687,1123ce2b71eb64c572e6de0e14a723c17a55f67748327d...,71000,,,,NaT,e56fd4d103751c5d8ce2b26297eee56752470b543b6624...,Лонгсливы,"b'{""ru"":""\\u043b\\u043e\\u043d\\u0433\\u0441\\..."
4,69165,c8f3f349fa927aae5fc954c5268b578b9916a908bb1f8a...,DEFECTED,не включается и не работает,2023-01-02 05:59:34.447,764986,95cd02979c6f97dc58590f7c0e6c421d7c3db2b9e7212b...,84000,,,,NaT,93f606bd517c88c296b17fe207ea50ce5019e0daa47ab2...,Машинки для удаления катышек,"b'{""ru"":""\\u041c\\u0430\\u0448\\u0438\\u043d\\..."


In [6]:
# Step 2: Sentiment analysis

In [None]:
# Step 3: Text feature extraction

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import MarianMTModel, MarianTokenizer

# Heavy objects (translation model, VADER analyzer) are built once on first use
# and reused for every row, instead of being re-created on each call.
_SENTIMENT_RESOURCES = {}
_RU_EN_MODEL_NAME = 'Helsinki-NLP/opus-mt-ru-en'
_RUSSIAN_LETTERS = frozenset('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')


def _get_sentiment_analyzer():
    """Return the shared VADER analyzer, creating it on first use."""
    if 'analyzer' not in _SENTIMENT_RESOURCES:
        _SENTIMENT_RESOURCES['analyzer'] = SentimentIntensityAnalyzer()
    return _SENTIMENT_RESOURCES['analyzer']


def _get_ru_en_translator():
    """Return the shared (tokenizer, model) pair for Russian-to-English translation."""
    if 'translator' not in _SENTIMENT_RESOURCES:
        _SENTIMENT_RESOURCES['translator'] = (
            MarianTokenizer.from_pretrained(_RU_EN_MODEL_NAME),
            MarianMTModel.from_pretrained(_RU_EN_MODEL_NAME),
        )
    return _SENTIMENT_RESOURCES['translator']


def get_sentiment_score(text):
    """
    Calculate the VADER compound sentiment score for the given text.

    Text containing at least one Russian letter is first translated to English
    with the Helsinki-NLP ru->en Marian model, because VADER is applied to the
    English text.

    Parameters
    ----------
    text : Any
        The review text. Non-string values (e.g. NaN for a missing review) are
        accepted and treated as "no sentiment".

    Returns
    -------
    float or int
        The VADER ``compound`` score for strings; ``0`` for non-string input;
        ``0.0`` for empty / whitespace-only strings.
    """
    if not isinstance(text, str):
        return 0  # default for missing / non-string reviews

    # Nothing to score: skip translation and VADER entirely (VADER would
    # return a compound score of 0.0 for blank text anyway).
    if not text.strip():
        return 0.0

    if any(char in _RUSSIAN_LETTERS for char in text.lower()):
        tokenizer, model = _get_ru_en_translator()
        # Truncate so very long reviews do not exceed the 512-token limit
        # that is also used for generation below.
        input_ids = tokenizer.encode(
            text, return_tensors='pt', truncation=True, max_length=512
        )
        output_ids = model.generate(
            input_ids, max_length=512, num_beams=4, early_stopping=True
        )[0]
        en_text = tokenizer.decode(output_ids, skip_special_tokens=True)
    else:
        en_text = text

    return _get_sentiment_analyzer().polarity_scores(en_text)['compound']

# Feature engineering: derive the review's character count and its sentiment
# score from the raw review text (missing reviews yield NaN length and score 0).
reviews = df['review_text']
df['review_length'] = reviews.str.len()
df['sentiment_score'] = reviews.apply(get_sentiment_score)


In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT model and tokenizer.
# The multilingual checkpoint is used because the texts in this dataset are
# largely Cyrillic (Russian), which the English-only 'bert-base-uncased'
# vocabulary represents poorly. The hidden size is still 768.
BERT_MODEL_NAME = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
model = BertModel.from_pretrained(BERT_MODEL_NAME)

class BERTEmbeddingTransformer(BaseEstimator, TransformerMixin):
    """Turn a text column into fixed-size BERT [CLS] embeddings.

    Parameters
    ----------
    tokenizer : callable Hugging Face tokenizer
        Used to encode the texts (truncated to 512 tokens).
    model : Hugging Face BERT-style model
        Its first output (last hidden state) at position 0 is used as the
        embedding of each text.
    text_column : str, default 'review_text'
        Name of the column in ``X`` that holds the texts.
    batch_size : int, default 32
        Number of texts encoded per forward pass.
    """

    def __init__(self, tokenizer, model, text_column='review_text', batch_size=32):
        self.tokenizer = tokenizer
        self.model = model
        self.text_column = text_column
        self.batch_size = batch_size

    def fit(self, X, y=None):
        """No-op: the pre-trained model is used as is. Returns ``self``."""
        return self

    def transform(self, X):
        """Return an array of shape ``(len(X), hidden_size)`` of embeddings.

        Rows whose text is not a string (e.g. NaN for a missing review) get an
        all-zero vector. The result is always 2-D and float32, including for
        empty input, so it can be stacked with other feature matrices.
        """
        texts = list(X[self.text_column])
        hidden_size = self.model.config.hidden_size
        embeddings = np.zeros((len(texts), hidden_size), dtype=np.float32)

        # Only real strings are sent through the model; other rows keep zeros.
        text_positions = [i for i, text in enumerate(texts) if isinstance(text, str)]
        if not text_positions:
            return embeddings

        device = next(self.model.parameters()).device
        self.model.eval()  # make sure dropout is disabled for inference
        with torch.no_grad():
            for start in range(0, len(text_positions), self.batch_size):
                batch_positions = text_positions[start:start + self.batch_size]
                encoded = self.tokenizer(
                    [texts[i] for i in batch_positions],
                    add_special_tokens=True,
                    max_length=512,
                    truncation=True,
                    padding=True,  # pad to the longest text in the batch
                    return_tensors='pt',
                )
                encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
                outputs = self.model(**encoded)
                # outputs[0] is the last hidden state; position 0 is the [CLS] token.
                embeddings[batch_positions] = outputs[0][:, 0, :].cpu().numpy()
        return embeddings

# Instantiate the embedding transformer around the tokenizer/model loaded above.
bert_transformer = BERTEmbeddingTransformer(tokenizer=tokenizer, model=model)


In [None]:
# Step 4: Model selection and training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier

# Assemble the feature matrix: numeric features + BERT embeddings.
X_num = df[['review_length', 'sentiment_score', 'purchase_price', 'rating']]
X_text = df['review_text']  # kept for reference only; NOT a model input
X_bert = bert_transformer.transform(df)

# The raw text must not be stacked into X: it is a 1-D column of strings, so
# np.hstack would fail on the dimension mismatch and the classifier cannot
# consume strings anyway. The text is represented by the BERT embeddings.
# Missing numeric values stay NaN, which HistGradientBoostingClassifier
# handles natively.
X = np.hstack((X_num.to_numpy(dtype=float, na_value=np.nan), X_bert))
y = df['cause'].astype(str)  # Convert labels to strings

# Split the data into training and validation sets; stratify so every return
# reason keeps the same proportion in both splits.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train the multi-class classification model
model = HistGradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)


In [None]:
# Step 5: Evaluate the model

In [None]:
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

# Evaluate the model's performance on the validation set.
# f1_score needs predicted class labels, so use predict() rather than
# predict_proba() (a probability matrix makes f1_score raise ValueError).
y_val_pred = model.predict(X_val)
f1 = f1_score(y_val, y_val_pred, average='macro')
print(f'Validation F1-score: {f1:.4f}')

# Tune the model's hyperparameters.
# HistGradientBoostingClassifier has no 'n_estimators'; the number of boosting
# iterations is controlled by 'max_iter'.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'max_iter': [100, 200, 300]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the tuned model (again on predicted labels)
y_val_pred = grid_search.predict(X_val)
f1 = f1_score(y_val, y_val_pred, average='macro')
print(f'Tuned model validation F1-score: {f1:.4f}')

# Use the tuned model for further steps
model = grid_search.best_estimator_


In [None]:
# Step 6: Predict on the test set

In [None]:
# Make predictions on the test set.
# The engineered features were only computed for the training frame, so build
# them for the test frame too (on a copy, leaving df_test itself untouched).
# NOTE(review): assumes df_test already carries 'review_text',
# 'purchase_price' and 'rating' -- confirm against test.parquet.
test_features = df_test.copy()
if 'review_length' not in test_features.columns:
    test_features['review_length'] = test_features['review_text'].str.len()
if 'sentiment_score' not in test_features.columns:
    test_features['sentiment_score'] = test_features['review_text'].apply(get_sentiment_score)

X_test_num = test_features[['review_length', 'sentiment_score', 'purchase_price', 'rating']]
X_test_text = df_test['review_text']  # kept for reference only; NOT a model input
X_test_bert = bert_transformer.transform(df_test)

# Same layout as the training matrix: numeric features followed by the BERT
# embeddings. The raw text column is not stacked in (strings, wrong shape).
X_test = np.hstack((X_test_num.to_numpy(dtype=float, na_value=np.nan), X_test_bert))
y_test_pred = model.predict_proba(X_test)


In [None]:
# Step 7: Generate the output file

In [None]:
# Create the output file.
# Return reasons, in the order the probability columns are written.
RETURN_REASONS = ['DEFECTED', 'WRONG_ITEM', 'BAD_QUALITY', 'PHOTO_MISMATCH', 'WRONG_SIZE']

# predict_proba columns follow model.classes_ (labels in sorted order), which
# is NOT the order above -- look each reason up by label instead of assuming a
# fixed column position.
class_position = {label: position for position, label in enumerate(model.classes_)}

output_df = pd.DataFrame({
    'product_id': df_test['product_id'],
    'order_item_id': df_test['order_item_id'],
})
for reason in RETURN_REASONS:
    column = f'prob_return_reason_{reason}'
    if reason in class_position:
        output_df[column] = y_test_pred[:, class_position[reason]]
    else:
        # The model never saw this label during training, so it assigns it
        # no probability mass.
        output_df[column] = 0.0

# Save the output file in parquet format
output_df.to_parquet('result.parquet', index=False)
