In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from googletrans import Translator
import pandas as pd
import matplotlib.pyplot as plt
from transformers import pipeline

In [13]:
# Load the provided Excel file to examine its structure and contents
file_path = 'test_task.xlsx'
data = pd.read_excel(file_path)

# Display the first few rows of the dataset to understand its structure
data.head()

Unnamed: 0,GOOD_NAME,UNIT_PRICE_AVR,ADG_CODE_MODE,Labels
0,"Գարեջուր Flensburger ,0,5 լ",1510.0,2203.0,beer
1,Բրինձ Բասմատ Ինդիա 1կգ,1331.935484,1006.0,rice
2,ԼՈԼԻԿ ԿՈՂԵՎԱՀԱՆԱԾ 1ԿԳ,800.0,2002.0,tomatoes
3,Ժելե Ֆինի բանան 100գր,600.0,1704.0,dessert
4,Դոնդող «Քրեյզի Ֆան» բուրգեր 17գ.,199.995,1704.0,dessert


# General data preprocessing

In [14]:
def translate(s):
    """
    Returns the string translated to English
    """
    translator = Translator()
    return (translator.translate(s, dest='en').text)


In [15]:
# Handle missing values.
imputer = SimpleImputer(strategy='mean')
data['UNIT_PRICE_AVR'] = imputer.fit_transform(data[['UNIT_PRICE_AVR']])

# Convert the 'ADG_CODE_MODE' column to int and drop rows with non-integer values
data['ADG_CODE_MODE'] = pd.to_numeric(data['ADG_CODE_MODE'], errors='coerce')
data_cleaned = data.dropna(subset=['ADG_CODE_MODE'])
data_cleaned['ADG_CODE_MODE'] = data_cleaned['ADG_CODE_MODE'].astype(int)

# Adds a new column with the translated product name
data_cleaned['GOOD_NAME_TRANSLATED'] = data_cleaned['GOOD_NAME'].apply(translate)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [16]:
data_cleaned

Unnamed: 0,GOOD_NAME,UNIT_PRICE_AVR,ADG_CODE_MODE,Labels,GOOD_NAME_TRANSLATED
0,"Գարեջուր Flensburger ,0,5 լ",1510.000000,2203,beer,Beer Flensburger: 0.5 l
1,Բրինձ Բասմատ Ինդիա 1կգ,1331.935484,1006,rice,Rice Basmat India 1 kg
2,ԼՈԼԻԿ ԿՈՂԵՎԱՀԱՆԱԾ 1ԿԳ,800.000000,2002,tomatoes,1 kg of tomatoes
3,Ժելե Ֆինի բանան 100գր,600.000000,1704,dessert,Jelly Fin Banana 100 g
4,Դոնդող «Քրեյզի Ֆան» բուրգեր 17գ.,199.995000,1704,dessert,Jelly Fan Pyramids 17 G.
...,...,...,...,...,...
495,0406 - Կաթնաշոռային մթերք «Արարատյան» 200գ,450.000000,406,dairy,"0406 - Cottage cheese products ""Araratyan"" 200 g"
496,Գարեջուր Կլինսկոե 0.5լ Ա/Տ,450.000000,2203,beer,Beer Klostko 0.5 l / t
497,Լոլիկ վարդագույն կգ,1968.526490,702,tomatoes,Tomato pink kg
498,Հավի թևի բդիկ տապակած,380.000000,56,meat,Chicken arm bitter fried


# Model creation and Inference

In [17]:
# Load a pre-trained text classification model
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')

# Define your candidate labels (excluding 'else')
candidate_labels = ['beer', 'cheese', 'rice', 'meat', 'dairy', 'dessert', 'tomatoes']

# Set a confidence threshold
confidence_threshold = 0.3

# Apply the classifier with the threshold logic
def classify_with_threshold(text):
    for word in text.lower().split():
        if word in candidate_labels:
            return word
    result = classifier(text, candidate_labels)
    top_label = result['labels'][0]
    top_score = result['scores'][0]
    if top_score < confidence_threshold:
        return 'else'
    else:
        return top_label

# Apply the function to each row in 'GOOD_NAME_TRANSLATED'
data_cleaned['Predicted_Labels'] = data_cleaned['GOOD_NAME_TRANSLATED'].apply(classify_with_threshold)

# Save the results
data_cleaned.to_csv('predicted_labels.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Calculate Metrics

In [20]:
accuracy = accuracy_score(data_cleaned['Labels'], data_cleaned['Predicted_Labels'])
conf_matrix = confusion_matrix(data_cleaned['Labels'], data_cleaned['Predicted_Labels'])
class_report = classification_report(data_cleaned['Labels'], data_cleaned['Predicted_Labels'])

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.7469879518072289
Confusion Matrix:
[[64  0  0  0  1  0  0  0]
 [ 0 78  0  0  0  1  0  0]
 [ 3 11 23  0 13  2  0  0]
 [ 1  5 19 50 18  3  0  0]
 [ 8  1  8  3 28  5  0  3]
 [ 1  0  3  0  4  5  1  3]
 [ 0  0  0  0  1  0 75  0]
 [ 0  0  1  5  2  0  0 49]]
Classification Report:
              precision    recall  f1-score   support

        beer       0.83      0.98      0.90        65
      cheese       0.82      0.99      0.90        79
       dairy       0.43      0.44      0.43        52
     dessert       0.86      0.52      0.65        96
        else       0.42      0.50      0.46        56
        meat       0.31      0.29      0.30        17
        rice       0.99      0.99      0.99        76
    tomatoes       0.89      0.86      0.88        57

    accuracy                           0.75       498
   macro avg       0.69      0.70      0.69       498
weighted avg       0.76      0.75      0.74       498

