# Clean Data

In [None]:
import pandas as pd
import numpy as np
import csv

In [None]:
# Load the raw training and test splits from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# clean function
# Imports needed by the text-cleaning helper.
import re

import nltk
import nltk.corpus
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the NLTK resources, in the same order as before.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

def clean_text(df, text_field):
    """Normalize the text in ``df[text_field]`` in place and return ``df``.

    Steps, in order: lower-case; remove @mentions, URLs, a leading ``rt``,
    ``http`` fragments and every character that is not an ASCII letter,
    digit, space or tab; remove digits; drop English stop words; tokenize;
    lemmatize each token; re-join with single spaces; apply the noise pattern
    one more time.

    Missing values are treated as empty strings, so they come out as ``""``
    instead of the literal token ``"nan"``, and a column that contains no
    strings at all no longer makes the ``.str`` accessor raise.

    Args:
        df: DataFrame holding the column to clean. It is modified in place.
        text_field: Name of the column to clean.

    Returns:
        The same ``df`` object, with ``text_field`` replaced by cleaned text.
    """
    # Compile once; the same noise pattern is applied before and after
    # lemmatization.
    noise_pattern = re.compile(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
    )
    digit_pattern = re.compile(r"\d+")
    # A set gives constant-time membership tests for every word of every row.
    stop = set(stopwords.words('english'))
    # One lemmatizer for the whole column instead of one per token.
    lemmatizer = WordNetLemmatizer()

    def _clean_one(raw):
        """Clean a single, already lower-cased string."""
        text = noise_pattern.sub("", raw)
        text = digit_pattern.sub("", text)
        kept = ' '.join(word for word in text.split() if word not in stop)
        lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(kept)]
        return noise_pattern.sub("", ' '.join(lemmas))

    # fillna/astype first so NaN and non-string cells never reach str() as "nan".
    lowered = df[text_field].fillna('').astype(str).str.lower()
    df[text_field] = lowered.apply(_clean_one)
    return df

In [None]:
# Columns that are cleaned and then merged into the combined `text` feature.
text_columns = ['TITLE', 'DESCRIPTION', 'BULLET_POINTS', 'BRAND']

for column in text_columns:
    train = clean_text(train, column)

for column in text_columns:
    test = clean_text(test, column)

# Space-separated concatenation of the cleaned columns, in the order listed.
for frame in (train, test):
    combined = frame[text_columns[0]]
    for column in text_columns[1:]:
        combined = combined + " " + frame[column]
    frame['text'] = combined

# SVM (linear, trained with SGD)

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Re-read both splits from disk for this section.
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

# Settings for the SGD-trained classifier (hinge loss, L2 penalty).
classifier_options = dict(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)

# Token counts -> TF-IDF weighting -> classifier.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(**classifier_options)),
]
sgd = Pipeline(pipeline_steps)

In [None]:
# Replace every missing value in both splits with a single space, in place.
for frame in (train, test):
    frame.fillna(' ', inplace=True)

In [None]:
# NOTE: the submission frame is built further down, after `sgd` has been
# fitted and `y_pred` has been computed. The code that used to live in this
# cell was an early duplicate of that later cell: it read `y_pred` before it
# existed and contained a syntax error (`range(1:...)`), so it could never run
# on a fresh kernel.

In [None]:
# Fit the pipeline on the titles, with the browse-node ids as targets.
train_titles, train_nodes = train['TITLE'], train['BROWSE_NODE_ID']
sgd.fit(train_titles, train_nodes)

In [None]:
# Predict a browse node for every test title.
test_titles = test['TITLE']
y_pred = sgd.predict(test_titles)

In [None]:
# Two-column submission: a 1-based running product id and the predicted node.
submission = pd.DataFrame({
    'PRODUCT_ID': list(range(1, len(test) + 1)),
    'BROWSE_NODE_ID': y_pred,
})

In [None]:
# Write the submission without the index column.
submission.to_csv(path_or_buf='tempsvc.csv', index=False)

# FastText Library

In [None]:
# Clone the fastText repository and install it from the cloned sources.
!git clone https://github.com/facebookresearch/fastText.git
%cd fastText
# NOTE(review): two install commands run back to back (`pip install .` and
# `setup.py install`); one of them may be redundant — confirm before removing.
# NOTE(review): `sudo` presumably targets a system-wide Python; confirm that
# this is the interpreter the notebook kernel uses.
!sudo pip install .
!sudo python setup.py install
# Step back out of the cloned repository directory.
%cd ../

In [None]:
# Column names shared by the fastText training and inference cells.
text = 'text2'
label = 'BROWSE_NODE_ID'

# Model input: title followed by description, separated by one space.
train['text2'] = train['TITLE'] + " " + train['DESCRIPTION']
train = train[[text, label]].copy()

# Assign the filled column back. The previous chained form,
# `train[text].fillna('#', inplace=True)`, operates on a temporary Series: it
# is deprecated in recent pandas and does not modify `train` at all when
# Copy-on-Write is enabled.
train[text] = train[text].fillna('#')

In [None]:
from tqdm.notebook import tqdm

# Write one "__label__<node id> <text>" line per training row.
# - `with` closes the file even if a row fails part-way through.
# - UTF-8 is set explicitly instead of relying on the platform default.
# - Line breaks inside the text are replaced by spaces; otherwise a single
#   row would spill over several lines and the continuation lines would
#   carry no label.
with open('train.txt', 'w', encoding='utf-8') as f:
    for node_id, row_text in tqdm(zip(train[label], train[text]), total=len(train)):
        one_line = str(row_text).replace('\r', ' ').replace('\n', ' ')
        f.write('__label__' + str(node_id) + ' ' + one_line + '\n')

In [None]:
import fasttext

# Hyper-parameters passed to the supervised fastText trainer.
fasttext_options = dict(lr=0.5, epoch=2, wordNgrams=2, dim=50, loss='hs')
model = fasttext.train_supervised(input="train.txt", **fasttext_options)

In [None]:
# Same input construction as for training: title + " " + description.
test['text2'] = test['TITLE'] + " " + test['DESCRIPTION']

# Keep PRODUCT_ID (when the file has it) next to the text column: the cell
# that builds the submission reads `test['PRODUCT_ID']`, which used to be
# dropped here.
kept_columns = [column for column in ('PRODUCT_ID', text) if column in test.columns]
test = test[kept_columns].copy()

# Assign back instead of the chained `test[text].fillna(..., inplace=True)`,
# which acts on a temporary Series and is a no-op under Copy-on-Write.
test[text] = test[text].fillna('#')

In [None]:
from tqdm.notebook import tqdm

# Labels come back as "__label__<node id>"; strip the prefix to get the id.
label_prefix = '__label__'

predicted_nodes = []
for row_text in tqdm(test[text], total=len(test)):
    # fastText predicts one line at a time and rejects text containing '\n'.
    one_line = str(row_text).replace('\r', ' ').replace('\n', ' ')
    prediction = model.predict(one_line)
    predicted_nodes.append(int(prediction[0][0][len(label_prefix):]))

# Use the real product ids when the column is still available; otherwise fall
# back to the 1-based running id used by the other sections of this notebook.
if 'PRODUCT_ID' in test.columns:
    product_ids = test['PRODUCT_ID'].to_numpy()
else:
    product_ids = list(range(1, len(test) + 1))

# Build the frame in one go. The previous per-row
# `submission.iloc[index]['BROWSE_NODE_ID'] = ...` assigned into a temporary
# row object, so every prediction in the saved file stayed at 0.
submission = pd.DataFrame({
    'PRODUCT_ID': product_ids,
    'BROWSE_NODE_ID': predicted_nodes,
})

In [None]:
# Write the fastText submission without the index column.
submission.to_csv(path_or_buf='fastTextSubmission.csv', index=False)

# BERT (Transformers Library)


In [None]:
import torch
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification

In [None]:
# Read each file once (the reads used to be duplicated).
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Only the title and the target are used in this section.
df = train[['TITLE', 'BROWSE_NODE_ID']].copy()
del train

# Map every browse-node id to a contiguous class index. The following cells
# read `df.label` and `label_dict`, which were never created before.
label_dict = {
    node_id: class_index
    for class_index, node_id in enumerate(sorted(df['BROWSE_NODE_ID'].unique()))
}
df['label'] = df['BROWSE_NODE_ID'].map(label_dict)

In [None]:
from sklearn.model_selection import train_test_split

# Split the row index (not the rows themselves) into 85% train / 15% validation.
row_ids = df.index.values
row_labels = df.label.values
X_train, X_val, y_train, y_val = train_test_split(
    row_ids, row_labels, test_size=0.15, random_state=42
)

In [None]:
# Tag every row with the split it belongs to; rows in neither split keep 'not_set'.
df['data_type'] = 'not_set'
for split_name, split_rows in (('train', X_train), ('val', X_val)):
    df.loc[split_rows, 'data_type'] = split_name

In [None]:
# Load the pretrained tokenizer, with lower-casing enabled.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name, do_lower_case=True)

In [None]:
def _encode_titles(split_name):
    """Tokenize the titles of one split and collect its labels.

    Args:
        split_name: Value of ``df.data_type`` selecting the rows ('train' or 'val').

    Returns:
        A ``(encoded, labels)`` pair: the tokenizer output for the titles
        (PyTorch tensors, padded/truncated to 256 tokens) and a tensor built
        from the ``label`` column of the same rows.
    """
    rows = df[df.data_type == split_name]
    encoded = tokenizer.batch_encode_plus(
        # The column is named 'TITLE' (not 'Title'); missing titles become "".
        rows['TITLE'].fillna('').astype(str).tolist(),
        add_special_tokens=True,
        return_attention_mask=True,
        # Replaces the deprecated `pad_to_max_length=True`; truncation is
        # requested explicitly so every row has exactly `max_length` tokens.
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors='pt',
    )
    return encoded, torch.tensor(rows.label.values)


encoded_data_train, labels_train = _encode_titles('train')
encoded_data_val, labels_val = _encode_titles('val')

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

In [None]:
# Bundle ids, attention masks and labels of each split into a TensorDataset.
dataset_train, dataset_val = (
    TensorDataset(split_ids, split_masks, split_labels)
    for split_ids, split_masks, split_labels in (
        (input_ids_train, attention_masks_train, labels_train),
        (input_ids_val, attention_masks_val, labels_val),
    )
)

In [None]:
# One output label per entry of `label_dict`; attentions and hidden states are not returned.
bert_options = dict(
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", **bert_options)

# BERT (Tez Library)

In [None]:
%%capture
# `%pip` installs into the environment of the running kernel, whereas `!pip`
# runs whichever `pip` is first on the shell's PATH.
%pip install tez
%pip install transformers

In [None]:
import torch.nn as nn
import transformers
import torch
import tez
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Taken from https://github.com/abhishekkrthakur/tez/blob/main/examples/text_classification/binary.py

class BERTDataset:
    """Index-addressable dataset that tokenizes one text per item.

    Args:
        text: Indexable collection of texts; each item is passed through ``str()``.
        target: Indexable collection of integer targets, aligned with ``text``.
        max_len: Length every encoded item is padded/truncated to. Defaults
            to 64, the value that used to be hard-coded.
        tokenizer: Optional tokenizer exposing ``encode_plus``. When omitted,
            the pretrained ``bert-base-uncased`` tokenizer is loaded, as
            before. Passing one lets several datasets share a single instance.
    """

    def __init__(self, text, target, max_len=64, tokenizer=None):
        self.text = text
        self.target = target
        if tokenizer is None:
            tokenizer = transformers.BertTokenizer.from_pretrained(
                "bert-base-uncased", do_lower_case=True
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.text)

    def __getitem__(self, item):
        """Encode the text at position ``item``.

        Returns:
            A dict with ``ids``, ``mask``, ``token_type_ids`` and ``targets``,
            each a ``torch.long`` tensor.
        """
        text = str(self.text[item])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,  # no second sequence
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.long),
        }

class BERTBaseUncased(tez.Model):
    """Pretrained ``bert-base-uncased`` encoder followed by dropout and a linear layer.

    Args:
        num_train_steps: Total number of training steps given to the LR scheduler.
        num_classes: Size of the output of the final linear layer.
    """

    def __init__(self, num_train_steps, num_classes):
        super().__init__()
        pretrained_name = "bert-base-uncased"
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            pretrained_name, do_lower_case=True
        )
        self.bert = transformers.BertModel.from_pretrained(
            pretrained_name, return_dict=False
        )
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, num_classes)

        self.num_train_steps = num_train_steps
        self.step_scheduler_after = "batch"

    def fetch_optimizer(self):
        """Build AdamW with two parameter groups: weight decay 0.001 and weight decay 0."""
        no_decay = ("bias", "LayerNorm.bias")
        decayed, undecayed = [], []
        for name, param in self.named_parameters():
            # Parameters whose name contains any `no_decay` marker get no weight decay.
            if any(marker in name for marker in no_decay):
                undecayed.append(param)
            else:
                decayed.append(param)
        return AdamW(
            [
                {"params": decayed, "weight_decay": 0.001},
                {"params": undecayed, "weight_decay": 0.0},
            ],
            lr=3e-5,
        )

    def fetch_scheduler(self):
        """Return a linear schedule without warm-up over ``num_train_steps`` steps."""
        return get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=0, num_training_steps=self.num_train_steps
        )

    def loss(self, outputs, targets):
        """Return the cross-entropy loss, or ``None`` when no targets are given."""
        if targets is not None:
            return nn.CrossEntropyLoss()(outputs, targets)
        return None

    def monitor_metrics(self, outputs, targets):
        """Return ``{"accuracy": ...}`` for the batch, or ``{}`` without targets."""
        if targets is None:
            return {}
        predicted = torch.argmax(outputs, dim=1).cpu().detach().numpy()
        expected = targets.cpu().detach().numpy()
        return {"accuracy": metrics.accuracy_score(expected, predicted)}

    def forward(self, ids, mask, token_type_ids, targets=None):
        """Run the encoder and the head; return ``(output, loss, metrics)``."""
        # Only the second element of the tuple returned by the encoder is used.
        _, encoder_second = self.bert(
            ids, attention_mask=mask, token_type_ids=token_type_ids
        )
        output = self.out(self.bert_drop(encoder_second))
        batch_loss = self.loss(output, targets)
        batch_metrics = self.monitor_metrics(output, targets)
        return output, batch_loss, batch_metrics

In [None]:
import pandas as pd
import csv
import numpy as np

In [None]:
# Training data: title and target only; missing values become empty strings.
train = pd.read_csv('train.csv')[['TITLE', 'BROWSE_NODE_ID']].fillna('')
# Test data: all columns; missing values become empty strings.
test = pd.read_csv('test.csv').fillna('')

In [None]:
# Replace the browse-node ids by the encoder's integer class codes.
encoder = LabelEncoder()
encoder.fit(train['BROWSE_NODE_ID'])
train['BROWSE_NODE_ID'] = encoder.transform(train['BROWSE_NODE_ID'])

In [None]:
# Hold out 10% of the rows for validation; `train` is rebound to the remaining 90%.
split_options = {'test_size': 0.1, 'random_state': 23}
train, val = train_test_split(train, **split_options)

In [None]:
# Wrap the titles and encoded targets of each split in a BERTDataset.
train_dataset, valid_dataset = (
    BERTDataset(split.TITLE.values, split.BROWSE_NODE_ID.values)
    for split in (train, val)
)

In [None]:
# Batch size, also reused by the training and prediction cells.
batch = 32
# Scheduler length: rows per batch, times 2.
n_train_steps = int(len(train) / batch * 2)
# One output class per label known to the encoder.
num_classes = len(encoder.classes_)
model = BERTBaseUncased(num_train_steps=n_train_steps, num_classes=num_classes)

In [None]:
%%capture
# Training settings handed to `model.fit`.
fit_options = dict(
    valid_dataset=valid_dataset,
    train_bs=batch,
    device="cuda",
    epochs=2,
    n_jobs=2,
    fp16=True,
)
model.fit(train_dataset, **fit_options)

### Predictions

In [None]:
%%capture
# The test set has no labels, so a dummy target of 0 is supplied for every row.
test_dataset = BERTDataset(
        test['TITLE'].values,
        [0]*len(test)
    )
predictionGenerator = model.predict(test_dataset, batch_size=batch, n_jobs=-1)
predictions = []
for probs in predictionGenerator:
    # Keep the index of the highest-scoring class for every row of the batch.
    predictions.extend(np.argmax(probs, axis=1))
# Map the class indices back to the original browse-node ids.
predictions = encoder.inverse_transform(predictions)
submission = pd.DataFrame()
submission['PRODUCT_ID'] = [i for i in range(1,len(test)+1)]
submission['BROWSE_NODE_ID'] = predictions
# Save the frame built above; `save_file` was never defined and raised a NameError.
submission.to_csv('bert.csv',index=False)