<a href="https://colab.research.google.com/github/astromad/MyDeepLearningRepo/blob/master/ProductClassification_Hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!rm -rf Classification_cache
!rm -rf results_PT
!rm -rf logs_PT


In [5]:
# Install the `transformers` package (provides the config, tokenizer, model and Trainer used below).
!pip install transformers



In [6]:
# Read the dataset and load it into a pandas DataFrame.
import pandas as pd

try:
    # pandas >= 1.3: malformed rows are reported and skipped instead of raising.
    df = pd.read_csv("/content/drive/My Drive/ColabData/Amazon.csv",
                     encoding="ISO-8859-1", on_bad_lines="warn")
except TypeError:
    # Older pandas does not accept `on_bad_lines`; use the legacy flag there
    # (it was removed in pandas 2.0, hence the attempt above comes first).
    df = pd.read_csv("/content/drive/My Drive/ColabData/Amazon.csv",
                     encoding="ISO-8859-1", error_bad_lines=False)

# Keep only the columns used for classification. `.copy()` makes `data` an
# independent frame, so later column assignments never act on a slice of `df`.
data = df[['category', 'label_title', 'label_description']].copy()
# Rows without a category cannot be used as labelled examples.
data = data.dropna(subset=['category'])
print(data.head(3))


                category  ...                                  label_description
0  Headphone Accessories  ...  The pocket-size Koss 3-Band Equalizer delivers...
1     Inkjet Printer Ink  ...  Kodak Black Ink Cartridge 10B is a standard bl...
2  Computers Accessories  ...  1GB - 333MHz DDR333 PC2700 - DDR SDRAM - 184-p...

[3 rows x 3 columns]


In [7]:
# Remove rows whose category is null. Rebinding (instead of `inplace=True`)
# keeps the cell idempotent and avoids mutating a frame derived from `df`.
data = data.dropna(subset=['category'])

In [8]:
# Per-category row counts before filtering.
print(data.groupby('category').count() )

# Categories that occur 20 times or fewer are removed from the data.
value_counts = data['category'].value_counts()
to_remove = value_counts[value_counts <= 20].index
data = data.loc[~data['category'].isin(to_remove)]

# Per-category row counts after filtering.
print(data.groupby('category').count() )

                           label_title  label_description
category                                                 
12V                                  1                  1
6V                                   4                  4
9V                                   6                  6
A                                    2                  2
AA                                  22                 22
...                                ...                ...
Wires                                1                  1
Wiring Harnesses                    20                 20
Wrist Rests                         17                 17
eBook Readers                       12                 12
eBook Readers Accessories            6                  6

[706 rows x 2 columns]
                        label_title  label_description
category                                              
AA                               22                 22
AC Adapters                      38                 38
Ac

In [9]:
# Category name -> integer ID, with IDs handed out in order of first appearance.
encode_dict = {}

def encode_label(x):
    """Return the integer ID of category ``x``, registering it on first sight.

    An unseen category receives the next free ID (the current size of
    ``encode_dict``); a known category returns the ID it already has.
    """
    return encode_dict.setdefault(x, len(encode_dict))

# Attach the numeric ID of each row's category.
data['encoded_category'] = data['category'].map(encode_label)

In [10]:
# Build the modelling frame: title and description joined into one text column,
# next to the numeric category ID.
newData = pd.DataFrame({
    'desc': data['label_title'] + ' ' + data['label_description'],
    'encoded_category': data['encoded_category'],
})
#newData['category']=data['category']


In [11]:
# Show the first 21 rows, renumber the index from 0, then show them again.
print(newData.head(21))
newData = newData.reset_index(drop=True)
print(newData.head(21))

                                                 desc  encoded_category
0   Koss EQ50 3-Band Stereo Equalizer The pocket-s...                 0
1   Kodak Black Ink Cartridge 10B 1163641 Kodak Bl...                 1
2   Kingston 128MX64 PC2700 COMPAQ Evo D320 KTC-D3...                 2
3   Kinamax MS-UES2 Mini High Precision USB 3-Butt...                 3
4   Kensington K72349US Wireless Mouse for Netbook...                 3
5   Kensington BlackBelt Protection Band for iPad ...                 4
6   JUST5 J509 Easy to Use Unlocked Cell Phone wit...                 5
7   Imation Corp 50PK CDR 700MB 80MIN 52X-SPINDLE ...                 6
8   16x DVD-R Media Imation 16x DVD-R Media 17340 ...                 7
9   iGo Arctic Laptop Cooling Pad AC05065-0001 Eve...                 8
10  HP TouchPad Custom Fit Case Protect your HP To...                 9
11  HP LaserJet Pro P1606dn Printer CE749A BGJ WHY...                10
12  HP 85A LaserJet Black Toner Print Cartridge - ...           

In [12]:
# Drop rows whose description is null, then verify that no nulls remain.
newData = newData.dropna(subset=['desc'])
nan_rows = newData.loc[newData.isnull().any(axis=1)]
print(nan_rows)

Empty DataFrame
Columns: [desc, encoded_category]
Index: []


In [13]:
# Inspect the description at index label 20 before any text preprocessing.
newData.loc[20,'desc']

'EDGE SD Gaming Cards - Flash memory card - 1 GB - 130x - SD Edge Tech Corp 1GB Secure Digital SD Gaming Card EDGDM-222666-PE Flash Memory'

In [14]:
import nltk
# Download the NLTK stop-word corpus.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop words; they are filtered out of the descriptions in a later cell.
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
# Preprocess the description text: strip punctuation, lowercase, remove stop
# words and collapse runs of whitespace.
# Note: we do not lemmatize here; the author's intent is to leave that to BERT.

# `regex=True` is stated explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default, which would silently leave all punctuation in
# place. The raw string avoids the invalid `\w` / `\s` escape sequences.
newData['desc'] = newData['desc'].str.replace(r"[^\w\s]", "", regex=True).str.lower()

# A set gives O(1) membership checks instead of scanning the stop-word list
# once per token.
_stop_words = set(stop)
newData['desc'] = newData['desc'].apply(
    lambda text: ' '.join(token for token in text.split() if token not in _stop_words)
)

In [16]:
# Inspect the description at index label 20 again, now after preprocessing.
newData.loc[20,'desc']

'edge sd gaming cards flash memory card 1 gb 130x sd edge tech corp 1gb secure digital sd gaming card edgdm222666pe flash memory'

In [17]:
# Helper functions to convert category ID to numerical and back
from future.utils import iteritems
# label2idx pairs every key of encode_dict with its position; idx2label is the inverse map.
label2idx = dict(zip(encode_dict, range(len(encode_dict))))
idx2label = {index: label for label, index in label2idx.items()}

In [18]:
#print(newData)

In [19]:
# Largest category ID in the data. IDs start at 0, and the model config below
# is created with ClassMax + 1 labels.
ClassMax = newData.encoded_category.max()
print(ClassMax)


187


In [20]:
#data['encoded_category'].describe()

In [21]:
# Split the data: a seeded random 80% sample becomes the training set and the
# rows that were not sampled become the test set. Both are re-indexed from 0.
train_size = 0.8
train_dataset = newData.sample(frac=train_size, random_state=200)
test_dataset = newData.loc[~newData.index.isin(train_dataset.index)].reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# Report the shape of each frame.
print("FULL Dataset: {}".format(newData.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (18046, 2)
TRAIN Dataset: (14437, 2)
TEST Dataset: (3609, 2)


In [22]:
# Maximum sequence length; passed to the datasets and used as the tokenizer's max_length.
MAX_LEN = 128
# NOTE(review): in the visible cells this is referenced only by a commented-out
# `learning_rate=` argument of TrainingArguments, so it currently has no effect.
LEARNING_RATE = 3e-02

In [23]:
from transformers import (
    AutoConfig,
    AutoTokenizer,
    #BertTokenizer
)
# Settings shared by the config, tokenizer and model loaders.
model_args = {
    'model_name': 'bert-base-uncased',
    'cache_dir': "Classification_cache/",
    'do_basic_tokenize': False,
}

# Model configuration with one output label per category ID (0..ClassMax).
config = AutoConfig.from_pretrained(
    model_args['model_name'],
    cache_dir=model_args['cache_dir'],
    return_dict=True,
    num_labels=ClassMax+1
)

# Tokenizer matching the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_args['model_name'],
    cache_dir=model_args['cache_dir'],
    is_pretokenized=model_args['do_basic_tokenize'],
    do_basic_tokenize=model_args['do_basic_tokenize']
)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [24]:
import torch
import re
class TorchClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding tokenized descriptions and their category IDs.

    Args:
        dataset: DataFrame with a ``desc`` text column and an integer
            ``encoded_category`` column.
        max_len: Number of tokens per example; shorter inputs are padded and
            longer ones truncated to exactly this length.
        tokenizer: Optional tokenizer exposing ``encode_plus``. When omitted,
            the module-level ``tokenizer`` is looked up at item-access time,
            which is the behavior existing callers rely on.
    """

    def __init__(self, dataset, max_len, tokenizer=None):
        self.len = len(dataset)
        self.data = dataset
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the model inputs and label for the row at position ``idx``.

        Returns:
            dict with ``input_ids``, ``token_type_ids``, ``attention_mask`` and
            ``labels``, each a ``torch.long`` tensor.
        """
        # Positional access: works for any DataFrame index, not only 0..n-1,
        # and raises IndexError (not KeyError) when idx is out of range.
        description = str(self.data['desc'].iloc[idx])
        label = self.data['encoded_category'].iloc[idx]

        # The text is deliberately NOT sliced to `max_len` characters first:
        # `max_len` is a token budget, and `truncation=True` below already
        # enforces it. A character slice would discard text that still fits.
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer
        inputs = active_tokenizer.encode_plus(
            description,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )

        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(int(label), dtype=torch.long),
        }

    def __len__(self):
        """Number of rows in the wrapped DataFrame at construction time."""
        return self.len

In [25]:
def createDataset(framework='pt'):
  """Build the train and test datasets for the requested framework.

  Args:
    framework: Backend identifier; only ``'pt'`` (PyTorch) is supported.

  Returns:
    A ``(train_ds, test_ds)`` tuple of ``TorchClassificationDataset`` objects
    wrapping ``train_dataset`` and ``test_dataset`` with length ``MAX_LEN``.

  Raises:
    ValueError: If ``framework`` is anything other than ``'pt'``.
  """
  # Fail with a clear message instead of an UnboundLocalError at `return`.
  if framework != 'pt':
    raise ValueError(
        "Unsupported framework {!r}; only 'pt' is supported".format(framework))
  train_ds = TorchClassificationDataset(train_dataset, MAX_LEN)
  test_ds = TorchClassificationDataset(test_dataset, MAX_LEN)
  return train_ds, test_ds

In [26]:
# Build both datasets, then compare one training row with its encoded form.
train_ds, test_ds = createDataset(framework='pt')
print('One record of Training dataset')
print(train_dataset['desc'].loc[1])
print('----')
print(train_ds[1])


One record of Training dataset
hp new oem 3500 3700 fuser kit q3655a q3655a hp oem 3500 3700 fuser kit hp oem genuine sold 90 day warranty
----
{'input_ids': tensor([  101,  6522,  2047,  1051,  6633,  8698,  2692, 16444,  2692, 19976,
         2099,  8934,  1053, 21619, 24087,  2050,  1053, 21619, 24087,  2050,
         6522,  1051,  6633,  8698,  2692, 16444,  2692, 19976,  2099,  8934,
         6522,  1051,  6633, 10218,  2853,  3938,  2154, 10943,  2100,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0

In [27]:
# NOTE(review): seqeval is not imported anywhere in the visible cells — confirm it is still needed.
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l[K     |███████▌                        | 10 kB 37.8 MB/s eta 0:00:01[K     |███████████████                 | 20 kB 21.1 MB/s eta 0:00:01[K     |██████████████████████▌         | 30 kB 17.2 MB/s eta 0:00:01[K     |██████████████████████████████  | 40 kB 15.5 MB/s eta 0:00:01[K     |████████████████████████████████| 43 kB 1.5 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16181 sha256=402dece5693c1f493e9b00a4c58512caf4d7a373b941ab96376866eab701381b
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [28]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    ``pred`` must expose ``label_ids`` (the true class IDs) and
    ``predictions`` (per-class scores); the arg-max over the last axis of the
    scores is taken as the predicted class.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Tuple layout: (precision, recall, f1, support).
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average='macro')
    overall_accuracy = accuracy_score(y_true, y_pred)
    return {
        'accuracy': overall_accuracy,
        'f1': macro_scores[2],
        'precision': macro_scores[0],
        'recall': macro_scores[1],
    }

In [29]:
# from torch import cuda
# device = 'cuda' if cuda.is_available() else 'cpu'

In [30]:
from transformers import (
    AutoModelForSequenceClassification,
    #BertForSequenceClassification,
    Trainer,
    TrainingArguments
)
# Pretrained checkpoint loaded with the classification config built earlier.
model = AutoModelForSequenceClassification.from_pretrained(
    model_args['model_name'],
    cache_dir=model_args['cache_dir'],
    config=config,
)

# Training settings. No learning rate is passed, so the library default applies.
training_kwargs = dict(
    output_dir='./results_PT',        # checkpoints are written here
    num_train_epochs=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_PT',
    logging_steps=3,                  # training loss is logged every 3 steps
)
training_args = TrainingArguments(**training_kwargs)

# Trainer wiring: train on train_ds, evaluate on test_ds, report compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [31]:
#model.to(device)

In [None]:
# Let's train the model now
trainer.train()

***** Running training *****
  Num examples = 14437
  Num Epochs = 20
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 9040


Step,Training Loss
3,5.2267
6,5.2557
9,5.2511
12,5.2344
15,5.2545
18,5.2689
21,5.26
24,5.3207
27,5.2468
30,5.2334


Saving model checkpoint to ./results_PT/checkpoint-500
Configuration saved in ./results_PT/checkpoint-500/config.json
Model weights saved in ./results_PT/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results_PT/checkpoint-1000
Configuration saved in ./results_PT/checkpoint-1000/config.json
Model weights saved in ./results_PT/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results_PT/checkpoint-1500
Configuration saved in ./results_PT/checkpoint-1500/config.json
Model weights saved in ./results_PT/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./results_PT/checkpoint-2000
Configuration saved in ./results_PT/checkpoint-2000/config.json
Model weights saved in ./results_PT/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./results_PT/checkpoint-2500
Configuration saved in ./results_PT/checkpoint-2500/config.json
Model weights saved in ./results_PT/checkpoint-2500/pytorch_model.bin


Step,Training Loss
3,5.2267
6,5.2557
9,5.2511
12,5.2344
15,5.2545
18,5.2689
21,5.26
24,5.3207
27,5.2468
30,5.2334


Saving model checkpoint to ./results_PT/checkpoint-3000
Configuration saved in ./results_PT/checkpoint-3000/config.json
Model weights saved in ./results_PT/checkpoint-3000/pytorch_model.bin
Saving model checkpoint to ./results_PT/checkpoint-3500
Configuration saved in ./results_PT/checkpoint-3500/config.json
Model weights saved in ./results_PT/checkpoint-3500/pytorch_model.bin
Saving model checkpoint to ./results_PT/checkpoint-4000
Configuration saved in ./results_PT/checkpoint-4000/config.json
Model weights saved in ./results_PT/checkpoint-4000/pytorch_model.bin
Saving model checkpoint to ./results_PT/checkpoint-4500
Configuration saved in ./results_PT/checkpoint-4500/config.json
Model weights saved in ./results_PT/checkpoint-4500/pytorch_model.bin


In [None]:
# Evaluate on the eval_dataset the Trainer was built with (test_ds).
trainer.evaluate()

In [None]:
# Run inference over the test set and print every reported metric.
predictions, label_ids, metrics = trainer.predict(test_ds)
for metric_name in metrics:
    print(metric_name, metrics[metric_name])

In [None]:
# Tokenize a single product description and return PyTorch tensors.
# NOTE(review): this text is passed as-is, without the punctuation / stop-word
# preprocessing that was applied to the training `desc` column.
inputs = tokenizer("Da-Lite Stand Master I - Cart for projector Projection Carts - Stand Master I Features The height of both the upper and lower shelves", return_tensors="pt")
print(inputs)
# Label tensor with an added leading dimension.
# 62 is presumably the expected category ID for this product — TODO confirm.
labels = torch.tensor([62]).unsqueeze(0)
print(labels)

In [None]:
# Score the sample on CPU and report the loss and the predicted category.
model.to('cpu')
# Inference only: switch off dropout and gradient tracking so the prediction
# is deterministic and no autograd graph is built.
model.eval()
with torch.no_grad():
    outputs = model(**inputs, labels=labels)
print(outputs.loss)
pred=outputs.logits.argmax(-1)
# `.item()` extracts the single predicted ID as a Python int. Converting a
# 1-element ndarray with int(...) is deprecated in recent NumPy releases.
print('prediction=',pred,idx2label[pred.item()])

In [None]:
# Print the preprocessed description stored under index label 18165.
# NOTE(review): the label is hard-coded; `.loc` raises KeyError if that row was dropped earlier.
print(newData.loc[18165].desc)