In [1]:
import pandas as pd
import numpy as np
import spacy
spacy_eng = spacy.load("en_core_web_sm")

In [2]:
class Vocabulary:
    PAD_token = 0   # Used for padding short sentences
    SOS_token = 1   # Start-of-sentence token
    EOS_token = 2   # End-of-sentence token

    def __init__(self, name):
        self.name = name
        self.word2index = {'PAD': 0, 'SOS': 1, 'EOS': 2, 'UNK':3}
        self.word2count = {}
        self.index2word = {0: "PAD", 1: "SOS", 2: "EOS", 3: "UNK"}
        self.num_words = 4
        self.num_sentences = 0 
        self.longest_sentence = 0
    


    def add_word(self, word):
        
        if word not in self.word2count:
            # First entry of word into vocabulary
            self.word2count[word] = 1
        else:
            if self.word2count[word]== 5:
                self.word2index[word] = self.num_words
                self.index2word[self.num_words] = word
                self.num_words += 1
                self.word2count[word] +=1
            else:
                self.word2count[word] += 1
        
        
    def tokenizer(self,text):
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
    
    def add_sentence(self, sentence):
        sentence_len = 0
        #for word in sentence.split(' '):
        for word in self.tokenizer(sentence):
            sentence_len += 1
            self.add_word(word)
        if sentence_len > self.longest_sentence:
            # This is the longest sentence
            self.longest_sentence = sentence_len
        # Count the number of sentences
        self.num_sentences += 1

    def to_word(self, index):
        return self.index2word[index]

    def to_index(self, word):
        return self.word2index[word]

vocab = Vocabulary('test')

In [3]:
df = pd.read_csv('../input/dataset/train.csv')
test_df = pd.read_csv('../input/dataset/test.csv')
df.head()

Unnamed: 0,content,title,uid,target_ind
0,Premium quality five pocket jean from Wrangler...,Amazon.com: Wrangler Men's Rugged Wear Relaxed...,B0000CBALT,247
1,If you're looking for a different kind of anim...,Sakura Diaries - Complete Series Collector's E...,B00005QFDT,453
2,"First things first: Yes, Thinking XXX features...",Thinking XXX (Extended Cut) (2006),B000BNXD50,228
3,Feathertouch. 100% Polyester Machine Wash Warm...,Amazon.com: Petite Feathertouch Pull-On Pant: ...,B0002LK9V2,223
4,"When you need outstanding fuel delivery, easy ...",ACDelco EP386 Fuel Pump,B000C9PA54,312


In [4]:
test_df.head()

Unnamed: 0,content,title,uid
0,Socket Cap Screws are reliable and durable and...,"Alloy Steel Socket Cap Screw, Flat Head, Hex S...",0
1,"This O-ring is made of black Buna-nitrile, is ...","-161 Buna O-Ring, 70A Durometer, Black, 5-1/2&...",1
2,"This Viton O-ring is black in color, round in ...","-445 Viton O-Ring, 75A Durometer, Black, 8&#03...",2
3,This 5-ounce lightweight pique knit sport shir...,Amazon.com: Port Authority Silk Touch Sport Sh...,3
4,"This O-ring is made of black Buna-nitrile, is ...","-359 Buna O-Ring, 70A Durometer, Black, 5-3/4&...",4


In [5]:
m1 = min(df['target_ind'])
m2 = max(df['target_ind'])
print (f'min label: {m1} \nmax label: {m2}')

min label: 0 
max label: 499


There are 500 labels

In [6]:
df['info'] = df['content'] + ' ' + df['title']
test_df['info'] =  test_df['content'] + ' ' + test_df['title']

In [7]:
df['title_words'] = [len(vocab.tokenizer(t)) for t in df['title']]
df['content_words'] = [len(vocab.tokenizer(t)) for t in df['content']]
df['info_words'] = [len(vocab.tokenizer(t)) for t in df['content']]
df[['title_words', 'content_words', 'info_words']].describe()

Unnamed: 0,title_words,content_words,info_words
count,35112.0,35112.0,35112.0
mean,12.958903,173.547163,173.547163
std,9.089399,232.912858,232.912858
min,1.0,1.0,1.0
25%,6.0,48.0,48.0
50%,10.0,103.0,103.0
75%,17.0,240.0,240.0
max,57.0,10331.0,10331.0


We'll remove some outliers from the data ( products which have very high number of words in either title or content )

In [8]:
mean = df['info_words'].describe()['mean']
std = df['info_words'].describe()['std']
u = mean + 2*std
df1 = df[df['info_words'] <= u]
len(df1)/len(df)

0.9702950558213717

In [10]:
df1[['title_words', 'content_words', 'info_words']].describe()

Unnamed: 0,title_words,content_words,info_words
count,34069.0,34069.0,34069.0
mean,13.052834,144.941707,144.941707
std,9.115579,124.977805,124.977805
min,1.0,1.0,1.0
25%,6.0,46.0,46.0
50%,10.0,96.0,96.0
75%,17.0,235.0,235.0
max,57.0,637.0,637.0


In [11]:
train_df = df1.groupby('target_ind', group_keys=False).apply(lambda x: x.sample((int(0.8*len(x)))))
val_df = df1[~df1.isin(train_df)].dropna()

In [12]:
# adding words to the vocabulary 
vocab_test = Vocabulary('rand')
for sentence in test_df['info']:
    vocab_test.add_sentence(sentence)

for sentence in train_df['info']:
    vocab.add_sentence(sentence)

sent_idxs = []
captions = []
for idx, sentence in enumerate (train_df['info']):
    i = 0
    for word in vocab.tokenizer(sentence):
        if word not in vocab.word2index:
            sent_idxs.append(3)
        else:
            sent_idxs.append(vocab.to_index(word))
        i+=1
    while i < ((vocab_test.longest_sentence)):
        sent_idxs.append(0)
        i+=1
    captions.append(sent_idxs)
    sent_idxs = []
    
# converting list of word tokens to numpy array
train_info = np.array(captions)
train_info.shape

(27054, 7534)

In [13]:
# adding words to the vocabulary 
for sentence in val_df['info']:
    vocab.add_sentence(sentence)

sent_idxs = []
captions = []
for idx, sentence in enumerate (val_df['info']):
    i = 0
    for word in vocab.tokenizer(sentence):
        if word not in vocab.word2index:
            sent_idxs.append(3)
        else:
            sent_idxs.append(vocab.to_index(word))
        i+=1
    while i < ((vocab_test.longest_sentence)):
        sent_idxs.append(0)
        i+=1
    captions.append(sent_idxs)
    sent_idxs = []
    
# converting list of word tokens to numpy array
val_info = np.array(captions)
val_info.shape

(7015, 7534)

In [14]:
sent_idxs = []
captions = []
for idx, sentence in enumerate (test_df['info']):
    i = 0
    for word in vocab.tokenizer(sentence):
        if word not in vocab.word2index:
            sent_idxs.append(3)
        else:
            sent_idxs.append(vocab.to_index(word))
        i+=1
    while i < ((vocab_test.longest_sentence)):
        sent_idxs.append(0)
        i+=1
    captions.append(sent_idxs)
    sent_idxs = []
    
# converting list of word tokens to numpy array
test_info = np.array(captions)
test_info.shape

(8106, 7534)

In [18]:
vocab.num_words

23753

In [19]:
vocabs = vocab.word2index.keys()

def load_embeds(root_dir):
    embeddings_index = dict()
    f = open(root_dir)

    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

    f.close()
    return embeddings_index
embeddings_index = load_embeds('../input/glove6b300dtxt/glove.6B.300d.txt')

In [20]:
embeddings_index['the']

array([ 4.6560e-02,  2.1318e-01, -7.4364e-03, -4.5854e-01, -3.5639e-02,
        2.3643e-01, -2.8836e-01,  2.1521e-01, -1.3486e-01, -1.6413e+00,
       -2.6091e-01,  3.2434e-02,  5.6621e-02, -4.3296e-02, -2.1672e-02,
        2.2476e-01, -7.5129e-02, -6.7018e-02, -1.4247e-01,  3.8825e-02,
       -1.8951e-01,  2.9977e-01,  3.9305e-01,  1.7887e-01, -1.7343e-01,
       -2.1178e-01,  2.3617e-01, -6.3681e-02, -4.2318e-01, -1.1661e-01,
        9.3754e-02,  1.7296e-01, -3.3073e-01,  4.9112e-01, -6.8995e-01,
       -9.2462e-02,  2.4742e-01, -1.7991e-01,  9.7908e-02,  8.3118e-02,
        1.5299e-01, -2.7276e-01, -3.8934e-02,  5.4453e-01,  5.3737e-01,
        2.9105e-01, -7.3514e-03,  4.7880e-02, -4.0760e-01, -2.6759e-02,
        1.7919e-01,  1.0977e-02, -1.0963e-01, -2.6395e-01,  7.3990e-02,
        2.6236e-01, -1.5080e-01,  3.4623e-01,  2.5758e-01,  1.1971e-01,
       -3.7135e-02, -7.1593e-02,  4.3898e-01, -4.0764e-02,  1.6425e-02,
       -4.4640e-01,  1.7197e-01,  4.6246e-02,  5.8639e-02,  4.14

In [21]:
import torch
def load_embed_weights(embeddings_index, embed_dim, vocab, vocab_size):
    matrix_len = vocab_size
    weights_matrix = np.zeros((matrix_len, embed_dim))
    words_found = 0

    for i, word in enumerate(vocab):
        try: 
            weights_matrix[i] = embeddings_index[word]
            words_found += 1
        except KeyError:
            weights_matrix[i] = np.random.normal(scale=0.6, size=(embed_dim, ))
    weights_matrix = torch.tensor(weights_matrix)
    return weights_matrix
weights_matrix = load_embed_weights(embeddings_index, 300, vocabs, vocab.num_words)
weights_matrix.shape

torch.Size([23753, 300])

In [22]:
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from glob import glob
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

class Dataset(Dataset):
    
    def __init__(self, train_data,info, test = False):
        self.info = info
        self.train_data = train_data
        self.t = test
    
    def __getitem__(self, index):
        if self.t:
            return self.info[index]
        else:
            return self.info[index], self.train_data['target_ind'].iloc[index]
    
    def __len__(self):
        return len(self.train_data)


trainset = Dataset(train_df, train_info)
val_df['target_ind'] = [int(v) for v in (val_df['target_ind'])]
valset = Dataset(val_df, val_info)
testset = Dataset(test_df, test_info, test = True)

In [24]:
batch_size = 32

trainloader = DataLoader(dataset=trainset, batch_size=batch_size, shuffle = True)
valloader = DataLoader(dataset=valset, batch_size=batch_size, shuffle = True)
testloader = DataLoader(dataset=testset, batch_size=batch_size, shuffle = False)

In [25]:
for c in testloader:
    for i in c[0]:
        print(vocab.to_word(int(i)), end = ' ')
    
    break

socket cap screws are reliable and durable and are used in many applications .   they are available in a variety of head styles and materials . alloy steel is steel that has been alloyed with other materials to improve overall physical properties .   steels are designated by a 4 number sae steel grade .   the first two digits indicate the primary materials used to form the steel .   the last 2 digits identify the percentage of carbon for the alloy ( in hundredths ) .   tensile strength ranges for the alloys typically used in these fasteners range from 170,000 to 180,000 psi ( pounds per square inch ) , making these amongst the strongest of materials .   flat head fasteners are designed to fit flush to the surface when used with countersunk holes .   length is measured from the top of the head .   hex socket drive systems are driven by hex wrenches or power tools with hexagonal bits .   a threaded fastener 's size name includes information about the major external diameter , followed by

In [26]:
class LSTM(nn.Module):
    
    def __init__(self, embed_size, hidden_size,  vocab_size, num_labels ):
        super().__init__()
        
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        
        self.embed = nn.Embedding(num_embeddings = self.vocab_size, embedding_dim = self.embed_size)
        self.embed.weight.requires_grad = False
        self.embed.load_state_dict({'weight': weights_matrix})

        
      
        self.title_lstm = nn.LSTM(input_size = self.embed_size, hidden_size = self.hidden_size, bidirectional = True, batch_first = True, num_layers = 2)
        self.content_lstm = nn.LSTM(input_size = self.embed_size, hidden_size = self.hidden_size, bidirectional = True, batch_first = True, num_layers = 2)
        
        self.attention = nn.Linear(2*self.hidden_size, 2*self.hidden_size, bias = False)
        
        self.fc = nn.Linear(7532, self.num_labels)
        self.fc1 = nn.Linear(4*hidden_size, self.num_labels)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout()
        
    def forward(self, content, title):
        
        content_embed = self.embed(content)
        title_embed = self.embed(title)
       
        t_rep, _ = (self.title_lstm(title_embed))                      #[64, 31, 400]
        c_rep, _ = (self.content_lstm(content_embed))                      #[64,637, 400]
        title_rep = self.relu(t_rep)
        content_rep = self.relu(c_rep)
        

        t = torch.mean(title_rep, dim = 1)
        c = torch.mean(content_rep, dim = 1)
        final_rep = torch.cat((t,c), dim = 1)
        return F.log_softmax((self.fc1(final_rep)), dim = 1)'''
    

"class Model(nn.Module):\n    \n    def __init__(self, embed_size, hidden_size,  vocab_size, num_labels ):\n        super().__init__()\n        \n        self.embed_size = embed_size\n        self.vocab_size = vocab_size\n        self.hidden_size = hidden_size\n        self.num_labels = num_labels\n        \n        self.embed = nn.Embedding(num_embeddings = self.vocab_size, embedding_dim = self.embed_size)\n        self.embed.weight.requires_grad = False\n        self.embed.load_state_dict({'weight': weights_matrix})\n\n        \n      \n        self.title_lstm = nn.LSTM(input_size = self.embed_size, hidden_size = self.hidden_size, bidirectional = True, batch_first = True, num_layers = 2)\n        self.content_lstm = nn.LSTM(input_size = self.embed_size, hidden_size = self.hidden_size, bidirectional = True, batch_first = True, num_layers = 2)\n        \n        self.attention = nn.Linear(2*self.hidden_size, 2*self.hidden_size, bias = False)\n        \n        self.fc = nn.Linear(7532,

In [None]:
class KimCNN(nn.Module):
    def __init__(self, input_dim, embed_dim, n_filters, filter_sizes, output_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.embedding.load_state_dict({'weight': weights_matrix})
        self.embedding.weight.requires_grad = False
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])

        self.fc = nn.Linear(n_filters*len(filter_sizes), output_dim)
        
    
    def forward(self,x):
        #x = x.permute(1,0)
        x = self.embedding(x)
        x = x.unsqueeze(1)
        convs_x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        pooled_x = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in convs_x]
        cat_x = torch.cat(pooled_x, dim = 1)
        x = self.fc(cat_x)
        return x

In [27]:
class Model(nn.Module):
    
    def __init__(self, embed_size, hidden_size,  vocab_size, num_labels ):
        super().__init__()
        
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        
        self.embed = nn.Embedding(num_embeddings = self.vocab_size, embedding_dim = self.embed_size)
        self.embed.weight.requires_grad = False
        self.embed.load_state_dict({'weight': weights_matrix})

        
      
    
        self.lstm = nn.LSTM(input_size = self.embed_size, hidden_size = self.hidden_size, bidirectional = True, batch_first = True, num_layers = 2)
        
        self.conv = nn.Conv1d(2*self.hidden_size,64, 3)
        self.avg_pool = nn.AvgPool1d(7532)
        self.max_pool = nn.MaxPool1d(7532)
        self.fc = nn.Linear(128, self.num_labels)
        self.relu = nn.ReLU()
        
    def forward(self, info):
        
        info_embed = self.embed(info)
        i_rep, _ = (self.lstm(info_embed))                      #[32,637, 128]
        info_rep = self.relu(i_rep)                             #[32,637,128]
        info_conv = self.conv(info_rep.permute(0,2,1))   #[32,64,671]
        #info_conv = info_conv.permute(0,2,1)             #[32,671,64]      
        info_avg = self.avg_pool(info_conv).squeeze(-1)   #[32,64]
        info_max = self.max_pool(info_conv).squeeze(-1)   #[32,64]
        info_cat = torch.cat((info_avg, info_max), dim = 1) #[32,128]
        

        return F.log_softmax(self.fc(info_cat), dim = 1)


In [28]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embed_size = 300
hidden_size = 64
vocab_size = vocab.num_words
num_labels = 500
model = Model(embed_size, hidden_size, vocab_size, num_labels)
model = model.to(device)

criterion = nn.NLLLoss()
criterion = criterion.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-2)

In [30]:
from sklearn.metrics import accuracy_score as acc
def train(trainloader, model,  criterion, optimizer, epoch):
    train_loss = []
    train_acc = []
    for batch_idx, (c,l) in enumerate(trainloader):
        c = c.to(device)
        #t = t.to(device)
        l = l.to(device)
        preds = model(c)
        optimizer.zero_grad()
        loss = criterion(preds, l)
        train_loss.append(loss.item())
        loss.backward()
        optimizer.step()
        p = torch.argmax(preds, dim = 1)
        train_acc.append(acc(l.cpu(), p.cpu()))
        #if (batch_idx%84 == 0): 
        #    print(f'epoch {epoch+1}({(int(batch_idx*100/len(trainloader)))}%)  train loss : {(np.mean(train_loss))}   train accuracy : {(np.mean(train_acc)*100)}%')
    print(f'\nepoch {epoch+1}  train loss : {(np.mean(train_loss)):.2f}   train accuracy : {(np.mean(train_acc)*100):.2f}%\n')
    return np.mean(train_loss)

In [31]:
def val(valloader, model,  criterion, epoch):
    val_loss = []
    val_acc = []
    for batch_idx, (c,l) in enumerate(valloader):
        c = c.to(device)
        #t = t.to(device)
        l = l.to(device)
        preds = model(c)
        loss = criterion(preds, l)
        val_loss.append(loss.item())
        p = torch.argmax(preds, dim = 1)
        val_acc.append(acc(l.cpu(), p.cpu()))
#        if (batch_idx%84 == 0): 
#            print(f'epoch {epoch+1}({(int(batch_idx*100/len(trainloader)))}%)  train loss : {(np.mean(train_loss))}   train accuracy : {(np.mean(train_acc)*100)}%')
    print(f'epoch {epoch+1}  val loss : {(np.mean(val_loss)):.2f}   val accuracy : {(np.mean(val_acc)*100):.2f}%\n')
    return np.mean(val_acc)

In [32]:
preds = []
def test(valloader, model):
    preds = []
    for batch_idx, (c) in enumerate(valloader):
        c = c.to(device)
        #t = t.to(device)
        predsx = model(c)
        p = torch.argmax(predsx, dim = 1)
        preds.append([int(pr) for pr in p])
    
    last = preds[-1]
    preds = preds[:-1]
    preds = np.array(preds)
    p_arr = preds.reshape(-1)
    for l in last:
        p_arr = np.append(p_arr,l)
    print('done')
    return p_arr

In [36]:
#model.load_state_dict(torch.load('./weights.pth'))
#optimizer = torch.optim.Adam(model.parameters(), lr = 1e-1)
epochs = 30
best_acc = 0
for epoch in range(epochs):
    x = train(trainloader, model, criterion, optimizer, epoch)
    y = val(valloader, model, criterion, epoch)
    if y >= best_acc:
        torch.save(model.state_dict(), 'weights.pth')
        best_acc = y
    if epoch == 10:
        torch.save(model.state_dict(), 'weights_10.pth')
        
    if epoch == 20:
        torch.save(model.state_dict(), 'weights_20.pth')


epoch 1  train loss : 3.86   train accuracy : 19.65%

epoch 1  val loss : 3.05   val accuracy : 26.96%


epoch 2  train loss : 2.68   train accuracy : 30.19%

epoch 2  val loss : 2.50   val accuracy : 32.39%


epoch 3  train loss : 2.21   train accuracy : 35.71%

epoch 3  val loss : 2.21   val accuracy : 35.49%


epoch 4  train loss : 1.99   train accuracy : 38.11%

epoch 4  val loss : 2.09   val accuracy : 37.54%


epoch 5  train loss : 1.84   train accuracy : 40.10%

epoch 5  val loss : 2.03   val accuracy : 38.08%


epoch 6  train loss : 1.74   train accuracy : 42.30%

epoch 6  val loss : 1.98   val accuracy : 38.90%


epoch 7  train loss : 1.66   train accuracy : 43.52%

epoch 7  val loss : 2.01   val accuracy : 38.66%


epoch 8  train loss : 1.61   train accuracy : 44.42%

epoch 8  val loss : 2.00   val accuracy : 39.80%


epoch 9  train loss : 1.56   train accuracy : 45.16%

epoch 9  val loss : 1.96   val accuracy : 39.48%


epoch 10  train loss : 1.52   train accuracy : 46.03%


In [37]:
preds = test(testloader, model)
submission = pd.DataFrame({'uid': [t for t in test_df['uid']], 'target_ind': preds})
submission.to_csv('submission.csv')

model.load_state_dict(torch.load('./weights.pth'))
preds = test(testloader, model)
submission_weights = pd.DataFrame({'uid': [t for t in test_df['uid']], 'target_ind': preds})
submission_weights.to_csv('submission_weights.csv')

model.load_state_dict(torch.load('./weights_10.pth'))
preds = test(testloader, model)
submission_weights = pd.DataFrame({'uid': [t for t in test_df['uid']], 'target_ind': preds})
submission_weights.to_csv('submission_weights_10.csv')

model.load_state_dict(torch.load('./weights_20.pth'))
preds = test(testloader, model)
submission_weights = pd.DataFrame({'uid': [t for t in test_df['uid']], 'target_ind': preds})
submission_weights.to_csv('submission_weights_20.csv')

done
done
done
done
