In [None]:
# Colab-only setup: install the pyvi package and mount Google Drive so that the
# '/content/drive/My Drive/...' paths used by later cells can be resolved.
!pip install pyvi
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
from pyvi import ViTokenizer
import gensim

In [None]:
def clean_text(text):
  """Normalise one raw document and run it through the pyvi tokenizer.

  The text is first passed to ``gensim.utils.simple_preprocess``; the tokens
  it yields are re-joined with single spaces and the joined string is handed
  to ``ViTokenizer.tokenize``.

  Args:
    text: the raw document (presumably a ``str`` -- confirm against callers).

  Returns:
    Whatever ``ViTokenizer.tokenize`` returns for the normalised text; the
    rest of this notebook treats it as a space-separated token string.
  """
  tokens = gensim.utils.simple_preprocess(text)
  normalised = ' '.join(tokens)
  return ViTokenizer.tokenize(normalised)
def pre_process(list_text):
  """Run ``clean_text`` over every document, keeping the input order.

  Args:
    list_text: iterable of raw documents.

  Returns:
    A new list holding the ``clean_text`` result for each document.
  """
  return list(map(clean_text, list_text))


In [None]:
# Load the tab-separated training set; it must provide `text` and `label` columns.
train_df=pd.read_csv('/content/drive/My Drive/data/train.csv',sep='\t')
# Cast to str before cleaning, exactly as the test cell does, so that a missing
# or non-text cell (pandas reads those as float NaN) is handled as text instead
# of reaching clean_text as a float.
Text_train=train_df.text.astype(str).values
Label_train=train_df.label.values
# From here on Text_train is a list of cleaned, space-separated token strings.
Text_train=pre_process(Text_train)

In [None]:
num_words = [len(text) for text in Text_train]

import matplotlib.pyplot as plt
%matplotlib inline
plt.hist(num_words, 100)
plt.xlabel('Số từ trong câu')
plt.ylabel('Tần số')
plt.axis([0, 600, 0, 5000])
plt.show()

In [None]:
# Corpus for Word2Vec: one token list per cleaned document, followed by a
# synthetic two-token sentence so that the special 'UNK' and 'PAD' tokens get
# their own vocabulary entries.
sentences = [text.split(" ") for text in Text_train]
sentences += [['UNK', 'PAD']]

# 200-dimensional vectors, context window of 5; min_count=1 is presumably there
# so that no token (including 'UNK'/'PAD', which occur once) is dropped as rare.
word_model = gensim.models.Word2Vec(
    sentences=sentences,
    size=200,
    window=5,
    min_count=1,
)

# Inspect the learned embedding matrix, its shape and the vocabulary mapping.
print(word_model.wv.syn0)
print(word_model.wv.syn0.shape)
print(word_model.wv.vocab)

In [None]:
from gensim.models import KeyedVectors
import torch
from keras.preprocessing.sequence import pad_sequences
from torch.nn.functional import softmax

In [None]:
# Prefer the word vectors cached on Drive. That file is only written by the
# save_word2vec_format cell further down, so on a first run it does not exist
# yet; in that case keep the `word_model` trained earlier in this notebook
# instead of aborting the whole run.
try:
  word_model=KeyedVectors.load_word2vec_format('/content/drive/My Drive/data/word_vector')
except FileNotFoundError:
  print('No cached word vectors found; keeping the in-memory word_model.')

In [None]:
# Size of the embedding table: rows = vocabulary entries, columns = vector width.
# (The misspelt name EMBBEDING_DIM is kept because later cells refer to it.)
VOCAB_SIZE, EMBBEDING_DIM = word_model.wv.syn0.shape
# Every encoded sentence is padded or truncated to this many token ids.
MAX_LENGTH = 200

In [None]:
# class Sentiment_Analysic(torch.nn.Module):
#   def __init__(self,vocab_size,embedding_dim,num_labels):
#     super().__init__()
#     self.embedding=torch.nn.Embedding(num_embeddings=vocab_size,embedding_dim=embedding_dim)
#     self.bilstm=torch.nn.LSTM(bidirectional=True,input_size=embedding_dim,hidden_size=512,num_layers=2,batch_first=True)
#     # self.dropout=torch.nn.Dropout(0.5)
#     self.linear1=torch.nn.Linear(512,128)
#     self.linear2=torch.nn.Linear(128,num_labels)
#   def forward(self,input):
#     output=self.embedding(input)
#     output,(hidden,cell)=self.bilstm(output)
    
#     output=self.linear1(hidden[-1])
#     output=self.linear2(output)
#     # output=self.dropout(output)
#     return softmax(output,dim=-1)


In [None]:
# model=Sentiment_Analysic(vocab_size=VOCAB_SIZE,embedding_dim=EMBBEDING_DIM,num_labels=2)
# # model.embedding.weight.data.copy_(torch.tensor(word_model.wv.syn0))
# model=torch.load('/content/drive/My Drive/data/Sentiment_Analysis_model.pt')

In [None]:
# Split every cleaned training document on single spaces into its token list.
X_train = [sent.split(' ') for sent in Text_train]

In [None]:
# Word -> vocabulary-entry mapping of the word-vector model; the `.index` of an
# entry is what the encoding cells use as the integer token id.
vocab=word_model.wv.vocab

In [None]:
from sklearn.model_selection import train_test_split

# Ids of the two special tokens added to the Word2Vec corpus.
unk_index = vocab.get('UNK').index
pad_index = vocab.get('PAD').index

# Encode tokens as vocabulary indices. A token missing from the vocabulary
# (possible when the vectors were loaded from an older file on Drive) maps to
# 'UNK' -- the same rule the test-set encoder uses -- instead of raising
# AttributeError on `None.index`. Sequences are then padded with 'PAD' or
# truncated at the end to MAX_LENGTH.
# NOTE: this rebinds X_train from token lists to an id matrix, so the cell is
# not re-runnable without first re-running the tokenising cell.
X_train = pad_sequences(
    [[vocab[word].index if word in vocab else unk_index for word in sent] for sent in X_train],
    value=pad_index, maxlen=MAX_LENGTH, truncating='post', padding='post', dtype='long')

# Hold out 10% of the data for validation; fixed seed for a reproducible split.
X_train,X_val,y_train,y_val=train_test_split(X_train,Label_train,test_size=0.1,random_state=42)

In [None]:
import torch.nn as nn
from torch.nn import functional as F
class Cnn_Sentiment_Analysis(torch.nn.Module):
    """Two-class convolutional text classifier over token-id sequences.

    Token ids are embedded, run through one 2-D convolution per window size
    (128 filters each, every kernel spanning the full embedding width),
    max-pooled over the sequence axis, concatenated and mapped by a linear
    layer to 2 scores.

    Note that ``forward`` returns soft-max probabilities, not raw logits, so a
    loss that expects logits (e.g. ``CrossEntropyLoss``) would apply soft-max
    a second time.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of each embedding vector.
        window_sizes: heights (in tokens) of the convolution kernels.
    """

    def __init__(self, vocab_size, embedding_dim, window_sizes=(3, 4, 5)):
        super().__init__()
        n_filters = 128

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # One conv per window size; padding of (window - 1) along the sequence
        # axis lets the kernel also slide over the sequence borders.
        conv_layers = []
        for window_size in window_sizes:
            conv_layers.append(
                nn.Conv2d(
                    1,
                    n_filters,
                    [window_size, embedding_dim],
                    padding=(window_size - 1, 0),
                )
            )
        self.convs = nn.ModuleList(conv_layers)

        self.fc = nn.Linear(n_filters * len(window_sizes), 2)

    def forward(self, x):
        """Return class probabilities of shape [B, 2] for token ids ``x`` of shape [B, T]."""
        embedded = self.embedding(x)                 # [B, T, E]
        embedded = embedded.unsqueeze(1)             # [B, 1, T, E] add a channel dim

        # Convolution + global max pool for each window size.
        pooled = []
        for conv in self.convs:
            feature_map = F.relu(conv(embedded))     # [B, F, T', 1]
            feature_map = feature_map.squeeze(-1)    # [B, F, T']
            pooled.append(F.max_pool1d(feature_map, feature_map.size(2)))  # [B, F, 1]
        features = torch.cat(pooled, 2)              # [B, F, n_windows]

        # Flatten and classify.
        flat = features.view(features.size(0), -1)   # [B, F * n_windows]
        logits = self.fc(flat)                       # [B, 2]

        return F.softmax(logits, dim=-1)             # [B, 2]

In [None]:
# Build the CNN classifier and initialise its embedding table with the
# Word2Vec vectors. The weights are only copied, not frozen, so they stay
# trainable.
model = Cnn_Sentiment_Analysis(vocab_size=VOCAB_SIZE, embedding_dim=EMBBEDING_DIM)
pretrained_vectors = torch.tensor(word_model.wv.syn0)
model.embedding.weight.data.copy_(pretrained_vectors)

In [None]:
from torch.optim import SGD


class _ProbabilityNLLLoss(torch.nn.Module):
  """Negative log-likelihood for a model that already outputs probabilities.

  Cnn_Sentiment_Analysis.forward ends with a soft-max, so its output must not
  be fed to CrossEntropyLoss: that loss expects raw logits and applies
  (log-)soft-max itself, which would squash the probabilities a second time
  and flatten the gradients. Taking the log of the probabilities and using
  NLL gives the intended cross-entropy.

  Args:
    eps: lower clamp applied before the log so a probability of exactly 0
      cannot produce -inf.
  """

  def __init__(self, eps=1e-12):
    super().__init__()
    self.eps = eps

  def forward(self, probs, target):
    """Return the mean NLL of `target` (class ids, [B]) under `probs` ([B, C])."""
    log_probs = torch.log(probs.clamp_min(self.eps))
    return torch.nn.functional.nll_loss(log_probs, target)


# Plain SGD over all model parameters (the embedding table included).
optimizer=SGD(model.parameters(),lr=5e-2)
criterion=_ProbabilityNLLLoss()
# Move the model to the GPU; the training and inference cells move every batch there as well.
model.cuda()

In [None]:
from torch.utils.data import DataLoader,TensorDataset,RandomSampler

In [None]:
def _build_loader(features, labels, batch_size=64):
  """Wrap two aligned tensors into a dataset, a random sampler and a DataLoader.

  Returns:
    The tuple ``(dataset, sampler, dataloader)``.
  """
  dataset = TensorDataset(features, labels)
  sampler = RandomSampler(dataset)
  loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
  return dataset, sampler, loader


# Training split: convert the arrays to tensors, then batch them in random order.
# NOTE: X_train / y_train (and the validation pair) are rebound to tensors here.
X_train, y_train = torch.tensor(X_train), torch.tensor(y_train)
train_data, train_sample, train_dataloader = _build_loader(X_train, y_train)

# Validation split: same construction (it is also drawn in random order).
X_val, y_val = torch.tensor(X_val), torch.tensor(y_val)
val_data, val_sample, val_dataloader = _build_loader(X_val, y_val)

In [None]:
# Colab-only: install seqeval, which provides the metric helpers imported below.
!pip install seqeval
from seqeval.metrics import f1_score,accuracy_score
from torch.nn.utils import clip_grad_norm_
# Number of full passes over the training data made by the training loop.
epochs=5
# trange drives the epoch progress bar of the training loop.
from tqdm import tqdm, trange


In [None]:
# Per-epoch history of the mean training / validation loss.
tr_loss,val_loss=[],[]

for _ in trange(epochs, desc="Epoch"):
  # ---- training pass ----
  model.train()
  total_loss = 0
  for batch in train_dataloader:
    x,y = tuple(t.cuda() for t in batch)
    model.zero_grad()

    output=model(x)
    loss=criterion(output,y)
    total_loss+=loss.item()
    loss.backward()
    # clip_grad_norm_(parameters=model.parameters(), max_norm=1)
    optimizer.step()
  avg_train_loss = total_loss / len(train_dataloader)
  tr_loss.append(avg_train_loss)
  print('train loss: {}'.format(avg_train_loss))

  # ---- validation pass ----
  model.eval()
  eval_loss=0
  n_correct=0
  n_seen=0
  for batch in val_dataloader:
    x,y = tuple(t.cuda() for t in batch)
    with torch.no_grad():
      output=model(x)
      loss=criterion(output,y)
    eval_loss+=loss.item()
    # Count correct predictions over all samples. Averaging per-batch accuracies
    # would give the (usually shorter) last batch the same weight as a full one.
    n_correct+=(output.argmax(dim=1)==y).sum().item()
    n_seen+=y.size(0)
  # The loss stays a mean over batches so it is comparable with avg_train_loss.
  avg_val_loss=eval_loss/len(val_dataloader)
  avg_val_acc=n_correct/n_seen
  print('validation loss: {}'.format(avg_val_loss))
  print('validation accuracy : {}'.format(avg_val_acc))
  val_loss.append(avg_val_loss)



      


In [None]:
# Persist the trained model to Drive. The whole model object is saved (not just
# a state_dict), so loading it back requires the Cnn_Sentiment_Analysis class to
# be defined. Judging by its name, `_use_new_zipfile_serialization=False` opts
# out of the newer zip-based file format -- presumably for compatibility with
# older torch versions; confirm before changing.
torch.save(model,'/content/drive/My Drive/data/Cnn_Sentiment_Analysis_model.pt',_use_new_zipfile_serialization=False)

In [None]:
# Remove the `model` name from the kernel namespace. Any later use of `model`
# needs it to be loaded again (e.g. torch.load of the checkpoint saved on Drive).
del model

In [None]:
# Write the word vectors to Drive in word2vec format; this is the file that the
# KeyedVectors.load_word2vec_format cell earlier in the notebook reads back.
word_model.wv.save_word2vec_format('/content/drive/My Drive/data/word_vector')

In [None]:
# ---- Load and clean the test set (tab-separated, with `id` and `text` columns) ----
df_test=pd.read_csv('/content/drive/My Drive/data/test.csv',sep='\t')
# NOTE: `id` shadows the builtin; the name is kept because the submission cell reads it.
id=df_test['id'].values
Text_test=df_test['text'].values

# Cast to str so that non-text cells (e.g. NaN) can be cleaned like any other row.
Text_test=Text_test.astype('str')
Text_test=pre_process(Text_test)

test_sentences = [[word for word in text.split(" ")] for text in Text_test]

# ---- Encode tokens as vocabulary ids; unseen words map to 'UNK' ----
unk_index = vocab.get('UNK').index
X_test=[]
# `sentence` intentionally stays a cell-level loop variable: a later debug cell prints it.
for sentence in test_sentences:
  X_test.append([vocab[word].index if word in vocab else unk_index for word in sentence])
test_ids = pad_sequences(X_test,
                            value=vocab.get('PAD').index, maxlen=MAX_LENGTH, truncating='post', padding='post',
                            dtype='long')

# ---- Reload the trained model ----
# An earlier cell runs `del model`, so the name no longer exists at this point;
# load the checkpoint written by the torch.save cell. torch.load unpickles the
# file, so only ever point it at a checkpoint produced by this notebook.
# NOTE(review): torch >= 2.6 defaults to weights_only=True and then needs
# weights_only=False to load a fully pickled model.
model=torch.load('/content/drive/My Drive/data/Cnn_Sentiment_Analysis_model.pt')
model.cuda()
model.eval()

# ---- Predict in batches instead of one forward pass per sentence ----
y_predict=[]
with torch.no_grad():
  for start in range(0, len(test_ids), 64):
    batch_ids=torch.tensor(test_ids[start:start+64],dtype=torch.long).cuda()
    y_predict.extend(model(batch_ids).argmax(dim=1).cpu().tolist())
  

In [None]:
import numpy as np
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use `int` directly (same resulting dtype).
df_test['label']=np.array(y_predict).astype(int)
df_test['id']=id
# Submission file: one row per test example with its id and predicted label.
df_test[["id","label"]].to_csv("/content/drive/My Drive/data/submission.csv",index=False)

In [None]:
# Debug: show the token list of the first test document.
print(test_sentences[0])

In [None]:
# Debug: `sentence` is the loop variable left over from the test-encoding cell,
# i.e. the last test document's tokens; this only works after that cell has run.
print(sentence)