In [1]:
import os
os.chdir('C:/projects/itmo/text-anal/')

In [2]:
import fasttext
import fasttext.util

In [3]:
fasttext.util.download_model('en', if_exists='ignore') 
ft = fasttext.load_model('cc.en.300.bin')



In [4]:
import pandas as pd
import torch as t
from torch.utils.data import DataLoader, Dataset
import numpy as np
from torchsummary import summary
import math

In [5]:
shape = (300, 40)

def fix_sized(array, needed_size, vector_size): 
    while len(array) < needed_size:
        array.append([0]*vector_size)
    return np.array(array[:needed_size]).T

def vectorize_seq(s):
    return fix_sized([ft.get_word_vector(w) for w in str(s).split(' ')], *shape[::-1])

print(vectorize_seq('raz dvas trus'))
print(vectorize_seq('raz dvas trus').shape)

[[ 4.19391431e-02 -1.30229220e-02  3.06631625e-03 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-1.75315589e-01 -2.30471697e-03 -1.09591149e-01 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 5.15469089e-02 -1.27656385e-05 -1.42107129e-01 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 ...
 [ 1.79358423e-01  4.50087674e-02  9.15019736e-02 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-5.13700098e-02 -3.05269305e-02 -7.54913837e-02 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 3.45927663e-02 -1.08351810e-02  1.37498990e-01 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]
(300, 40)


In [6]:
train = pd.read_csv('data/tweets_train.csv')
test = pd.read_csv('data/tweets_test.csv')
print(train.head())
print(len(train))

       textID                                               text sentiment
0  98e200a8da                              Because I love my job  positive
1  4e2a0dbf92                             I hate having headrush  negative
2  9bfe71fa3f                      blogging..it`s my new passion  positive
3  f358968122  I have to go clothes shopping tomorrow  I hate...  negative
4  f38b1d3dff   Ur going 2 get tired of hearing from me, but ...   neutral
19236


In [7]:
X = train[['text']]
y = train[['sentiment']]
for label in ['positive', 'negative', 'neutral']:
    y[f'sentiment_{label}'] = (y['sentiment'] == label).astype(int)
y.drop(['sentiment'], axis=1, inplace=True)

# y = y.apply
print(X.head())
print(y.head())

                                                text
0                              Because I love my job
1                             I hate having headrush
2                      blogging..it`s my new passion
3  I have to go clothes shopping tomorrow  I hate...
4   Ur going 2 get tired of hearing from me, but ...
   sentiment_positive  sentiment_negative  sentiment_neutral
0                   1                   0                  0
1                   0                   1                  0
2                   1                   0                  0
3                   0                   1                  0
4                   0                   0                  1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[f'sentiment_{label}'] = (y['sentiment'] == label).astype(int)


In [8]:
print(max(len(item['text'].split(' ')) for item in X.iloc))

38


In [53]:
class MyModel(t.nn.Module):
    def __init__(self, shape):
        super().__init__()
        self.stack = t.nn.Sequential(
            t.nn.Conv1d(shape[0], 200, 1),
            t.nn.ELU(),
            t.nn.Conv1d(200, 100, 1),
            t.nn.ELU(),
            t.nn.Flatten(),
            t.nn.Linear(4000, 300),
            t.nn.ELU(),
            t.nn.Linear(300, 3),
            t.nn.ELU(),
        )
        
    def forward(self, X):
        return self.stack(X)
    
model = MyModel(shape)
summary(model, shape)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1              [-1, 200, 40]          60,200
               ELU-2              [-1, 200, 40]               0
            Conv1d-3              [-1, 100, 40]          20,100
               ELU-4              [-1, 100, 40]               0
           Flatten-5                 [-1, 4000]               0
            Linear-6                  [-1, 300]       1,200,300
               ELU-7                  [-1, 300]               0
            Linear-8                    [-1, 3]             903
               ELU-9                    [-1, 3]               0
Total params: 1,281,503
Trainable params: 1,281,503
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.05
Forward/backward pass size (MB): 0.22
Params size (MB): 4.89
Estimated Total Size (MB): 5.15
---------------------------------------

In [9]:
class TextDataset(Dataset): 
    def __init__(self, x, y=None):
        self.x = [vectorize_seq(item['text']).astype(np.float32) for item in x.iloc]
        self.y = y
        self.size = len(x)
        
    def __len__(self):
        return self.size
    
    def __getitem__(self, idx):
        x = t.from_numpy(self.x[idx])
        if self.y is not None:
            y = t.from_numpy(np.array([np.argmax(self.y.iloc[idx].to_numpy())]))
            return (x, y)
        return x

train_dataset = TextDataset(X, y)
print(train_dataset[0])

(tensor([[ 0.0128,  0.1005,  0.1389,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0070, -0.1526,  0.0809,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0022,  0.0444,  0.0961,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.0084,  0.6786,  0.0484,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0083, -0.1639, -0.0927,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0347,  0.0854,  0.0024,  ...,  0.0000,  0.0000,  0.0000]]), tensor([0]))


In [54]:
model.train()

# loss_function = t.nn.NLLLoss()
loss_function = t.nn.CrossEntropyLoss()
optimizer = t.optim.Adam(model.parameters(), lr=0.001)
max_epochs = 30
batch_size = 400
_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
min_loss = 300

m_path = 'data/torch_model_state.bin'
for i in range(0, max_epochs):
    for batch, (b_X, b_Y) in enumerate(_train):
        optimizer.zero_grad()
        b_ans = model.forward(b_X)
        loss = loss_function(b_ans, b_Y.flatten())
        loss.backward()
        optimizer.step()
        
        if batch == 0:
            loss, current = loss.item(), batch
            print(f"epoch: {i} loss: {loss:>7f}  {current:>5d}")
            if loss < min_loss:
                t.save(model.state_dict(), m_path)
                min_loss = loss
                print('model saved')
print('best loss:', min_loss)
model.load_state_dict(t.load(m_path))
model.eval()

epoch: 0 loss: 1.102338      0
model saved
epoch: 1 loss: 0.849264      0
model saved
epoch: 2 loss: 0.819758      0
model saved
epoch: 3 loss: 0.804996      0
model saved
epoch: 4 loss: 0.786503      0
model saved
epoch: 5 loss: 0.702900      0
model saved
epoch: 6 loss: 0.770754      0
epoch: 7 loss: 0.766972      0
epoch: 8 loss: 0.711971      0
epoch: 9 loss: 0.708953      0
epoch: 10 loss: 0.687942      0
model saved
epoch: 11 loss: 0.609004      0
model saved
epoch: 12 loss: 0.573422      0
model saved
epoch: 13 loss: 0.592827      0
epoch: 14 loss: 0.556821      0
model saved
epoch: 15 loss: 0.562782      0
epoch: 16 loss: 0.591676      0
epoch: 17 loss: 0.515409      0
model saved
epoch: 18 loss: 0.479677      0
model saved
epoch: 19 loss: 0.502862      0
epoch: 20 loss: 0.465388      0
model saved
epoch: 21 loss: 0.490328      0
epoch: 22 loss: 0.413608      0
model saved
epoch: 23 loss: 0.423320      0
epoch: 24 loss: 0.395913      0
model saved
epoch: 25 loss: 0.407017      

MyModel(
  (stack): Sequential(
    (0): Conv1d(300, 200, kernel_size=(1,), stride=(1,))
    (1): ELU(alpha=1.0)
    (2): Conv1d(200, 100, kernel_size=(1,), stride=(1,))
    (3): ELU(alpha=1.0)
    (4): Flatten(start_dim=1, end_dim=-1)
    (5): Linear(in_features=4000, out_features=300, bias=True)
    (6): ELU(alpha=1.0)
    (7): Linear(in_features=300, out_features=3, bias=True)
    (8): ELU(alpha=1.0)
  )
)

In [20]:
x_test = TextDataset(test[['text']])

In [None]:
print(x_test[0])

In [55]:
_data = DataLoader(x_test)
result=[]
for x in _data:
    result.append(model(x).detach().numpy())

In [56]:
result = np.array(result)

In [57]:
res = np.argmax(result, axis=2)

In [58]:
mp = {
    0: 'positive',
    1: 'negative',
    2: 'neutral'
}

In [59]:
pd.DataFrame({
    'textID': test['textID'],
    'sentiment': [mp[item] for item in res.astype(int).reshape(-1)]
}).to_csv('data/submissions/4-neural-network-13.csv', index=False)