In [1]:
import os
os.chdir('C:/projects/itmo/text-anal/')

In [2]:
import fasttext
import fasttext.util

In [3]:
fasttext.util.download_model('en', if_exists='ignore') 
ft = fasttext.load_model('cc.en.300.bin')



In [4]:
import pandas as pd
import torch as t
from torch.utils.data import DataLoader, Dataset
import numpy as np
from torchsummary import summary
import math

In [21]:
shape = (300, 40)

def fix_sized(array, needed_size, vector_size): 
    while len(array) < needed_size:
        array.append([0]*vector_size)
    return np.array(array[:needed_size]).T

def vectorize_seq(s):
    return fix_sized([ft.get_word_vector(w) for w in str(s).split(' ')], *shape[::-1])

print(vectorize_seq('raz dvas trus'))
print(vectorize_seq('raz dvas trus').shape)

[[ 4.19391431e-02 -1.30229220e-02  3.06631625e-03 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-1.75315589e-01 -2.30471697e-03 -1.09591149e-01 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 5.15469089e-02 -1.27656385e-05 -1.42107129e-01 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 ...
 [ 1.79358423e-01  4.50087674e-02  9.15019736e-02 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-5.13700098e-02 -3.05269305e-02 -7.54913837e-02 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 3.45927663e-02 -1.08351810e-02  1.37498990e-01 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]
(300, 40)


In [6]:
train = pd.read_csv('data/tweets_train.csv')
test = pd.read_csv('data/tweets_test.csv')
print(train.head())
print(len(train))

       textID                                               text sentiment
0  98e200a8da                              Because I love my job  positive
1  4e2a0dbf92                             I hate having headrush  negative
2  9bfe71fa3f                      blogging..it`s my new passion  positive
3  f358968122  I have to go clothes shopping tomorrow  I hate...  negative
4  f38b1d3dff   Ur going 2 get tired of hearing from me, but ...   neutral
19236


In [10]:
X = train[['text']]
y = train[['sentiment']]
for label in ['positive', 'negative', 'neutral']:
    y[f'sentiment_{label}'] = (y['sentiment'] == label).astype(int)
y.drop(['sentiment'], axis=1, inplace=True)

# y = y.apply
print(X.head())
print(y.head())

                                                text
0                              Because I love my job
1                             I hate having headrush
2                      blogging..it`s my new passion
3  I have to go clothes shopping tomorrow  I hate...
4   Ur going 2 get tired of hearing from me, but ...
   sentiment_positive  sentiment_negative  sentiment_neutral
0                   1                   0                  0
1                   0                   1                  0
2                   1                   0                  0
3                   0                   1                  0
4                   0                   0                  1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[f'sentiment_{label}'] = (y['sentiment'] == label).astype(int)


In [11]:
print(max(len(item['text'].split(' ')) for item in X.iloc))

38


In [82]:
class MyModel(t.nn.Module):
    def __init__(self, shape):
        super().__init__()
        self.stack = t.nn.Sequential(
            t.nn.Conv1d(shape[0], 30, 1),
            t.nn.ELU(),
            t.nn.Conv1d(30, shape[1], 3),
            t.nn.MaxPool1d(2),
            t.nn.ELU(),
            t.nn.Conv1d(shape[1], 1, 3),
            t.nn.ELU(),
            t.nn.Flatten(),
            t.nn.Linear(17, 20),
            t.nn.ELU(),
            t.nn.Linear(20, 3),
            t.nn.ELU(),
        )
        
    def forward(self, X):
        return self.stack(X)
    
model = MyModel(shape)
summary(model, shape)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1               [-1, 30, 40]           9,030
               ELU-2               [-1, 30, 40]               0
            Conv1d-3               [-1, 40, 38]           3,640
         MaxPool1d-4               [-1, 40, 19]               0
               ELU-5               [-1, 40, 19]               0
            Conv1d-6                [-1, 1, 17]             121
               ELU-7                [-1, 1, 17]               0
           Flatten-8                   [-1, 17]               0
            Linear-9                   [-1, 20]             360
              ELU-10                   [-1, 20]               0
           Linear-11                    [-1, 3]              63
              ELU-12                    [-1, 3]               0
Total params: 13,214
Trainable params: 13,214
Non-trainable params: 0
---------------------------------

In [13]:
class TextDataset(Dataset): 
    def __init__(self, x, y=None):
        self.x = [vectorize_seq(item['text']).astype(np.float32) for item in x.iloc]
        self.y = y
        self.size = len(x)
        
    def __len__(self):
        return self.size
    
    def __getitem__(self, idx):
        x = t.from_numpy(self.x[idx])
        if self.y is not None:
            y = t.from_numpy(np.array([np.argmax(self.y.iloc[idx].to_numpy())]))
            return (x, y)
        return x

train_dataset = TextDataset(X, y)
print(train_dataset[0])

(tensor([[ 0.0128,  0.1005,  0.1389,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0070, -0.1526,  0.0809,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0022,  0.0444,  0.0961,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.0084,  0.6786,  0.0484,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0083, -0.1639, -0.0927,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0347,  0.0854,  0.0024,  ...,  0.0000,  0.0000,  0.0000]]), tensor([0]))


In [84]:
model.train()

# loss_function = t.nn.NLLLoss()
loss_function = t.nn.CrossEntropyLoss()
optimizer = t.optim.Adam(model.parameters(), lr=0.001)
max_epochs = 50
batch_size = 400
_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
for i in range(0, max_epochs):
    for batch, (b_X, b_Y) in enumerate(_train):
        optimizer.zero_grad()
        b_ans = model.forward(b_X)
        loss = loss_function(b_ans, b_Y.flatten())
        loss.backward()
        optimizer.step()
        
        if batch == 0:
            loss, current = loss.item(), batch
            print(f"epoch: {i} loss: {loss:>7f}  {current:>5d}")
    
model.eval()

epoch: 0 loss: 1.070065      0
epoch: 1 loss: 1.053776      0
epoch: 2 loss: 0.980644      0
epoch: 3 loss: 0.920470      0
epoch: 4 loss: 0.887284      0
epoch: 5 loss: 0.848296      0
epoch: 6 loss: 0.832238      0
epoch: 7 loss: 0.810568      0
epoch: 8 loss: 0.879546      0
epoch: 9 loss: 0.755424      0
epoch: 10 loss: 0.752554      0
epoch: 11 loss: 0.770882      0
epoch: 12 loss: 0.774271      0
epoch: 13 loss: 0.769075      0
epoch: 14 loss: 0.771265      0
epoch: 15 loss: 0.774734      0
epoch: 16 loss: 0.741923      0
epoch: 17 loss: 0.730367      0
epoch: 18 loss: 0.737697      0
epoch: 19 loss: 0.770083      0
epoch: 20 loss: 0.787300      0
epoch: 21 loss: 0.778304      0
epoch: 22 loss: 0.752400      0
epoch: 23 loss: 0.724833      0
epoch: 24 loss: 0.685439      0
epoch: 25 loss: 0.703381      0
epoch: 26 loss: 0.724779      0
epoch: 27 loss: 0.792020      0
epoch: 28 loss: 0.641224      0
epoch: 29 loss: 0.588327      0
epoch: 30 loss: 0.706474      0
epoch: 31 loss: 0.

MyModel(
  (stack): Sequential(
    (0): Conv1d(300, 30, kernel_size=(1,), stride=(1,))
    (1): ELU(alpha=1.0)
    (2): Conv1d(30, 40, kernel_size=(3,), stride=(1,))
    (3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): ELU(alpha=1.0)
    (5): Conv1d(40, 1, kernel_size=(3,), stride=(1,))
    (6): ELU(alpha=1.0)
    (7): Flatten(start_dim=1, end_dim=-1)
    (8): Linear(in_features=17, out_features=20, bias=True)
    (9): ELU(alpha=1.0)
    (10): Linear(in_features=20, out_features=3, bias=True)
    (11): ELU(alpha=1.0)
  )
)

In [85]:
x_test = TextDataset(test[['text']])

In [86]:
print(x_test[0])

tensor([[ 2.2001e-01,  1.1845e-01,  2.3413e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-6.8741e-01, -1.9885e-02,  6.4766e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-1.3924e-01, -3.2615e-02, -5.5659e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [ 1.8861e-01,  2.0111e-02,  2.0082e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.1401e-01, -8.4807e-03, -1.9653e-02,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-1.9603e-02,  6.4655e-03,  1.7885e-04,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00]])


In [87]:
_data = DataLoader(x_test)
result=[]
for x in _data:
    result.append(model(x).detach().numpy())

In [88]:
result = np.array(result)

In [89]:
res = np.argmax(result, axis=2)

In [90]:
mp = {
    0: 'positive',
    1: 'negative',
    2: 'neutral'
}

In [91]:
pd.DataFrame({
    'textID': test['textID'],
    'sentiment': [mp[item] for item in res.astype(int).reshape(-1)]
}).to_csv('data/submissions/4-neural-network.csv', index=False)