In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read the dataset and pull out the text column (inputs) and the label column (targets).
df = pd.read_csv('Pheme.csv')
X, y = df['Text'], df['Label']

# Show the first rows of both series as this cell's output.
X.head(), y.head()

(0    #BREAKING: A Germanwings Airbus A320 has crash...
 1    Updated numbers @AP: BREAKING: #Germanwings CE...
 2    @YanniKouts @germanwings @flightradar24 A bit ...
 3    #Germanwings co-pilot suffered serious depress...
 4    @ThisIsGaZa إِنَّا لِلّهِ وَإِنَّـا إِلَيْهِ ر...
 Name: Text, dtype: object,
 0    rumours
 1    rumours
 2    rumours
 3    rumours
 4    rumours
 Name: Label, dtype: object)

In [13]:
type(y)

pandas.core.series.Series

In [21]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# shuffled split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=2,
)

In [22]:
len(X_train), len(y_train)

(5140, 5140)

In [23]:
len(X_test), len(y_test)

(1286, 1286)

In [24]:
type(y_test)

pandas.core.series.Series

In [25]:
def label(x):
    """Convert a textual class label to its integer code.

    Returns 1 for 'rumours' and 0 for 'non-rumours'. Any other value is
    returned unchanged.
    """
    if x == 'rumours':
        return 1
    if x == 'non-rumours':
        return 0
    # Not one of the two known labels: pass the value through untouched.
    return x

# Encode the held-out labels element-wise with label().
y_test = y_test.apply(label)
y_test

3408    0
577     1
3077    0
6308    1
2558    0
       ..
179     1
3291    0
2435    0
5750    0
6188    0
Name: Label, Length: 1286, dtype: int64

In [30]:
import torch

torch.tensor(y_test.values)

tensor([0, 1, 0,  ..., 0, 0, 0])

In [31]:
# Encode the full label column with label(), then view it as a tensor.
# NOTE(review): the recorded output of this cell is float64 while the encoded
# y_test above is int64 -- presumably some labels in the full column are
# missing (NaN); confirm before training on `y`.
y = y.apply(label)
torch.tensor(y.values)

tensor([1., 1., 1.,  ..., 0., 0., 0.], dtype=torch.float64)

In [1]:
import torch
import torch.nn as nn

`torch.nn.Embedding`:

**torch.nn.Embedding(num_embeddings, embedding_dim)**

In [3]:
# Embedding table for a vocabulary of 5 tokens, each mapped to a 4-dimensional vector.
embedding = nn.Embedding(5, 4)

# `word` is a mini-batch of token indices with shape (2, 3):
#   - 2 rows    -> the batch size (two data samples)
#   - 3 columns -> the tokens per sample (the padding length, or num_steps)
word = [
    [1, 2, 3],
    [2, 3, 4],
]

# The embedding layer needs integer (long) indices.
embed = embedding(torch.tensor(word, dtype=torch.long))
embed, embed.size(), embed.shape

(tensor([[[-1.6270, -0.5891,  1.0201,  0.2022],
          [-1.6835, -1.4850, -0.5055,  0.4733],
          [ 1.8186,  0.6548, -0.1640,  0.4837]],
 
         [[-1.6835, -1.4850, -0.5055,  0.4733],
          [ 1.8186,  0.6548, -0.1640,  0.4837],
          [-0.1582,  1.0103,  0.4638,  0.9874]]], grad_fn=<EmbeddingBackward0>),
 torch.Size([2, 3, 4]),
 torch.Size([2, 3, 4]))

The output shape is `(2, 3, 4)`,
which corresponds to **(batch_size, padding_length, word_dims)**.



In [4]:
glove_embedding_path = '../../glove.6B.100d/vec.txt'

In [7]:
# Peek at the first line of the embedding file to see its layout
# (a token followed by space-separated numbers).
# The encoding is passed explicitly so the read does not depend on the
# platform's default text encoding.
with open(glove_embedding_path, 'r', encoding='utf-8') as f:
    # Only the first line is needed; readline() returns '' for an empty file,
    # so `a` is always defined for the cells below.
    a = f.readline()
print(a)

the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062



In [9]:
type(a)

str

In [11]:
elems = a.rstrip().split(' ')
elems[0:3]

['the', '-0.038194', '-0.24487']

In [16]:
# A tensor can be indexed with another tensor of row indices.

l = torch.tensor([
    [1, 2, 3],
    [2, 3, 4],
    [3, 3, 1],
])

# Select rows 0 and 1.
l[torch.tensor([0, 1])]

tensor([[1, 2, 3],
        [2, 3, 4]])

In [20]:
class TokenEmbedding:
    """Token-to-vector lookup table loaded from a plain-text embedding file.

    Each line of the file is read as a token followed by its vector
    components, separated by single spaces. Index 0 is reserved for the
    '<unk>' token, whose vector is all zeros.

    Attributes set by __init__:
        idx_to_token: list of tokens; a token's position is its index.
        idx_to_vec:   2-D tensor whose row i is the vector of idx_to_token[i].
        token_to_idx: dict mapping each token to its index.
        unknown_idx:  index (0) used for tokens absent from token_to_idx.
    """

    def __init__(self, embedding_path):
        """Load the file at `embedding_path` and build the three lookup tables."""
        self.idx_to_token, self.idx_to_vec = self._load_embedding(embedding_path)
        self.unknown_idx = 0
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}

    def _load_embedding(self, embedding_path):
        """Parse the embedding file.

        Returns:
            (idx_to_token, idx_to_vec): the token list (with '<unk>' first) and
            a tensor holding one row per token (an all-zero row first).

        Raises:
            IndexError: if the file contains no usable vector line.
            ValueError: if a component after the token is not a valid float.
        """
        idx_to_token, idx_to_vec = ['<unk>'], []
        # The encoding is explicit so tokens with non-ASCII characters load the
        # same way regardless of the platform's default text encoding.
        with open(embedding_path, 'r', encoding='utf-8') as f:
            for line in f:
                elems = line.rstrip().split(' ')
                token, elems = elems[0], [float(elem) for elem in elems[1:]]
                # Lines carrying at most one number (a blank line, or a short
                # header line) are not vectors and are skipped.
                if len(elems) > 1:
                    idx_to_token.append(token)
                    idx_to_vec.append(elems)
        # Prepend the all-zero vector for '<unk>', sized like the first loaded vector.
        idx_to_vec = [[0.0] * len(idx_to_vec[0])] + idx_to_vec
        return idx_to_token, torch.tensor(idx_to_vec)

    def __getitem__(self, tokens):
        """Return the vectors for `tokens` as a tensor with one row per token.

        `tokens` is iterated element by element; every element that is not a
        known token maps to the '<unk>' row.
        """
        indices = [self.token_to_idx.get(token, self.unknown_idx)
                   for token in tokens]
        # An explicit long dtype keeps an empty token list valid as an index
        # (an empty tensor would otherwise default to float and be rejected).
        vecs = self.idx_to_vec[torch.tensor(indices, dtype=torch.long)]
        return vecs

    def __len__(self):
        """Number of tokens in the table, including '<unk>'."""
        return len(self.idx_to_token)

In [21]:
glove_embedding = TokenEmbedding(glove_embedding_path)

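What the `TokenEmbedding` class does:

Its `__getitem__` takes a list of tokens (for example, the tokens held by the `Vocab` class, in index order),
looks up each token's row index, and maps them to their pretrained vectors. Tokens that are not in the embedding file map to the all-zero `<unk>` vector.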