In [1]:
pip install torchtext



In [2]:
import torch
from torchtext import data

In [3]:
SEED = 2021
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True  

In [109]:
import pandas as pd

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

test_df = test_df[['text', 'id']]
print(test_df)

                                                   text     id
0                    Just happened a terrible car crash      0
1     Heard about #earthquake is different cities, s...      2
2     there is a forest fire at spot pond, geese are...      3
3              Apocalypse lighting. #Spokane #wildfires      9
4         Typhoon Soudelor kills 28 in China and Taiwan     11
...                                                 ...    ...
3258  EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...  10861
3259  Storm in RI worse than last hurricane. My city...  10865
3260  Green Line derailment in Chicago http://t.co/U...  10868
3261  MEG issues Hazardous Weather Outlook (HWO) htt...  10874
3262  #CityofCalgary has activated its Municipal Eme...  10875

[3263 rows x 2 columns]


In [113]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
train_df = train_df[['text', 'target']]
'''
train_df = train_df[['text', 'target']]
targets = le.fit_transform(train_df)
print(targets)
'''
# preprocess
data_set = []
for idx, text, target in train_df.itertuples():
  data_set.append((target, text))

data_set_iterable = iter(data_set)
#next(data_set)

test_set = []
for idx, target, id in test_df.itertuples():
  test_set.append((id, target))

#test_set = iter(test_set)
#next(test_set)
print(test_set)



0                      Just happened a terrible car crash
1       Heard about #earthquake is different cities, s...
2       there is a forest fire at spot pond, geese are...
3                Apocalypse lighting. #Spokane #wildfires
4           Typhoon Soudelor kills 28 in China and Taiwan
                              ...                        
3258    EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259    Storm in RI worse than last hurricane. My city...
3260    Green Line derailment in Chicago http://t.co/U...
3261    MEG issues Hazardous Weather Outlook (HWO) htt...
3262    #CityofCalgary has activated its Municipal Eme...
Name: text, Length: 3263, dtype: object


In [29]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
  print(data_iter)
  for _, text in data_iter:
    yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(data_set), specials=["<unk>"])




In [30]:
vocab.set_default_index(vocab["<unk>"])

In [31]:
from torchtext.vocab import Vocab
Vocab.get_stoi(vocab)

{'ì¢': 23542,
 'åèmgn-africaå¨': 23538,
 'å¬': 23537,
 'å¤}': 23535,
 'å£9': 23534,
 '\x89û÷the': 23526,
 '\x89û÷second\x89ûª': 23525,
 '\x89û÷muslim': 23521,
 '\x89û÷let\x89ûªs': 23519,
 '\x89û÷leaves': 23518,
 '\x89û÷ill': 23514,
 '\x89û÷hoax': 23513,
 '\x89û÷hazard\x89ûª': 23510,
 '\x89û÷food': 23509,
 '\x89û÷devastated\x89ûª': 23503,
 '\x89û÷body': 23500,
 '\x89û÷badges': 23499,
 '\x89ûókody': 23494,
 '\x89ûïthat\x89ûªs': 23490,
 '\x89ûïplans': 23488,
 '\x89ûïparties': 23487,
 '\x89ûïnumbers': 23486,
 '\x89ûïmake': 23483,
 '\x89ûïlove': 23482,
 '\x89ûïi': 23480,
 '\x89ûïfor': 23479,
 '\x89ûïcat': 23477,
 '\x89ûïairplane\x89û\x9d': 23475,
 '\x89ûïafter': 23474,
 '\x89ûï@macdaddy_leo': 23470,
 '\x89ûï@leoblakecarter': 23467,
 '\x89ûï@fdny': 23465,
 '\x89ûï@dylanmcclure55': 23464,
 '\x89ûï@_keits': 23460,
 '\x89û¢i': 23457,
 '\x89ã¢': 23454,
 '~still': 23452,
 '~3': 23449,
 '~27%': 23448,
 '~11%': 23447,
 '}': 23446,
 '|lauren': 23443,
 '{': 23441,
 'zzzz': 23440,
 'zumiez': 23438,
 '

In [115]:
text_pipeline = lambda x : vocab(tokenizer(x))
label_pipeline = lambda x: int(x)

In [33]:
text_pipeline('weaknesses webinar')


[23139, 23143]

In [70]:
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
         label_list.append(label_pipeline(_label))
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

dataloader = DataLoader(data_set, batch_size=64, shuffle=False, collate_fn=collate_batch)
print(dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x7f5f6b638d50>


In [38]:
from torch import nn

class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [39]:
num_class = len(set([label for (label, text) in data_set]))
vocab_size = len(vocab)
emsize = 64
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)

In [74]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predited_label = model(text, offsets)
        loss = criterion(predited_label, label) # this line is broken
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()


In [60]:
def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predited_label = model(text, offsets)
            loss = criterion(predited_label, label)
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [83]:
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data.dataset import random_split

EPOCHS = 10 # epoch
LR = 5  # learning rate
BATCH_SIZE = 64 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=.1)
total_accu = None
train_dataset = to_map_style_dataset(data_set)
test_dataset = to_map_style_dataset(test_set)

num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE, shuffle=True,  collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
# print(train_df)
# train = next(iter(train_dataloader))



for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

-----------------------------------------------------------
| end of epoch   1 | time:  0.62s | valid accuracy    0.961 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time:  0.61s | valid accuracy    0.971 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time:  0.59s | valid accuracy    0.942 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time:  0.59s | valid accuracy    0.974 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time:  0.62s | valid accuracy    0.974 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   6 | time:  0.59s |

In [84]:
def collate_test(batch):
  text_list, offsets = [], [0]
  for _text in batch:
    processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
    text_list.append(processed_text)
    offsets.append(processed_text.size(0))
  offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
  text_list = torch.cat(text_list)
  return  text_list.to(device), offsets.to(device)

In [93]:
test_dataloader = DataLoader(data_set, batch_size=64, shuffle=False, collate_fn=collate_test)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_test)

In [94]:
def test_evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    predicted_labels = []
    with torch.no_grad():
        for idx, (text, offsets) in enumerate(dataloader):
            predited_label = model(text, offsets)
            predicted_labels.append(predited_label)
    return predicted_labels

In [98]:
def predict(text, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text))
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item()

In [123]:
print('Checking the results of test dataset.')
test_labels = test_evaluate(test_dataloader)
test_predictions = []
for id, text in test_set:
  test_predictions.append((id, predict(text, text_pipeline)))

print(test_predictions)

Checking the results of test dataset.
[(0, 1), (2, 1), (3, 1), (9, 1), (11, 1), (12, 1), (21, 0), (22, 0), (27, 0), (29, 0), (30, 0), (35, 1), (42, 0), (43, 0), (45, 0), (46, 0), (47, 0), (51, 1), (58, 0), (60, 0), (69, 0), (70, 0), (72, 0), (75, 1), (84, 0), (87, 1), (88, 0), (90, 1), (94, 0), (99, 1), (101, 0), (103, 1), (106, 1), (108, 0), (111, 1), (115, 0), (116, 0), (122, 0), (123, 0), (124, 1), (125, 0), (127, 0), (140, 1), (142, 1), (147, 0), (148, 0), (150, 0), (152, 0), (154, 0), (155, 0), (166, 0), (167, 0), (169, 1), (177, 0), (179, 0), (181, 0), (186, 0), (188, 0), (189, 0), (192, 0), (200, 1), (202, 0), (206, 1), (207, 1), (214, 1), (217, 1), (223, 0), (224, 1), (227, 1), (228, 0), (230, 0), (233, 1), (234, 1), (236, 1), (239, 1), (250, 1), (255, 0), (257, 0), (259, 1), (275, 1), (278, 0), (282, 0), (284, 0), (286, 1), (288, 1), (292, 0), (295, 0), (300, 1), (304, 1), (305, 1), (306, 0), (308, 0), (311, 0), (317, 0), (319, 0), (323, 0), (324, 0), (325, 0), (326, 0), (333,

In [131]:
df = pd.DataFrame(test_predictions, columns=['id', 'target'])
df

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [135]:

df.to_csv('submissiontwo.csv', index=False)

In [136]:
compression_opts = dict(method='zip',

                        archive_name='out.csv')  

df.to_csv('out.zip', index=False,

          compression=compression_opts)  