0. Initial setting

In [None]:
import numpy as np
import pandas as pd
import urllib.request

urllib.request.urlretrieve("https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv", filename="spam.csv")
data = pd.read_csv('spam.csv', encoding='latin1')

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [4]:
del data['Unnamed: 2']
del data['Unnamed: 3']
del data['Unnamed: 4']

data['v1'] = data['v1'].replace(['ham','spam'],[0,1])
data['text'] = data['v2']
data['isSpam'] = data['v1']

del data['v1'], data['v2']

print(f'Data Shape: {data.shape}')
# imbalanced data
print(data['isSpam'].value_counts())
data.head()

Data Shape: (5572, 2)
0    4825
1     747
Name: isSpam, dtype: int64


Unnamed: 0,text,isSpam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
from sklearn.model_selection import train_test_split

X, y = data['text'], data['isSpam']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,
                                                   stratify=y, test_size=0.1)

print(len(X_train), len(X_test))

5014 558


In [18]:
print(y_train.value_counts())
print(y_test.value_counts())

0    4342
1     672
Name: isSpam, dtype: int64
0    483
1     75
Name: isSpam, dtype: int64


1. Preprocessing

In [None]:
import re
import nltk

In [31]:
def preprocess(string: str, *args, **kwargs) -> str:
    string = re.sub('[^a-zA-Z]', ' ', string)
    string = re.sub('[^\w\s]', ' ', string)
    string = string.lower()
    
    shortword = re.compile(r'\W*\b\w{1,2}\b')
    string = shortword.sub('', string)
    
    return string

X_pre = X_train.apply(preprocess)
print(X_pre)

5448        aight can pick some  you open before tonight 
1707    was doing test earlier appreciate you  will ca...
2117    wish many many returns the day   happy birthda...
1357    good afternoon loverboy   how goes you day   a...
787     ever thought about living good life with perfe...
                              ...                        
1054    hiya comin   bristol week april  les got off  ...
245     too late said have the website didn have dont ...
1235    your opinion about     over    jada    kusruth...
3361                       messages her phone holding now
5071    win       shopping spree every week starting n...
Name: text, Length: 5014, dtype: object


2. Tokenizing

In [None]:
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

In [37]:
def tokenize(string: str, *args, **kwargs) -> list:
    token = word_tokenize(string)
    stop_words = set(stopwords.words('english'))
    res = []
    for x in token:
        if x not in stop_words:
            res.append(x)
    
    return res

X_token = X_pre.apply(tokenize)
print(X_token)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\socal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\socal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


5448                         [aight, pick, open, tonight]
1707          [test, earlier, appreciate, call, tomorrow]
2117    [wish, many, many, returns, day, happy, birthd...
1357    [good, afternoon, loverboy, goes, day, luck, c...
787     [ever, thought, living, good, life, perfect, p...
                              ...                        
1054    [hiya, comin, bristol, week, april, les, got, ...
245                 [late, said, website, dont, slippers]
1235    [opinion, jada, kusruthi, lovable, silent, spl...
3361                           [messages, phone, holding]
5071    [win, shopping, spree, every, week, starting, ...
Name: text, Length: 5014, dtype: object


3. Building Vocabulary

In [42]:
from collections import Counter

[('call', 545), ('get', 352), ('free', 261), ('know', 240), ('got', 230)]


In [48]:
def build_vocab(n, *args, **kwargs):
    voca = Counter(words)    
    vocab = voca.most_common(n)
    
    dict = {word: index+2 for index, (word, count) in enumerate(vocab)}
    dict['padding_idx'] = 0
    dict['unk_idx'] = 1
    
    return dict

dictionary = build_vocab(100)
print(dictionary)

{'call': 2, 'get': 3, 'free': 4, 'know': 5, 'got': 6, 'good': 7, 'day': 8, 'like': 9, 'come': 10, 'time': 11, 'love': 12, 'send': 13, 'want': 14, 'text': 15, 'txt': 16, 'one': 17, 'going': 18, 'need': 19, 'today': 20, 'home': 21, 'still': 22, 'lor': 23, 'stop': 24, 'sorry': 25, 'see': 26, 'back': 27, 'dont': 28, 'mobile': 29, 'reply': 30, 'tell': 31, 'take': 32, 'later': 33, 'new': 34, 'please': 35, 'think': 36, 'pls': 37, 'week': 38, 'phone': 39, 'dear': 40, 'night': 41, 'well': 42, 'msg': 43, 'great': 44, 'hey': 45, 'much': 46, 'happy': 47, 'hope': 48, 'wat': 49, 'claim': 50, 'way': 51, 'make': 52, 'yes': 53, 'work': 54, 'give': 55, 'na': 56, 'www': 57, 'wan': 58, 'right': 59, 'message': 60, 'already': 61, 'say': 62, 'prize': 63, 'number': 64, 'said': 65, 'ask': 66, 'cash': 67, 'win': 68, 'yeah': 69, 'tomorrow': 70, 'find': 71, 'meet': 72, 'amp': 73, 'babe': 74, 'let': 75, 'really': 76, 'cos': 77, 'life': 78, 'would': 79, 'every': 80, 'com': 81, 'miss': 82, 'thanks': 83, 'last': 84, 

4. to Tensor

In [49]:
import torch
from torch.autograd import Variable

In [59]:
def toTensor(max_len, *args, **kwargs) -> torch.LongTensor:
    encoding = []
    for seq in X_pre:
        encoding.append([dictionary.get(word,1) for word in seq.split()])
    
    seq_length = torch.LongTensor(list(map(len, encoding)))
    
    seq_tensor = Variable(torch.zeros((len(encoding), seq_length.max()))).long()
    for idx, (seq, seqlen) in enumerate(zip(encoding, seq_length)):
        if seqlen < max_len:
            seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
        else:
            seq_tensor[idx, :max_len] = torch.LongTensor(seq[:max_len])

    return encoding, seq_tensor, seq_length

encoding, seq_tensor, seq_length = toTensor(100)
print(encoding, seq_tensor, seq_length)

[[1, 1, 88, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 2, 1, 70], [1, 1, 1, 1, 1, 8, 47, 1, 1], [7, 1, 1, 1, 1, 1, 8, 1, 1, 10, 1, 51, 36, 1, 1, 1, 13, 12, 1, 1, 1, 52, 1, 1, 1, 47], [1, 1, 1, 1, 7, 78, 1, 1, 1, 1, 16, 27, 1, 1, 1, 1, 1, 29, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 75, 1, 5], [1, 1, 1, 85], [1, 1, 2], [1, 8, 1, 68, 1, 1, 1, 1, 32, 1, 1, 1, 1, 1, 13, 1, 43, 1, 1], [86, 1, 1, 1, 1], [1, 65, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [25, 1, 1, 1, 1], [86, 1, 1, 1, 1, 1, 1, 18, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 38, 1, 34, 1], [1, 1, 1, 1, 1, 1, 1], [29, 1, 1, 1, 1, 98, 1, 57, 1, 81, 44, 1, 13, 4, 1, 1, 1, 1, 1, 1, 81], [1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 99, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1], [1, 1, 1, 1, 1, 11, 1, 61], [3, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [23], [35, 1, 1, 39], [1, 1, 1, 1, 1, 1, 32, 1, 1, 1, 1, 1, 1], [1, 1, 1, 55, 1], [1, 82, 1], [1, 1, 1, 79, 1, 9, 1], [37, 1, 27, 1, 1, 1, 1, 1, 1, 1, 1, 27, 1], [10, 5

In [None]:
class MailDataset(Dataset):
    '''
    your code here
    '''

dataset = MailDataset() # your code
train_loader = DataLoader(dataset) # your code

In [None]:

class SpamClassifier(nn.Module):
    '''
    your code here
    '''

def train():
    '''
    your code here
    '''

def eval():
    '''
    your code here
    '''