In [1]:
import torch

In [2]:
%config Completer.use_jedi = False

In [3]:
from fastai.vision.all import *



In [4]:
import pdb

In [7]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from torchtext.vocab import build_vocab_from_iterator
import pandas as pd
from torchtext.vocab import GloVe
from torch.utils.data.dataloader import DataLoader 
from torch.utils.data import Dataset
from fastai.learner import Learner
import torch.nn as nn
from pdb import set_trace

In [6]:
def clean(comment):
    """
    This function receives comments and returns clean word-list
    """
    #Convert to lower case , so that Hi and hi are the same
    comment=comment.lower()
    #remove \n
    comment=re.sub("\\n","",comment)
    # remove leaky elements like ip,user
    comment=re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}","",comment)
    #removing usernames
    comment=re.sub("\[\[.*\]","",comment)
    return comment

In [8]:




class Comment_Dataset (Dataset):

    def __init__(self,csv_file='DATA/comment_csv_data/train.csv',max_rows=5000,min_word_freq=1,pretrained_emb=False,dim=50):
        """
        Args:
             img_file_list:list of image file extensions to be read
         
            preprocess (callable, optional): Optional preproc to be applied
                on a sample.
        """
        self.df=pd.read_csv(Path(csv_file))[:max_rows]
        self.comment_cats= self.df.columns[np.logical_not(np.isin(self.df.columns,["id","comment_text"]))]
        self.comment_list=[clean(sentence).split() for sentence in self.df['comment_text']]
        self.voc = build_vocab_from_iterator(self.comment_list, min_freq=1, specials=["<unk>"])
        self.word2num=self.voc.get_stoi()
        self.max_seq_len=max([len(comment) for comment in self.comment_list])
        self.pretrained=pretrained_emb
        if pretrained_emb:
            self.emb=GloVe(dim=dim,name='6B')
       
        
        
    

    def __len__(self):
        return len(self.df)


    def __getitem__(self, idx):
        ##if torch.is_tensor(idx):
         ##   idx = idx.tolist()
         comment=self.comment_list[idx]
         comment_label=torch.tensor(self.df[self.comment_cats].iloc[idx]).float()
        
         ## clean
        
         if self.pretrained:
            comment_emb=self.emb.get_vecs_by_tokens(comment, lower_case_backup=True).mean(dim=0)
            
         else:
             tokenized=torch.LongTensor([self.word2num[word] for word in comment])
             comment_emb=torch.cat((torch.zeros(self.max_seq_len-len(tokenized),dtype=torch.int64),tokenized))
          
            
         return( comment_emb,comment_label)
        
        
            
       

In [13]:
glove_embedd

(#5) [Path('DATA/15053_14.jpeg'),Path('DATA/comment_csv_data'),Path('DATA/Cyto_Segmentation_LATEST'),Path('DATA/data'),Path('DATA/test.csv')]

In [74]:
cds_pt=Comment_Dataset(max_rows=10000,pretrained_emb=True)

In [9]:
def get_test_train_ds(ds,train_pct=0.8):
    train_size = int(train_pct * len(ds))
    test_size = len(ds) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(ds, [train_size, test_size])
    return train_dataset, test_dataset




In [11]:
cds.word2num

{'pushes': 10543,
 'descent.': 9662,
 'excesive': 42473,
 'newspapers': 5283,
 '"wangi': 26306,
 '<unk>': 0,
 'inventor': 7223,
 'gotten': 2654,
 'disposal.': 17549,
 'you': 7,
 'age': 1189,
 'overall.': 55607,
 '(talk)': 280,
 '#cef2e0;': 26402,
 'message': 385,
 'all': 43,
 'it.talk': 48827,
 'talk:articles': 14325,
 'haslett,': 45817,
 'wars.': 6719,
 'cluba': 37164,
 'dramatic': 5633,
 'army': 1607,
 'sure,': 2949,
 'gees)': 44516,
 'persian-speaking': 13611,
 '")': 9096,
 'aidit': 31584,
 'continue': 311,
 '""[b]logs': 23412,
 'or': 25,
 'cunts': 8054,
 'abenaki,': 30818,
 'counting': 4111,
 '10': 1117,
 'servitude""': 61778,
 'tell.': 14344,
 '(utc)wp:or': 28220,
 'genius,': 9915,
 'helena': 18562,
 'ambrosius,': 32023,
 'credit': 2429,
 'rationale.': 10571,
 'deathscytheactually,': 39342,
 'to': 2,
 'i': 6,
 'block.': 1727,
 'discussion/talk': 40326,
 'repeatedly': 1228,
 'whitewashed': 69313,
 'phenomena': 13627,
 'forget': 1843,
 'rhetoric;': 60190,
 'council': 2527,
 "''spiel

In [130]:
torch.zeros(10,dtype=torch.int64)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [20]:
# Create RNN Model
class Comment_LSTM(nn.Module):
    def __init__(self,max_comment_len,vocab_len, embedding_dim, hidden_dim, num_layers, num_comment_cats):
        super(Comment_LSTM, self).__init__()
        
        self.embedding=nn.Embedding(vocab_len,embedding_dim)
        
      
        # LSTM
        self.lstm=nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        
        # Comment category o/P layer
        self.fc = nn.Linear(hidden_dim,  num_comment_cats)
    
    def forward(self, x):
        
        # Initialize hidden state with zeros
       # h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))
            
        # One time step
        embeddings=(x>0).unsqueeze(2)*self.embedding(x)
        output_seq, final_state = self.lstm(embeddings)
        hidden_final, cell_final = final_state
        ### For seq-seq (whole sequence of o/ps,uncomment following:)
       # out = self.fc(out)
        out_all_states=self.fc(output_seq.mean(dim=1))
        out_last_hidden=self.fc(hidden_final[-1])
        out_last_cell=self.fc(cell_final[-1])
        return out_last_hidden

In [91]:
class baseline_log_reg(nn.Module):
    def __init__(self,max_comment_len,vocab_len, embedding_dim,num_comment_cats):
        super(baseline_log_reg, self).__init__()
        self.embedding=nn.Embedding(vocab_len,embedding_dim)
        self.fc = nn.Linear(embedding_dim, num_comment_cats)
    
    def forward(self, x):
        
        # Initialize hidden state with zeros
       # h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))
            
        # One time step
        sentence_lengths=(x>0).sum(dim=1)
        embeddings=(x>0).unsqueeze(2)*self.embedding(x)
        
        avg_cont=embeddings.sum(dim=1)/sentence_lengths.unsqueeze(1)
      
        ### For seq-seq (whole sequence of o/ps,uncomment following:)
       # out = self.fc(out)
        out_logits=self.fc(avg_cont)
        return out_logits
    
    

class baseline_log_reg_Pretrained(nn.Module):
    def __init__(self, embedding_dim,num_comment_cats):
        super(baseline_log_reg_Pretrained, self).__init__()
        
        self.fc = nn.Linear(embedding_dim, num_comment_cats)
    
    def forward(self, x):
        out_logits=self.fc(x)
        return out_logits    

    

In [17]:
embeddings=(x>0).unsqueeze(2)*embedding(x)
       

NameError: name 'self' is not defined

In [86]:
train_dl,test_dl=[DataLoader(dataset,batch_size=256,shuffle=True) for dataset in get_test_train_ds(cds_pt)]

tensor([[    0,     0,     0,  ...,   968,  5759, 37915],
        [    0,     0,     0,  ..., 24190,  6808,    23],
        [    0,     0,     0,  ...,   675,  2321,    23],
        ...,
        [    0,     0,     0,  ...,    30,   189,   157],
        [    0,     0,     0,  ...,     1,    65, 12129],
        [    0,     0,     0,  ...,     5,  1761, 22341]], device='cuda:0')

In [88]:
x,y=next(iter(train_dl))

x.shape

torch.Size([256, 50])

In [25]:
def get_baseline_model(cds,embedding_dim=300):
    return baseline_log_reg(max_comment_len=cds.max_seq_len,vocab_len=len(cds.word2num),embedding_dim=embedding_dim,num_comment_cats=len(cds.comment_cats))

In [92]:
baseline_pt=baseline_log_reg_Pretrained(embedding_dim=cds_pt.emb.dim,num_comment_cats=len(cds_pt.comment_cats))



In [93]:
comment_cats={cat:i for i,cat in enumerate(cds.comment_cats)}

In [94]:
from functools import partial

In [109]:
accuracy_toxic=partial(accuracy_category,cat='toxic')
accus=[partial(accuracy_category,cat=cat) for cat in comment_cats]
accus

[functools.partial(<function accuracy_category at 0x0000028D86220550>, cat='toxic'),
 functools.partial(<function accuracy_category at 0x0000028D86220550>, cat='severe_toxic'),
 functools.partial(<function accuracy_category at 0x0000028D86220550>, cat='obscene'),
 functools.partial(<function accuracy_category at 0x0000028D86220550>, cat='threat'),
 functools.partial(<function accuracy_category at 0x0000028D86220550>, cat='insult'),
 functools.partial(<function accuracy_category at 0x0000028D86220550>, cat='identity_hate')]

In [96]:
def accuracy_category(, inp,targ,cat, thresh=0.5, sigmoid=True):
    "Compute accuracy when `inp` and `targ` are the same size."
    cat_offset=comment_cats[cat]
    inp,targ=inp[:,cat_offset],targ[:,cat_offset]
    inp,targ = flatten_check(inp,targ)
    if sigmoid: inp = inp.sigmoid()
    return ((inp>thresh)==targ.bool()).float().mean()

In [9]:
train_size = int(0.8 * len(cds))
test_size = len(cds) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(cds, [train_size, test_size])

In [13]:
cmt_lstm=Comment_LSTM(max_comment_len=cds.max_seq_len,vocab_len=len(cds.word2num),embedding_dim=25,hidden_dim=100,num_layers=2,num_comment_cats=len(cds.comment_cats))

In [107]:
baseline_Learner_pt=Learner(dls=DataLoaders(train_dl,test_dl),model=baseline_pt,loss_func=nn.BCEWithLogitsLoss(),metrics=[accuracy_multi]+accus)


In [108]:
baseline_Learner_pt.fit(10)

epoch,train_loss,valid_loss,accuracy_multi,accuracy_category,accuracy_category.1,accuracy_category.2,accuracy_category.3,accuracy_category.4,accuracy_category.5,time
0,0.169061,0.159624,0.963167,0.904,0.9865,0.947,0.998,0.9505,0.993,00:10
1,0.159248,0.150855,0.96325,0.904,0.9865,0.947,0.998,0.9505,0.9935,00:10
2,0.151188,0.144637,0.963167,0.904,0.9865,0.9465,0.998,0.9505,0.9935,00:10
3,0.146565,0.139945,0.963333,0.905,0.9865,0.9465,0.998,0.9505,0.9935,00:10
4,0.142581,0.136216,0.9635,0.906,0.9865,0.9465,0.998,0.9505,0.9935,00:11
5,0.139387,0.133093,0.96375,0.9075,0.9865,0.9465,0.998,0.9505,0.9935,00:10
6,0.135238,0.130467,0.964083,0.909,0.9865,0.947,0.998,0.9505,0.9935,00:11
7,0.131817,0.12819,0.964167,0.909,0.9865,0.9475,0.998,0.9505,0.9935,00:10
8,0.128684,0.126204,0.964167,0.909,0.9865,0.9475,0.998,0.9505,0.9935,00:11
9,0.125856,0.124397,0.964083,0.9085,0.9865,0.9475,0.998,0.9505,0.9935,00:10


In [50]:
x[0].unsqueeze(0)

tensor([  0,   0,   0,  ...,   3, 370,  22])

In [26]:
(y.sum(dim=1)>0).sum()

tensor(34)

In [58]:
baseline_Learner.model(x[0].unsqueeze(0))
F

<module 'torch.nn.functional' from 'D:\\Anaconda3\\envs\\deeperlearning\\lib\\site-packages\\torch\\nn\\functional.py'>

In [65]:
with torch.no_grad():
    out=baseline_Learner.model(x[0].unsqueeze(0))
    out=F.softmax(out,dim=1)

out.sum()

tensor([[0.0746, 0.1133, 0.0045, 0.1096, 0.1371, 0.1697]])


tensor(1.)

In [211]:
bce_loss=nn.BCEWithLogitsLoss()

In [216]:
bce_loss(out,y.float())

tensor(0.6883, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

In [215]:
y.float()

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0

In [10]:
x,y=next(iter(train_dl))

In [11]:
x.shape

torch.Size([256, 1400])

In [21]:
cmt_baseline=baseline_log_reg(max_comment_len=cds.max_seq_len,vocab_len=len(cds.word2num),embedding_dim=300,num_comment_cats=len(cds.comment_cats))

In [42]:
y

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        ...,
        [0., 0., 0., 0., 0., 0.],
        [1., 0., 1., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0.]], device='cuda:0')

In [62]:
voc.get_stoi()

{'dash.': 68400,
 '/ts/.)': 229274,
 'descent.': 13757,
 '(Obviously,': 219208,
 'Nobel""': 144049,
 'problem...I': 174341,
 'pushes': 16863,
 'Phdarts': 145648,
 '""Samraj""': 118222,
 'http://www.usatoday.com/tech/columnist/kevinmaney/2006-09-12-wisdom-of-crowds_x.htm': 441908,
 'newspapers': 4363,
 '1.TABFirst': 124629,
 'Gullible': 292368,
 'Pot-Bouille': 334349,
 'excesive': 163678,
 'INFORMATION.': 23057,
 'air,': 15354,
 'turk': 48354,
 'sook..': 498800,
 '<unk>': 0,
 '17:24,': 91806,
 'Darchinyan': 95203,
 'CheeseDreams,': 268278,
 'madness.': 47468,
 'Sutra:': 353468,
 'Universities)': 363718,
 'coarse': 34259,
 'reread': 12362,
 '✄TAB': 37721,
 'leave,': 17675,
 'Army': 2897,
 'Supergabbyshoe': 353216,
 '14:46,': 91698,
 'Or': 839,
 'editor..': 162860,
 'vaccine': 13639,
 'Plaster': 333392,
 'strancp': 501830,
 'misguided': 8742,
 'Jersey!': 77637,
 'ANCIENTS': 248306,
 'smothered': 497717,
 'feedback.': 7521,
 '""...accepted': 184752,
 'tough....': 511395,
 'service': 1731,


In [22]:
com=df_train['comment_text'][0]
print(clean(com))

TypeError: unhashable type: 'list'

In [25]:
def clean(comment):
    """
    This function receives comments and returns clean word-list
    """
    #Convert to lower case , so that Hi and hi are the same
    comment=comment.lower()
    #remove \n
    comment=re.sub("\\n","",comment)
    # remove leaky elements like ip,user
    comment=re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}","",comment)
    #removing usernames
    comment=re.sub("\[\[.*\]","",comment)
    return comment

In [None]:
cds.df[]

In [1]:
1

1

In [5]:
accu

NameError: name 'cds' is not defined