In [39]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as f
import spacy
import sklearn as sl

In [40]:
word_embeddings = spacy.load('en', vectors='glove.6B.300d.txt')

In [59]:
def sequence_to_data(seq, max_len=None):    ####Converting sequence to data basically convertig words to vectors
    seq = unicode(seq, 'utf-8')
    data = [word_embeddings(ix).vector for ix in seq.split()]
    if max_len == None:
        max_len = len(data)
        
    data_mat = np.zeros((1, max_len, 384))
    for ix in range(min(max_len, len(data))):
        data_mat[:, ix, :] = data[ix]

    return data_mat

def seq_data_matrix(seq_data, max_len=None):  ####Now Concating different sentences and converting to a matrix
    data = np.concatenate([sequence_to_data(ix, max_len) for ix in seq_data], axis=0)
    return data

In [42]:
df = pd.read_csv("/Users/adityakumar/Desktop/dataset/final.csv")#loading dataset

In [43]:
df.head()

Unnamed: 0.1,Unnamed: 0,S.No.,username,label,tweet,preprocessed tweet
0,0,1,mipjournal,0,This 23-year-old woman just opened India's fir...,woman open india first rehab clinic acid attac...
1,1,2,abigailsims1,0,Preeti Rathi case: Death for India acid attack...,preeti rathi case death india acid attack convict
2,2,3,aazaadee,0,@Pedal_India @laxit_parsana @StopAcidAttacks T...,babi joy sonali mukherje india whose skin melt...
3,3,4,vbd2015,0,Russian acid attack victim goes back; India to...,russian acid attack victim goe back india pay ...
4,4,5,chrisproberts,0,#India is ignoring its growing #acidattack cri...,india ignor grow acidattack crisi brave surviv...


In [44]:
df = df.drop('Unnamed: 0', axis=1)
df = df.dropna()
df = df.reset_index(drop=True)

In [45]:
df.head()


Unnamed: 0,S.No.,username,label,tweet,preprocessed tweet
0,1,mipjournal,0,This 23-year-old woman just opened India's fir...,woman open india first rehab clinic acid attac...
1,2,abigailsims1,0,Preeti Rathi case: Death for India acid attack...,preeti rathi case death india acid attack convict
2,3,aazaadee,0,@Pedal_India @laxit_parsana @StopAcidAttacks T...,babi joy sonali mukherje india whose skin melt...
3,4,vbd2015,0,Russian acid attack victim goes back; India to...,russian acid attack victim goe back india pay ...
4,5,chrisproberts,0,#India is ignoring its growing #acidattack cri...,india ignor grow acidattack crisi brave surviv...


In [46]:
df['len'] = ' '   ###Adding column 'len' for no. of words in preprocessed tweet

In [47]:
for ix in range(df.shape[0]):    ###Now assigning values in cells of column of 'len'
    a = len(str(df['preprocessed tweet'].loc[ix]).split())
    df.loc[ix, 'len'] = a

In [48]:
bucket_sizes = [[0, 5], [5, 10], [10, 15], [15, 20], [20, 25], [25, 35]]

def assign_bucket(x):       ###making buckets of different sizes and assigning bucket to tweets according to their 'len'
    for bucket in bucket_sizes:
        if x>=bucket[0] and x<=bucket[1]:
            return bucket_sizes.index(bucket)
    return len(bucket_sizes)-1

In [49]:
df['bucket'] = df.len.apply(assign_bucket)
df.head()

Unnamed: 0,S.No.,username,label,tweet,preprocessed tweet,len,bucket
0,1,mipjournal,0,This 23-year-old woman just opened India's fir...,woman open india first rehab clinic acid attac...,9,1
1,2,abigailsims1,0,Preeti Rathi case: Death for India acid attack...,preeti rathi case death india acid attack convict,8,1
2,3,aazaadee,0,@Pedal_India @laxit_parsana @StopAcidAttacks T...,babi joy sonali mukherje india whose skin melt...,11,2
3,4,vbd2015,0,Russian acid attack victim goes back; India to...,russian acid attack victim goe back india pay ...,15,2
4,5,chrisproberts,0,#India is ignoring its growing #acidattack cri...,india ignor grow acidattack crisi brave surviv...,9,1


In [50]:
df = df.sort_values(by=['bucket'])
df.head()

Unnamed: 0,S.No.,username,label,tweet,preprocessed tweet,len,bucket
1008,1010,stjagadeesh,0,http://www.ddinews.gov.in/National/National%20...,india,1,0
4450,4463,heyramani,0,@iamsrk Acid Attack in India rise by 300% in 3...,iamsrk acid attack india rise,5,0
3723,3734,faryaalshakeel,0,Death for India acid attack convict http://dl...,death india acid attack convict,5,0
5698,5711,masala_chaas,0,@ShareThis 1 more acid attack on girls in Indi...,acid attack girl india via,5,0
3703,3714,dw_urdu,0,Ù,ù,1,0


In [51]:
df['preprotweet'] = df['preprocessed tweet']

In [52]:
df = df.drop('preprocessed tweet', axis=1)

In [53]:
df.head()

Unnamed: 0,S.No.,username,label,tweet,len,bucket,preprotweet
1008,1010,stjagadeesh,0,http://www.ddinews.gov.in/National/National%20...,1,0,india
4450,4463,heyramani,0,@iamsrk Acid Attack in India rise by 300% in 3...,5,0,iamsrk acid attack india rise
3723,3734,faryaalshakeel,0,Death for India acid attack convict http://dl...,5,0,death india acid attack convict
5698,5711,masala_chaas,0,@ShareThis 1 more acid attack on girls in Indi...,5,0,acid attack girl india via
3703,3714,dw_urdu,0,Ù,1,0,ù


In [54]:
def make_batch(data, batch_size=10, gpu=False):# making batches to pass in model during training
    for bx in range(len(bucket_sizes)):
        bucket_data = df[(df.bucket==bx)].reset_index(drop=True)
        
        start = 0
        stop = start + batch_size
        while start < bucket_data.shape[0]:
            seq_len = bucket_sizes[bx][1]
            section = bucket_data[start:stop]
            xdata = seq_data_matrix(section.preprotweet, max_len=seq_len)
            ydata = section.label
            if gpu == True:
                yield Variable(torch.FloatTensor(xdata).cuda(), requires_grad=True), Variable(torch.LongTensor(ydata)).cuda()
            else:
                yield Variable(torch.FloatTensor(xdata), requires_grad=True), Variable(torch.LongTensor(ydata))
            
            start = stop
            stop = start + batch_size
    

### Model of LSTM

In [66]:
class SentModel(nn.Module):
    def __init__(self, in_shape=None, out_shape=None, hidden_shape=None):
        super(SentModel, self).__init__()
        self.in_shape = in_shape
        self.out_shape = out_shape
        self.hidden_shape = hidden_shape
        self.n_layers = 1
        
        self.rnn = nn.LSTM(
                        input_size = self.in_shape,
                        hidden_size = self.hidden_shape,
                        num_layers = self.n_layers,
                        batch_first = True
        )
        self.lin = nn.Linear(self.hidden_shape, 64)
        self.dropout = nn.Dropout(0.42)
        self.out = nn.Linear(64, self.out_shape)
        
        
    def forward(self, x, h):
        r_out, h_state = self.rnn(x, h)
        last_out = r_out[:, -1, :]
        y = f.tanh(self.lin(last_out))
        y = self.dropout(y)
        y = f.softmax(self.out(y))
        return y
    
    def predict(self, x):
        h_state = self.init_hidden(1)    
        x = sequence_to_data(x)
        pred = self.forward(torch.FloatTensor(x), h_state)
        return pred
    
    def get_embedding(self, x):
        h_state = self.init_hidden(1, gpu=False)
        
        x = sequence_to_data(x)
        r_out, h = self.rnn(torch.FloatTensor(x), h_state)
        last_out = r_out[:, -1, :]
        
        return last_out.data.numpy()
    
    def init_hidden(self, batch_size):
        return (Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape)),
                Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape)))

In [67]:
model = SentModel(in_shape=384, hidden_shape=256, out_shape=2)

print(model)

SentModel(
  (rnn): LSTM(384, 256, batch_first=True)
  (lin): Linear(in_features=256, out_features=64, bias=True)
  (dropout): Dropout(p=0.42)
  (out): Linear(in_features=64, out_features=2, bias=True)
)


In [57]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)
criterion = nn.CrossEntropyLoss()

In [60]:
for epoch in range(50):
    total_loss = 0
    N = 0
    for step, (b_x, b_y) in enumerate(make_batch(df, batch_size=200)):
        bsize = b_x.size(0)
        
        h_state = model.init_hidden(bsize)

        pred = model(b_x, h_state)
        loss = criterion(pred, b_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss
        N += 1.0
        if step%2 == 0:
            print('Loss: {} at Epoch: {} | Step: {}'.format(loss, epoch, step))
        
    print("Overall Average Loss: {} at Epoch: {}".format(total_loss / float(N), epoch))
    
 

torch.save(model.state_dict(), "model_256h_epoch_{}.ckpt".format(epoch))



Loss: 0.727124214172 at Epoch: 0 | Step: 0
Loss: 0.656556487083 at Epoch: 0 | Step: 2
Loss: 0.59147053957 at Epoch: 0 | Step: 4
Loss: 0.535307586193 at Epoch: 0 | Step: 6
Loss: 0.466973215342 at Epoch: 0 | Step: 8
Loss: 0.414955943823 at Epoch: 0 | Step: 10
Loss: 0.381862610579 at Epoch: 0 | Step: 12
Loss: 0.357407063246 at Epoch: 0 | Step: 14
Loss: 0.361554533243 at Epoch: 0 | Step: 16
Loss: 0.345578581095 at Epoch: 0 | Step: 18
Loss: 0.338868021965 at Epoch: 0 | Step: 20
Loss: 0.339243292809 at Epoch: 0 | Step: 22
Loss: 0.356450259686 at Epoch: 0 | Step: 24
Loss: 0.354683995247 at Epoch: 0 | Step: 26
Loss: 0.348700225353 at Epoch: 0 | Step: 28
Loss: 0.350509762764 at Epoch: 0 | Step: 30
Loss: 0.348987787962 at Epoch: 0 | Step: 32
Overall Average Loss: 0.427442103624 at Epoch: 0
Loss: 0.353876769543 at Epoch: 1 | Step: 0
Loss: 0.322907477617 at Epoch: 1 | Step: 2
Loss: 0.322574436665 at Epoch: 1 | Step: 4
Loss: 0.317683547735 at Epoch: 1 | Step: 6
Loss: 0.326995044947 at Epoch: 1 | St

KeyboardInterrupt: 

In [61]:
torch.save(model.state_dict(), "model_256h_lstm.ckpt")

In [62]:
model.eval()

SentModel(
  (rnn): LSTM(384, 256, batch_first=True)
  (lin): Linear(in_features=256, out_features=64, bias=True)
  (dropout): Dropout(p=0.42)
  (out): Linear(in_features=64, out_features=2, bias=True)
)

In [69]:
model.load_state_dict(torch.load("model_256h_lstm.ckpt"))

In [78]:
model.predict('indianlovestori never let anyon hurt throw acid face make point india love acid')



tensor([[ 0.9968,  0.0032]])