# MIDAS Summer Internship Task
### Problem 2: NLP Problem
**Suggestion Mining**

In [1]:
import pandas as pd
import numpy as np
import csv
import os
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.models import FastText
from nltk.tokenize import sent_tokenize, word_tokenize
import logging

In [2]:
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)

***Creating word embeddings***

In [3]:
y_train = []
x_train = []
sentences = []
with open('data/V1.4_Training.csv', 'r') as training_file:
    for line in csv.reader(training_file, delimiter=','):
        y_train.append(int(line[2]))
        sentences.append(line[1].strip('"'))

In [4]:
len(y_train)

8500

In [5]:
len(list(filter(lambda x: x == 1, y_train)))

2085

In [6]:
len(list(filter(lambda x: x == 0, y_train)))

6415

In [7]:
for i in sentences:
    temp = []
    
    for j in word_tokenize(i):
        temp.append(j.lower())
    
    x_train.append(temp)

In [8]:
if not os.path.isdir('embedding'):
    w2v = Word2Vec(x_train, sg=1, size=100, min_count=1, window=10, workers=2, iter=10)
    ft = FastText(x_train, size=100, window=10, min_count=1, iter=10)
    os.mkdir('embedding')
    w2v.wv.save_word2vec_format('embedding/w2v')
    ft.save('embedding/ft')
    wv = w2v.wv
    del w2v

else:
    wv = KeyedVectors.load_word2vec_format('embedding/w2v')
    ft = FastText.load('embedding/ft')

INFO : loading projection weights from embedding/w2v
INFO : loaded (11220, 100) matrix from embedding/w2v
INFO : loading FastText object from embedding/ft
INFO : loading wv recursively from embedding/ft.wv.* with mmap=None
INFO : loading vectors_ngrams from embedding/ft.wv.vectors_ngrams.npy with mmap=None
INFO : setting ignored attribute vectors_norm to None
INFO : setting ignored attribute vectors_vocab_norm to None
INFO : setting ignored attribute vectors_ngrams_norm to None
INFO : setting ignored attribute buckets_word to None
INFO : loading vocabulary recursively from embedding/ft.vocabulary.* with mmap=None
INFO : loading trainables recursively from embedding/ft.trainables.* with mmap=None
INFO : loading vectors_ngrams_lockf from embedding/ft.trainables.vectors_ngrams_lockf.npy with mmap=None
INFO : loaded embedding/ft


***Using Linear SVC***

In [9]:
from sklearn import svm
from sklearn import metrics

In [10]:
x_train_v = np.array([np.array(list(map(lambda x: wv[x], x))) for x in x_train])
x_train_v = np.array([np.sum(i, axis=0) for i in x_train_v])

y_train = np.array(y_train)

print(x_train_v.shape)
print(y_train.shape)

(8500, 100)
(8500,)


In [11]:
svc = svm.SVC(kernel='linear')
svc.fit(x_train_v, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [12]:
y_test = []
x_test = []
sentences = []
# test = pd.read_csv('data/SubtaskA_Trial_Test_Labeled.csv', encoding='latin-1')
with open('data/SubtaskA_Trial_Test_Labeled.csv', 'r', encoding='ISO-8859-1') as testing_file:
    next(testing_file)
    for line in csv.reader(testing_file, delimiter=','):
        y_test.append(int(line[2]))
        sentences.append(line[1].strip('"'))
        
for i in sentences:
    temp = []
    
    for j in word_tokenize(i):
        temp.append(j.lower())
    
    x_test.append(temp)

In [13]:
ft = FastText(x_test, size=100, window=10, min_count=1, iter=10)

INFO : collecting all words and their counts
INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO : collected 2279 word types from a corpus of 12090 raw words and 592 sentences
INFO : Loading a fresh vocabulary
INFO : min_count=1 retains 2279 unique words (100% of original 2279, drops 0)
INFO : min_count=1 leaves 12090 word corpus (100% of original 12090, drops 0)
INFO : deleting the raw counts dictionary of 2279 items
INFO : sample=0.001 downsamples 48 most-common words
INFO : downsampling leaves estimated 8680 word corpus (71.8% of prior 12090)
INFO : estimated required memory for 2279 words, 21962 buckets and 100 dimensions: 12234916 bytes
INFO : resetting layer weights
INFO : Total number of ngrams is 21962
INFO : training model with 3 workers on 2279 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads


In [14]:
x_test_v = np.array([np.array(list(map(lambda x: ft.wv[x], x))) for x in x_test])
x_test_v = np.array([np.sum(i, axis=0) for i in x_test_v])
y_test = np.array(y_test)

In [16]:
print(svc.score(x_test_v, y_test))

0.5


___Training the LSTM___

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
inputs = np.array([np.array(list(map(lambda x: wv.get_vector(x), x))) for x in data])

In [None]:
inputs = [np.sum(i, axis=0) for i in inputs]
inputs = torch.Tensor(inputs)

In [None]:
labels = torch.Tensor(labels).view(-1, 1)
print(inputs.size())
print(labels.size())

In [None]:
class Model(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size):
        super(Model, self).__init__()
        
        self.hidden_size = hidden_size
        
        self.fc1 = nn.Linear(input_size + hidden_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(input_size + hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, input_, hidden):
        combined = torch.cat([input_, hidden], 0)
        a1 = self.fc1(combined)
        a2 = self.relu(a1)
        a3 = self.fc2(a2)
        output = self.sigmoid(a3)
        
        return output
    
    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)
    
i_size = 100
h_size = 100
o_size = 1

model = Model(i_size, h_size, o_size)
opt = optim.SGD(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

In [None]:
for batch in range(0, len(inputs) + 1, 200):
    hidden = model.init_hidden()
    output = model(inputs[0:batch], hidden)
    loss = criterion(output, labels)

In [None]:
# criterion = nn.BCELoss()
# opt = optim.SGD(model.parameters(), lr=0.001)

# def train(label, inp):
#     hidden = model.init_hidden()
    
#     opt.zero_grad()
    
#     for i in inp:
#         output, hidden = model(i.view(1, -1), hidden)
    
#     loss = criterion(output, label)
#     loss.backward()
#     opt.step()

In [None]:
# import time
# import math

# n_iter = 10000
# print_every = 500
# plot_every = 100

# current_loss = 0
# all_losses = []

# def time_since(since):
#     now = time.time()
#     s = now - since
#     m = math.floor(s / 60)
#     s -= m * 60
#     return '%dm %ds' % (m, s)

# start = time.time()

# x_train, y_train = inputs[0], labels[0]

# for x in range(0, n_iter):
#     output, loss = train(y_train, x_train)
#     current_loss += loss
    
#     if x % print_every == 0:
#         print('%d %d%% (%s) %.4f' % (x, x / n_iter * 100, time_since(start), loss))
        
#     if x % plot_every == 0:
#         all_losses.append(current_loss / plot_every)
#         current_loss = 0

In [None]:
# import matplotlib.pyplot as plt
# import matplotlib.ticker as ticker
# %matplotlib inline

# plt.plot(all_losses)
# plt.show()

In [None]:
x_train = inputs[0]
y_train = labels[0]

In [None]:
hidden = model.init_hidden()

In [None]:
for x in range(50):
    for i in inputs[:10]:
        for j in i:
            output, hidden = model(j.view(1, -1), hidden)

    loss = criterion(output.squeeze(1), y_train)
    print(loss.item())
    loss.backward(retain_graph=True)
    opt.step()

In [None]:
count = 0
for i, y in zip(inputs, labels):
    for x in i:
        output, hidden = model(x.view(1, -1), hidden)
    if y == 1 and output > 0.82:
        count += 1
    elif y == 0 and output <= 0.82:
        count += 1

In [None]:
count