In [1]:
# downloading data
import nltk
nltk.download('subjectivity')
nltk.download('movie_reviews')
nltk.download('vader_lexicon')

[nltk_data] Downloading package subjectivity to
[nltk_data]     C:\Users\adnan\AppData\Roaming\nltk_data...
[nltk_data]   Package subjectivity is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\adnan\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\adnan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# all imports
from nltk.corpus import subjectivity
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [3]:
# get all sents and associate to them their subjectivity
all_sents = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')] + \
            [(sent, 'obj') for sent in subjectivity.sents(categories='obj')]

In [4]:
# convert sentences into string
sentences = [' '.join(sent) for (sent, label) in all_sents]

# defining labels
labels = [1 if label == 'subj' else 0 for (sent, label) in all_sents]

In [5]:
vectorizer = TfidfVectorizer()

In [6]:
train_features = vectorizer.fit_transform(sentences)

In [7]:

# defining the MLP model
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

In [8]:
def train_model(model, criterion, optimizer, train_features, train_labels, epochs=5):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(train_features)
        loss = criterion(outputs, train_labels)
        loss.backward()
        optimizer.step()
        
        if(epoch + 1) % 10 == 0:
             print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

In [9]:
num_folds = 10
scores = []
f1_scores = []
# init model
input_size = train_features.shape[1]
hidden_size = 180
output_size = 2 # 0 or 1
lr = 0.0004
epochs = 100

#device = torch.device("cuda")

skf = StratifiedKFold(n_splits=num_folds, shuffle=True)

for (train_index, test_index) in skf.split(train_features, labels):
    # represents the folds
    x_train, x_test = train_features[train_index], train_features[test_index]
    y_train, y_test = [labels[i] for i in train_index], [labels[i] for i in test_index]
    
    x_train = torch.tensor(x_train.toarray(), dtype=torch.float32) #.to(device)
    y_train = torch.tensor(y_train) #.to(device)
 
    
    
    model = MLP(x_train.shape[1], hidden_size, output_size) #.to(device)
    
    criterion = nn.CrossEntropyLoss() #.to(device)
    optimizer = optim.Adam(model.parameters(), lr)
    
    # train the model
    train_model(model, criterion, optimizer, x_train, y_train)
    
    # evaluate
    model.eval()
    with torch.no_grad():
        x_test = torch.tensor(x_test.toarray(), dtype=torch.float32) #.to(device)
        y_test = torch.tensor(y_test) #.to(device)
        outputs = model(x_test)
        _, predicted = torch.max(outputs.data, 1)
       # predicted = predicted.cpu()
        #y_test = y_test.cpu()
        f1 = f1_score(y_test, predicted, average='macro')
        scores.append(f1)

# Print average F1 score
average_f1 = sum(scores) / num_folds
print('Average F1 Score:', round(average_f1, 3))

Average F1 Score: 0.438


In [10]:
# SECOND PART BEGINS HERE (WITHOUT REMOVING OBJ SENTS)

In [11]:
import numpy

def lol2str(doc):
    # flatten & join
    return " ".join([w for sent in doc for w in sent])

In [12]:
rev_neg = movie_reviews.paras(categories="neg")
rev_pos = movie_reviews.paras(categories="pos")

In [13]:
from nltk.sentiment.util import mark_negation
new_neg = []
for rev in rev_neg:
    new_rev = []
    for sentence in rev:
        new_rev.append(mark_negation(sentence)) # Apply or not the negation
    new_neg.append(new_rev)
    
new_pos = []
for rev in rev_pos:
    new_rev = []
    for sentence in rev:
        new_rev.append(mark_negation(sentence)) # Apply or not the negation
    new_pos.append(new_rev)   

In [14]:
rev_corpus = [lol2str(d) for d in new_neg] + [lol2str(d) for d in new_pos]
rev_labels = ([0] * len(rev_neg) + [1] * len(rev_pos))
rev_vectors = vectorizer.fit_transform(rev_corpus)

In [15]:
num_folds = 10
scores = []
f1_scores = []
# init model

hidden_size = 180
output_size = 2 # 0 or 1
lr = 0.0004
epochs = 5

#device = torch.device("cuda")

skf = StratifiedKFold(n_splits=num_folds, random_state=42, shuffle=True)

for (train_index, test_index) in skf.split(rev_vectors, rev_labels):
    # represents the folds
    x_train, x_test = rev_vectors[train_index], rev_vectors[test_index]
    y_train, y_test = [rev_labels[i] for i in train_index], [rev_labels[i] for i in test_index]
    
    x_train = torch.tensor(x_train.toarray(), dtype=torch.float32) #.to(device)
    y_train = torch.tensor(y_train) #.to(device)
 
    model = MLP(x_train.shape[1], hidden_size, output_size) #.to(device)
    
    criterion = nn.CrossEntropyLoss() #.to(device)
    optimizer = optim.Adam(model.parameters(), lr)
    
    # train the model
    train_model(model, criterion, optimizer, x_train, y_train)
    
    # evaluate
    model.eval()
    with torch.no_grad():
        x_test = torch.tensor(x_test.toarray(), dtype=torch.float32) #.to(device)
        y_test = torch.tensor(y_test) #.to(device)
        outputs = model(x_test)
        _, predicted = torch.max(outputs.data, 1)
       # predicted = predicted.cpu()
        #y_test = y_test.cpu()
        f1 = f1_score(y_test, predicted, average='macro')
        scores.append(f1)

# Print average F1 score
average_f1 = sum(scores) / num_folds
print('Average F1 Score:', round(average_f1, 3))

Average F1 Score: 0.54


In [16]:

# FROM HERE WITHOUT OBJ JUDGMENTS

In [17]:
analyzer = SentimentIntensityAnalyzer()

In [18]:
def rm_objective_sentences(document, analyzer):
    new_doc = []
    for sentence in document:
        value = analyzer.polarity_scores(" ".join(sentence))
        if value["compound"] != 0: # Add into new_doc the sentences with a polarity
            new_doc.append(" ".join(sentence))
    return new_doc

rev_neg_wo_objective = [" ".join(rm_objective_sentences(doc, analyzer)) for doc in rev_neg]
rev_pos_wo_objective = [" ".join(rm_objective_sentences(doc, analyzer)) for doc in rev_pos]
corpus_wo_objective = rev_neg_wo_objective + rev_pos_wo_objective
wo_obj_vectors = vectorizer.fit_transform(corpus_wo_objective)

In [19]:
num_folds = 10
scores = []
f1_scores = []
# init model

hidden_size = 180
output_size = 2 # 0 or 1
lr = 0.0004
epochs = 5

#device = torch.device("cuda")

skf = StratifiedKFold(n_splits=num_folds, random_state=42, shuffle=True)

for (train_index, test_index) in skf.split(wo_obj_vectors, rev_labels):
    # represents the folds
    x_train, x_test = rev_vectors[train_index], rev_vectors[test_index]
    y_train, y_test = [rev_labels[i] for i in train_index], [rev_labels[i] for i in test_index]
    
    x_train = torch.tensor(x_train.toarray(), dtype=torch.float32) #.to(device)
    y_train = torch.tensor(y_train) #.to(device)
 
    model = MLP(x_train.shape[1], hidden_size, output_size) #.to(device)
    
    criterion = nn.CrossEntropyLoss() #.to(device)
    optimizer = optim.Adam(model.parameters(), lr)
    
    # train the model
    train_model(model, criterion, optimizer, x_train, y_train)
    
    # evaluate
    model.eval()
    with torch.no_grad():
        x_test = torch.tensor(x_test.toarray(), dtype=torch.float32) #.to(device)
        y_test = torch.tensor(y_test) #.to(device)
        outputs = model(x_test)
        _, predicted = torch.max(outputs.data, 1)
       # predicted = predicted.cpu()
        #y_test = y_test.cpu()
        f1 = f1_score(y_test, predicted, average='macro')
        scores.append(f1)

# Print average F1 score
average_f1 = sum(scores) / num_folds
print('Average F1 Score:', round(average_f1, 3))

Average F1 Score: 0.563
