In [1]:
import torch
# import torchtext
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(42)

import pandas as pd
import numpy as np
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing
from sklearn import utils
import re 

# from data_cleaning import preprocess

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Avyakta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def _load_labelled_tweets(path):
    """Read a tab-separated tweet file into a frame with `sentiment` and `tweet` columns.

    The column names are supplied here rather than taken from the file.
    """
    return pd.read_csv(path, sep='\t', names=["sentiment", "tweet"])


# Training corpus.
data = _load_labelled_tweets('data_twitter_sentiment/semeval_train.txt')

# Test corpora, one file per year.
dt1 = _load_labelled_tweets('data_twitter_sentiment/Twitter2013_raw.txt')
dt2 = _load_labelled_tweets('data_twitter_sentiment/Twitter2014_raw.txt')
dt3 = _load_labelled_tweets('data_twitter_sentiment/Twitter2015_raw.txt')
dt4 = _load_labelled_tweets('data_twitter_sentiment/Twitter2016_raw.txt')

# Only the 2014 file is used as the test set; dt1, dt3 and dt4 are loaded but
# not used by the cells below. Alternative kept for reference:
#   dt = pd.concat([dt1, dt2, dt3, dt4])
dt = dt2

In [3]:
def preprocess(features):
    """Clean raw tweet texts for bag-of-words style vectorization.

    Each item is converted with ``str()``, padded with one space on both sides
    (so whitespace-delimited rules also apply at the edges), and then passed
    through these steps in order:

    1. drop URLs (any token part starting at ``http``), ``@mentions`` and
       ``#hashtags``;
    2. drop the fillers ``h``, ``hm``, ``hmm``, ... when they form a whole word;
    3. replace digits, underscores and every other non-word character by a
       space;
    4. drop all stand-alone single ASCII letters;
    5. collapse whitespace runs to one space and lowercase the text;
    6. drop words starting with ``aa`` (three or more characters) and the
       stand-alone tokens ``ba`` and ``th``.

    Steps 1 and 2 run before lowercasing and are therefore case-sensitive.

    Parameters
    ----------
    features : iterable
        Raw texts. Any iterable is accepted (list, NumPy array, pandas Series,
        generator); non-string items are stringified.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order. Each string keeps
        the single leading and trailing padding space.
    """
    processed_features = []
    # Iterate over the items directly instead of by position, so inputs
    # without len() or with a non-positional index also work.
    for raw_text in features:
        text = ' ' + str(raw_text) + ' '

        # Remove hyperlinks, mentions and hashtags. The literal "http" is
        # required: the earlier pattern `http*\S+` only needed "htt" and so
        # truncated ordinary words such as "nighttime".
        text = re.sub(r'http\S*', ' ', text)
        text = re.sub(r'@\S+', ' ', text)
        text = re.sub(r'#\S+', ' ', text)
        # Filler words "h", "hm", "hmm", ... together with the whitespace
        # that follows them.
        text = re.sub(r'\bhm*\s+', '', text)

        # Digits and underscores become spaces, then every remaining
        # non-word character does too.
        text = re.sub(r'[0-9_]', ' ', text)
        text = re.sub(r'\W', ' ', text)

        # Remove stand-alone single letters. The lookahead leaves the
        # trailing whitespace unconsumed so that it can introduce the next
        # match; without it every second letter in a run like "x y z"
        # survived. Thanks to the padding this also covers a single letter at
        # the very start of the text.
        text = re.sub(r'\s+[a-zA-Z](?=\s)', ' ', text)

        # Collapse whitespace runs, then lowercase.
        text = re.sub(r'\s+', ' ', text)
        text = text.lower()

        # Corpus-specific noise tokens.
        text = re.sub(r'(\s)aa\w+', ' ', text)
        text = re.sub(r'(\s)ba(\s)', ' ', text)
        text = re.sub(r'(\s)th(\s)', ' ', text)

        processed_features.append(text)
    return processed_features

In [4]:
# Clean the tweet text (second column) of the training and test frames.
train_features = preprocess(data.iloc[:, 1].values)
test_features = preprocess(dt.iloc[:, 1].values)

# TF-IDF over 1- to 4-grams. The vectorizer is fitted on the training tweets
# only and then applied unchanged to both splits.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
    norm='l2',
    ngram_range=(1, 4),
)
vectorizer.fit(train_features)
cv_array = vectorizer.transform(train_features)
cvt_array = vectorizer.transform(test_features)

print(cv_array.shape, sep='\n')

# The label encoder is fitted on the training sentiments and reused for the
# test labels.
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(data['sentiment'])

# No validation split is carved out of the training data: the whole training
# file is used for fitting and the separate test file for evaluation.
x_train, label_train = cv_array, data['sentiment']
y_train = lab_enc.transform(label_train)

x_test, label_test = cvt_array, dt['sentiment']
y_test = lab_enc.transform(label_test)

(8588, 17797)


In [5]:
# Import the submodule explicitly: a bare `import scipy` does not by itself
# guarantee that `scipy.sparse` is loaded.
import scipy.sparse


def _dense_float_tensor(features):
    """Return `features` as a dense float32 torch tensor.

    SciPy sparse matrices are expanded with `toarray()` first. Anything else
    (NumPy array, existing tensor) goes straight to `torch.as_tensor`, so
    running this cell a second time is harmless instead of raising.
    """
    if scipy.sparse.issparse(features):
        features = features.toarray()
    return torch.as_tensor(features, dtype=torch.float32)


# NOTE: this materializes the full dense feature matrices in memory.
x_train = _dense_float_tensor(x_train)
x_test = _dense_float_tensor(x_test)

# Encoded labels as 64-bit integer tensors. `as_tensor` also accepts an
# existing tensor, which keeps this cell re-runnable.
y_train = torch.as_tensor(y_train, dtype=torch.long)
y_test = torch.as_tensor(y_test, dtype=torch.long)

In [20]:
# Feed-forward classifier over the input features, ending in log-probabilities
# for 3 output classes.
# NOTE(review): the last two Linear layers have no activation between them, so
# together they act as a single linear map. Confirm whether a ReLU was
# intended there.
model = nn.Sequential(
    nn.Linear(x_train.shape[1], 128),
    nn.ReLU(),
    nn.Linear(128, 128),
    nn.Linear(128, 3),
    nn.LogSoftmax(dim=1),
)

# Negative log-likelihood on the LogSoftmax output.
criterion = nn.NLLLoss()

# Optimizers need parameters to optimize and a learning rate.
# (The earlier stand-alone forward/backward pass that ran before this point
# was removed: its gradients were cleared by the first zero_grad() call, so it
# only cost a full pass over the training set.)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1, weight_decay=0.009)


def _accuracy(log_probs, targets):
    """Return the fraction of rows whose highest-scoring class equals the target."""
    return (log_probs.argmax(dim=1) == targets).float().mean()


epochs = 201
eval_every = 50  # report train/test metrics every `eval_every` epochs

for e in range(epochs):
    # One full-batch gradient step.
    optimizer.zero_grad()
    output = model(x_train)  # call the module itself, not .forward()
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()

    if e % eval_every == 0:
        model.eval()
        with torch.no_grad():
            log_ps = model(x_test)
            test_loss = criterion(log_ps, y_test)
            test_accuracy = _accuracy(log_ps, y_test)
            # Train metrics come from this epoch's forward pass, i.e. from
            # the weights before the optimizer step above.
            train_accuracy = _accuracy(output, y_train)
        # Switch back so later epochs do not keep training in eval mode.
        model.train()

        print("Train loss is",loss.detach(), " and train accuracy is", train_accuracy)
        print("Test  loss is",test_loss, " and test  accuracy is", test_accuracy)
        print('\n')

Train loss is tensor(1.1027)  and train accuracy is tensor(0.1469)
Test  loss is tensor(1.0028)  and test  accuracy is tensor(0.3612)


Train loss is tensor(0.7531)  and train accuracy is tensor(0.6701)
Test  loss is tensor(0.8289)  and test  accuracy is tensor(0.5956)


Train loss is tensor(0.6534)  and train accuracy is tensor(0.7111)
Test  loss is tensor(0.8593)  and test  accuracy is tensor(0.6042)


Train loss is tensor(0.5398)  and train accuracy is tensor(0.8331)
Test  loss is tensor(0.8794)  and test  accuracy is tensor(0.6204)


Train loss is tensor(0.5116)  and train accuracy is tensor(0.8347)
Test  loss is tensor(0.8976)  and test  accuracy is tensor(0.6188)




In [7]:
# Log-probabilities the model assigns to every training row (one column per
# class). This is inspection only, so gradient tracking is switched off to
# avoid building an autograd graph over the whole training set.
with torch.no_grad():
    train_log_probs = model(x_train)
train_log_probs

tensor([[-1.5652, -1.3220, -0.6456],
        [-1.5592, -0.7301, -1.1782],
        [-1.6930, -1.8325, -0.4216],
        ...,
        [-2.0599, -0.2250, -2.6035],
        [-1.9571, -2.5760, -0.2451],
        [-1.6469, -1.6752, -0.4779]], grad_fn=<LogSoftmaxBackward>)