<a href="https://colab.research.google.com/github/alessiomongoli/Sentiment_Lexicon/blob/main/Data_induction/PreProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
nltk.download('punkt')
from spacy.symbols import neg
from nltk.tokenize import sent_tokenize
from nltk import RegexpTokenizer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import gzip
import json
import time
import spacy
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Read and load the categories
def parseAmazon(path):
    """Lazily yield the records of a gzip-compressed JSON-lines file.

    Each non-blank line of the file is decoded with ``json.loads`` and
    yielded in file order.

    Args:
        path: Location of the ``.gz`` file to read.

    Yields:
        The decoded JSON value of each line (a dict for review records).
    """
    # The context manager closes the gzip handle when the generator is
    # exhausted or garbage-collected, instead of leaking it.
    with gzip.open(path, 'rb') as g:
        for line in g:
            # Skip blank lines (e.g. a trailing newline) rather than letting
            # json.loads fail on an empty document.
            if not line.strip():
                continue
            yield json.loads(line)

def preprocessing(path, negation_type, category):
    """Build a bag-of-words matrix and polarity labels from a review file.

    Reviews are read with ``parseAmazon``. Reviews whose ``"overall"`` rating
    equals 3.0 are dropped; the remaining ones are labelled -1 (rating below
    3.0) or +1 (rating above 3.0). The review text is passed through the
    selected negation handler and the results are vectorised with a default
    ``CountVectorizer``.

    As a side effect, the elapsed wall-clock time is appended to a fixed
    ``time.txt`` file on the mounted drive.

    Args:
        path: Location of the gzip-compressed JSON-lines review file.
        negation_type: ``'normal'`` selects ``normal_negation``,
            ``'all_words'`` selects ``all_words_negation``; any other value
            falls back to ``no_negation``.
        category: Name used only to tag the line written to the timing log.

    Returns:
        A tuple ``(X, y, frequencies, vocabulary)``:
        ``X`` is the document-term count matrix produced by
        ``CountVectorizer.fit_transform``, ``y`` is a NumPy array of the
        -1/+1 labels in the same row order, ``frequencies`` is a 1-D array
        with the column sums of ``X``, and ``vocabulary`` is the fitted
        ``vocabulary_`` mapping of the vectorizer.
    """
    start = time.time()

    # Resolve the negation handler once instead of once per review.
    if negation_type == 'normal':
        transform = normal_negation
    elif negation_type == 'all_words':
        transform = all_words_negation
    else:
        transform = no_negation

    reviews = []
    labels = []
    for review in parseAmazon(path):
        rating = review["overall"]
        # Neutral (3-star) reviews carry no polarity signal and are skipped.
        if rating == 3.0:
            continue
        reviews.append(transform(review["reviewText"]))
        labels.append(-1 if rating < 3.0 else +1)

    y = np.array(labels)
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(reviews)
    vocabulary = vectorizer.vocabulary_
    # X.sum(axis=0) is a 1 x n_terms matrix; flatten it to a 1-D array.
    frequencies = np.asarray(X.sum(axis=0))[0]
    end = time.time()

    # The with-block closes the file; a single write() emits the whole line.
    with open('/content/drive/MyDrive/Github/Colab Notebooks/project/Results/time.txt', 'a') as f:
        f.write('\n'+category+'_'+negation_type+' Preprocessing phase: '+str(end-start)+' seconds')
    return X, y, frequencies, vocabulary

In [None]:
def preprocessing_experiment_price(df, negation_type): # experiment
    """Vectorise the reviews of a DataFrame and derive their polarity labels.

    Every row of ``df`` is used. The ``"reviewText"`` column is passed through
    the negation handler chosen by ``negation_type`` (``'normal'``,
    ``'all_words'``, anything else means no negation marking), and the
    ``"label"`` column is mapped to -1 when below 3.0 and to +1 otherwise.

    Returns:
        ``(X, y, frequencies, vocabulary)``: the ``CountVectorizer`` count
        matrix, the NumPy array of -1/+1 labels, the 1-D array of column sums
        of ``X``, and the fitted ``vocabulary_`` mapping.
    """
    texts = []
    targets = []

    for _, row in df.iterrows():
        raw_text = row["reviewText"]
        if negation_type == 'normal':
            processed = normal_negation(raw_text)
        elif negation_type == 'all_words':
            processed = all_words_negation(raw_text)
        else:
            processed = no_negation(raw_text)
        polarity = -1 if row["label"] < 3.0 else +1
        texts.append(processed)
        targets.append(polarity)

    y = np.array(targets)
    bag_of_words = CountVectorizer()
    X = bag_of_words.fit_transform(texts)
    # Column sums come back as a 1 x n matrix; take its single row.
    frequencies = np.asarray(X.sum(axis=0))[0]
    return X, y, frequencies, bag_of_words.vocabulary_

In [None]:
# Stem of each supported "n't" contraction -> its expanded form.
_CONTRACTION_EXPANSIONS = {
    "isn": "is not",
    "won": "will not",
    "can": "can not",
    "wasn": "was not",
    "weren": "were not",
    "aren": "are not",
    "couldn": "could not",
    "wouldn": "would not",
    "shouldn": "should not",
    "don": "do not",
    "didn": "did not",
    "doesn": "does not",
    "haven": "have not",
    "hasn": "has not",
    "hadn": "had not",
}

# One pattern for every stem, followed by a straight or typographic
# apostrophe and "t". Case-insensitive so "Don't" / "ISN'T" are caught too.
_CONTRACTION_RE = re.compile(
    "(" + "|".join(_CONTRACTION_EXPANSIONS) + ")['\u2019]t",
    re.IGNORECASE,
)


def decontracted(review):
    """Expand common English "n't" contractions into "<verb> not".

    Matching ignores letter case and accepts both the ASCII apostrophe and
    the typographic one (U+2019). The expansion is always emitted in lower
    case (e.g. ``"Don't"`` becomes ``"do not"``). Text that contains none of
    the supported contractions is returned unchanged.

    Args:
        review: The raw review text.

    Returns:
        The text with every supported contraction expanded.
    """
    return _CONTRACTION_RE.sub(
        lambda match: _CONTRACTION_EXPANSIONS[match.group(1).lower()],
        review,
    )

In [None]:
def normal_negation(review):
    """Mark the single word that follows each "not" with a ``NEG_`` prefix.

    The review is de-contracted, split into ``\\w+`` tokens and lower-cased.
    Each "not" that is followed by another token is removed and that next
    token is emitted as ``NEG_<token>``. A "not" in final position is kept
    as an ordinary word.

    Tokens are lower-cased before the comparison, so "Not" and "NOT" also
    trigger negation, and the negated word is lower-cased as well.

    Args:
        review: The raw review text.

    Returns:
        The processed tokens joined by single spaces.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = [token.lower() for token in tokenizer.tokenize(decontracted(review))]
    last_index = len(tokens) - 1

    marked = []
    negate_next = False
    for i, token in enumerate(tokens):
        if negate_next:
            # The token right after "not" is always consumed as the negated
            # word, even if it is itself "not".
            marked.append('NEG_' + token)
            negate_next = False
        elif token == 'not' and i != last_index:
            # Drop the "not" itself; its effect is carried by the prefix.
            negate_next = True
        else:
            marked.append(token)
    return ' '.join(marked)


def no_negation(review):
    """Normalise a review without adding any negation markers.

    The text is de-contracted, split into sentences, and each sentence is
    split into ``\\w+`` tokens that are lower-cased.

    Returns:
        All tokens of the review, in order, joined by single spaces.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    expanded = decontracted(review)
    words = [
        word.lower()
        for sentence in sent_tokenize(expanded)
        for word in word_tokenizer.tokenize(sentence)
    ]
    return ' '.join(words)


def all_words_negation(review):
    """Prefix every word after the first "not" of a sentence with ``NEG_``.

    The text is de-contracted and split into sentences; each sentence is
    split into lower-cased ``\\w+`` tokens. Within a sentence, all tokens that
    come after the first "not" receive the ``NEG_`` prefix (the "not" itself
    is kept). The marking never crosses a sentence boundary.

    Returns:
        All tokens of the review, in order, joined by single spaces.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    expanded = decontracted(review)
    marked = []
    for sentence in sent_tokenize(expanded):
        words = [word.lower() for word in word_tokenizer.tokenize(sentence)]
        if 'not' in words:
            # Everything past the first "not" is in its scope; if "not" is
            # the last token the slice is empty and nothing changes.
            scope_start = words.index('not') + 1
            words[scope_start:] = ['NEG_' + word for word in words[scope_start:]]
        marked.extend(words)
    return ' '.join(marked)