# Sentiment Analysis

In [None]:
!pip install -U nltk scikit-learn pandas matplotlib numpy wordcloud

## Read the csv

In [None]:
import pandas as pd

# Load the IMDB reviews dataset from the local CSV file.
reviews = pd.read_csv('./files/imdb-reviews.csv')
# Encode the sentiment label as a number: 'neg' -> 0, 'pos' -> 1.
reviews['sentiment'] = reviews['sentiment'].map({'neg': 0, 'pos': 1})
# Drop the "text_pt" and "id" columns; the cells shown below only read
# "text_en" and "sentiment".
reviews.drop(columns=["text_pt", "id"], inplace=True)
# Preview the first 10 rows.
reviews.head(10)

# Preprocessing

In [None]:
import re
from nltk.tokenize import WhitespaceTokenizer

### 1. Cleaning:
    Clean the dataset, keeping only the main content of each text.
    Ex: remove the HTML tags.
    Ex: extract the text from PDFs, etc.

In [None]:
def clean(text):
    """Strip HTML markup from ``text`` and return the result.

    HTML comments are deleted first, then every remaining ``<...>`` tag.
    Matches are removed outright (nothing is inserted in their place), and
    since ``.`` does not match a newline here, markup that spans several
    lines is left untouched.
    """
    # Comments are handled before generic tags so that a ">" inside a
    # comment does not end the match early.
    for markup_pattern in ("<!--?.*?-->", "<.*?>"):
        text = re.sub(markup_pattern, "", text)

    return text

### 2. Normalization:
    Remove punctuation and tags, put everything in the same case, etc.

In [None]:
def normalize(text):
    """Lower-case ``text`` and reduce it to words separated by single spaces.

    Each run of digits and/or non-word characters (punctuation, whitespace,
    and so on) is replaced by one space. Underscores count as word
    characters and are kept. Leading and trailing spaces are not stripped.
    """
    lowered = text.lower()

    # Digits and special characters are collapsed, run by run, into a
    # single space separator.
    separated = re.sub("(\\d|\\W)+", " ", lowered)

    return separated.replace('  ', ' ')

### 3. Tokenization:
    Split the text into words, using whitespace as the separator.

In [None]:
def tokenizer(text):
    """Split ``text`` into a list of tokens with NLTK's ``WhitespaceTokenizer``."""
    return WhitespaceTokenizer().tokenize(text)

### 4. Stop Words:
    They are words which carry no meaning of their own; they are only used to complement the context
    and to connect the terms.
    Ex: 'i', 'you', 'in', 'out', 'are', 'the'

In [None]:
from nltk.corpus import stopwords

# Remove the stop words: words which carry no specific meaning of their own.
def remove_stopwords(tokens):
    """Return the tokens that are not English stop words.

    Args:
        tokens: Iterable of word tokens. The comparison is case-sensitive
            against NLTK's English stop-word list.

    Returns:
        A list with the non-stop-word tokens, in their original order.

    Note:
        Requires the NLTK ``stopwords`` corpus to be available locally
        (e.g. via ``nltk.download('stopwords')``).
    """
    # Load the stop-word list once, as a set: calling stopwords.words()
    # inside the comprehension re-reads the corpus for every token and
    # tests membership against a list.
    english_stopwords = set(stopwords.words("english"))

    return [w for w in tokens if w not in english_stopwords]

### 5. Stemming:
    Takes the variations of a word and removes their endings so that they are merged into the same stem.
    Ex: 'change', 'changing', 'changes' => 'chang'


### 6. Lemmatization:
    Takes the variations of the same word and converts them to a single base form (Noun).
    Ex: 'is', 'were', 'was' => 'be'
    Ex: 'ones' => 'one'

    Part of Speech (PoS) (Verb):
    Ex: 'bored' => 'bore'
    Ex: 'starting' => 'start'

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Stemming strips word endings such as plurals and verb conjugations.
def stem(words):
    """Return the Porter stem of every word in ``words``.

    Args:
        words: Iterable of word tokens.

    Returns:
        A list with one stem per input word, in the same order.
    """
    # Build the stemmer once instead of once per word.
    stemmer = PorterStemmer()

    return [stemmer.stem(w) for w in words]

# Lemmatization maps the inflected forms of a word to its dictionary form.
def lem(words, pos="n"):
    """Return the WordNet lemma of every word in ``words``.

    Args:
        words: Iterable of word tokens.
        pos: WordNet part-of-speech passed to the lemmatizer. Defaults to
            ``"n"`` (noun), which is also the lemmatizer's own default;
            pass ``"v"`` to lemmatize the tokens as verbs.

    Returns:
        A list with one lemma per input word, in the same order.
    """
    # Build the lemmatizer once instead of once per word.
    lemmatizer = WordNetLemmatizer()

    # The method is spelled `lemmatize`; the previous `lemmantize` raised
    # AttributeError on the first word.
    return [lemmatizer.lemmatize(w, pos=pos) for w in words]

### 7. Tag filtering
    Filter the words according to their syntactic category (part of speech), such as nouns, verbs, adverbs, etc.

In [None]:
import nltk

# RB | RB | JJ | NN | NNP | JJ | JJS | IN | VB | VBZ | VBD | VBG

# IN = preposition/subordinating conjunction

# RB = adverb very, silently
# RBR = adverb, comparative better
# RBS = adverb, superlative best
# RP = particle give up

# IN = preposition/subordinating conjunction
# JJ = adjective ‘big’
# JJR = adjective, comparative ‘bigger’
# JJS = adjective, superlative ‘biggest’

# VB verb, base form take
# VBD verb, past tense took
# VBG verb, gerund/present participle taking
# VBN verb, past participle taken
# VBP verb, sing. present, non-3d take
# VBZ verb, 3rd person sing. present takes

# Keep only the tokens whose part-of-speech tag is in the allowed set.
def filter_tokens(tokens):
    """Return the tokens tagged as adverbs, particles, adjectives or base verbs.

    Args:
        tokens: Iterable of word tokens; it is materialised once, so any
            iterable (not only an indexable sequence) is accepted.

    Returns:
        A list with the tokens whose ``nltk.pos_tag`` tag is one of
        RB, RBR, RBS, RP, JJ, JJR, JJS or VB, in their original order.
    """
    # NOTE(review): only the base verb form "VB" is kept; VBD/VBG/VBN/VBP/VBZ
    # are filtered out - confirm this is intended.
    allowed_tags = frozenset(("RB", "RBR", "RBS", "RP", "JJ", "JJR", "JJS", "VB"))

    # pos_tag returns (token, tag) pairs, so they can be filtered directly
    # without indexing back into the input.
    return [token for token, tag in nltk.pos_tag(list(tokens)) if tag in allowed_tags]

# Bag of Words

In [None]:
# Build the token collection for the bag of words from the first 1000
# English reviews, joined into one long string.
all_words = " ".join(list(reviews.text_en[:1000]))

# Strip HTML, normalize case/punctuation, then split on whitespace.
tokens = tokenizer(normalize(clean(all_words)))
print("> Tokenized!")

tokens = remove_stopwords(tokens)
print("> Removed the stop words!")

# Stemming is the active choice; lemmatization is kept as a disabled alternative.
tokens = stem(tokens)
# tokens = lem(tokens)
print("> Merged the term by stem or lem!")

# NOTE(review): the part-of-speech tagger runs on stemmed tokens with the
# stop words already removed - confirm the tags are still reliable here.
tokens = filter_tokens(tokens)
print("> Filtred by tags witch get more meaning!")

# Show a sample of the resulting tokens.
print("\nColection [:100]:\n")
print(tokens[:100])

### Get the frequency of the words

In [None]:
import nltk

# Count how many times each token occurs.
frequency = nltk.FreqDist(tokens)

# Create the bag of words dataframe: one row per distinct word.
bag_of_words = pd.DataFrame({"words": list(frequency.keys()), "frequency": list(frequency.values())})

# Order by frequency, most frequent first, and renumber the rows.
bag_of_words.sort_values(by="frequency", ascending=False, inplace=True)
bag_of_words.reset_index(drop=True, inplace=True)

# Save the bag of words; the row index is written as the first CSV column.
bag_of_words.to_csv('./files/bag-of-words.csv', index=True)

# Report the vocabulary size once (the message previously read "Back of
# words" and the same number was printed a second time right after).
print(f"Bag of words size: {bag_of_words.shape[0]}")

# Preview the most frequent words.
bag_of_words.head(5)

### Plot the frequency in Word Cloud

In [None]:
def to_single_str(words, frequency):
    """Expand word counts into one string that repeats each word.

    Each word is emitted ``frequency[i]`` times, every occurrence followed
    by a space, and the per-word chunks are then joined with another space.
    ``frequency`` drives the iteration, so it must not be longer than
    ``words``.
    """
    vocabulary = list(words)
    counts = list(frequency)

    return " ".join(
        (vocabulary[position] + " ") * count
        for position, count in enumerate(counts)
    )

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build the word cloud from a string in which every word is repeated
# according to its frequency. collocations=False is passed to WordCloud.
word_cloud = WordCloud(width=800, height=500, max_font_size=110, collocations=False).generate(to_single_str(bag_of_words["words"], bag_of_words["frequency"]))

# Render the generated image in a 13x13 inch figure.
plt.figure(figsize=(13, 13))
plt.imshow(word_cloud)
plt.show()

# Feature Selection

## TF-IDF

### Configs

In [None]:
# Load the dictionary (vocabulary) saved by the bag-of-words step.
# keep_default_na=False stops pandas from parsing cells such as "null",
# "nan" or "NA" as missing values, which would silently turn legitimate
# vocabulary words into NaN.
bag_of_words = pd.read_csv('./files/bag-of-words.csv', keep_default_na=False)
bag_of_words_array = bag_of_words.words.values

# Get the inputs: reload the reviews and encode the sentiment label
# ('neg' -> 0, 'pos' -> 1), as done when the CSV was first read.
reviews = pd.read_csv('./files/imdb-reviews.csv')
reviews['sentiment'] = reviews['sentiment'].map({'neg': 0, 'pos': 1})
reviews.drop(columns=["text_pt", "id"], inplace=True)

# Raw English review texts, one document per row.
inputs = reviews.text_en.values

### Implementation

In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer


def tf_idf(txt, vocabulary=None):
    """Compute the TF-IDF matrix of a collection of documents.

    Args:
        txt: Iterable of documents (strings).
        vocabulary: Optional fixed vocabulary passed to ``TfidfVectorizer``.
            When ``None`` the vocabulary is learned from ``txt``.

    Returns:
        A dense ``pandas.DataFrame`` with one row per document and one
        column per vocabulary term. The vectorizer is configured with
        ``smooth_idf=False``, ``sublinear_tf=False`` and ``norm=None``.
    """
    documents = list(txt)

    vectorizer = TfidfVectorizer(smooth_idf=False, sublinear_tf=False, norm=None, analyzer='word', vocabulary=vocabulary)
    # fit_transform is equivalent to fit(...).transform(...) on the same data.
    weights = vectorizer.fit_transform(documents)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back on older versions.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    return pd.DataFrame(weights.toarray(), columns=feature_names)

In [None]:
# Compute the TF-IDF features of every review, restricted to the
# bag-of-words vocabulary loaded above.
tfidf = tf_idf(inputs, bag_of_words_array)
# Preview the first 10 rows.
tfidf.head(10)

# Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: one row of TF-IDF weights per review.
x = tfidf.values
# Target vector: the 0/1 sentiment labels as a 1-D array. Wrapping every
# label in its own list produced a column vector, whereas the classifiers
# trained below expect a 1-D target.
y = reviews.sentiment.values

# Hold out 25% of the reviews for testing, keeping the class proportions
# (stratify) and a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0, stratify=y)

### Dummy Classifier

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline: a DummyClassifier with its default settings, fitted on the
# training split.
model = DummyClassifier()
model.fit(x_train, y_train)

# Score on the test split, expressed as a percentage.
accuracy = model.score(x_test, y_test) * 100
print("Taxa de acerto do algoritimo de Base line: %.2f%%" % accuracy)

### Linear SVC

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import numpy as np

# Seed NumPy's global random generator - presumably so that the LinearSVC
# fit below is reproducible (no random_state is passed to the model).
np.random.seed(5)

# Train a linear SVM classifier with default settings on the training split.
model = LinearSVC()
model.fit(x_train, y_train)

# Score on the test split, expressed as a percentage.
accuracy = model.score(x_test, y_test) * 100
print("Linear SVC accuracy: %.2f%%" % accuracy)