In [465]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [466]:
df = pd.read_csv('./reviews_fb.csv')
print(df.shape)

(108, 2)


In [467]:
# pre processing 1 - remove html tags
def clean_html(text):
    bsp = BeautifulSoup(text, "lxml")
    return bsp.get_text()

def normalize_labels(x):
    if (x >= 3):
        return 'compliment'
    return 'complaint'
    


df['text'] = df['text'].apply(clean_html)
df['rating'] = df['rating'].apply(normalize_labels)

In [468]:
#pre processing 2 - tokenize, stemm
def normalize_text(text):
    stemmer = RSLPStemmer()
    words = word_tokenize(text, 'portuguese')
    return ' '.join([stemmer.stem(w) for w in words if w not in stopwords.words('portuguese')])

df['text'] = df['text'].apply(normalize_text)

In [469]:
X_train, X_test, Y_train, Y_test = train_test_split(
    df['text'], df['rating'], test_size=0.12)

count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(X_train)
tf_transformer = TfidfTransformer()
X_train_tfidf = tf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(95, 906)

In [470]:
#clf = MultinomialNB().fit(X_train_tfidf, Y_train)
clf = SGDClassifier(loss='hinge', alpha=1e-3, n_iter=5, random_state=42).fit(X_train_tfidf, Y_train)

In [471]:
X_test_counts = count_vectorizer.transform(X_test)
X_test_tfidf = tf_transformer.transform(X_test_counts)

predicted = clf.predict(X_test_tfidf)

predicted

array(['complaint', 'complaint', 'complaint', 'compliment', 'complaint',
       'compliment', 'complaint', 'complaint', 'complaint', 'complaint',
       'complaint', 'compliment', 'complaint'], 
      dtype='<U10')

In [472]:
np.mean(predicted == Y_test)

0.84615384615384615