<a href="https://colab.research.google.com/github/arthurcaique/curso_verao_uninabuco/blob/master/sentiment_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from string import punctuation

from pandas import read_csv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import download as nltk_downloader
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (GridSearchCV, StratifiedKFold, 
                                     train_test_split)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (confusion_matrix, precision_recall_fscore_support)

nltk_downloader('punkt')
nltk_downloader('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
eng_stop_words = stopwords.words('english')

In [0]:
def pre_process_text(text: str):
  text = text.lower()
  return ' '.join([token for token in word_tokenize(text) 
                   if token not in punctuation and token not in eng_stop_words]).strip()

In [7]:
train_df = read_csv('train.tsv', sep = '\t', encoding='utf-8')
print(train_df.shape)

train_df.drop_duplicates(subset='SentenceId', keep='first', inplace=True)
print(train_df.shape)

X = train_df['Phrase'].apply(
    lambda phrase: pre_process_text(phrase))

train_df['Sentiment'] = train_df['Sentiment'].apply(
    lambda x: x if x not in [0, 3] else 1 if x == 0 else 4)

y = train_df['Sentiment'].values

(156060, 4)
(8529, 4)


In [0]:
X_train, X_test, y_train, y_test = train_test_split(
                                                    X, y, test_size=0.33, 
                                                    random_state=42)

In [0]:
vectorizer = TfidfVectorizer(max_features=10000)
tf_idf_matrix = vectorizer.fit_transform(X_train)

In [0]:
param_grid = {'n_estimators': [50, 100], 'class_weight': ['balanced', None]}

rf_classifier = RandomForestClassifier(random_state=1)

grid_search_model = GridSearchCV(
    estimator=rf_classifier, param_grid=param_grid, cv=3)

In [17]:
grid_search_model.fit(tf_idf_matrix, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [50, 100], 'class_weight': ['balanced', None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [0]:
vectorized_test = vectorizer.transform(X_test)
predictions = grid_search_model.predict(vectorized_test)

In [19]:
confusion_matrix(y_true = y_test, y_pred = predictions, labels = list(set(y_train)))

array([[690, 111, 282],
       [226,  77, 229],
       [282,  86, 832]])