# SVM - TFIDF

In [None]:
import os

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the labelled corpus. `text` and `annotation` feed the model;
# `sentiment` is only copied through to the results file.
data = pd.read_csv("./input/validation_sentiment.csv")

# NOTE(review): the vectoriser is fitted on every row before the split, so
# IDF statistics also see the hold-out documents — confirm this is intended.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['text'])
y = data['annotation']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm_classifier = SVC(probability=True)
svm_classifier.fit(X_train, y_train)

# Threshold the second probability column at 0.5 to obtain integer labels.
# NOTE(review): presumably `annotation` is encoded as 0/1 — confirm.
y_pred_proba = svm_classifier.predict_proba(X_test)
y_pred = (y_pred_proba[:, 1] > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia do modelo: {round(accuracy*100, 2)}%")

# `y_test` keeps the row labels of `data`, so this picks exactly the shuffled
# hold-out rows, in the same order as `y_pred`. Slicing the first 20% of the
# frame (as before) paired predictions with unrelated rows and could be one
# row too long.
test_rows = data.loc[y_test.index]

# Reuse an existing results file so columns written by the other model cells
# survive a re-run of this cell.
results = None
if os.path.exists("./output/resultados.csv"):
    results = pd.read_csv("./output/resultados.csv")

# A missing file, or one whose row count differs from the current hold-out
# set, cannot be aligned with the new predictions: start from an empty frame.
if results is None or len(results) != len(test_rows):
    results = pd.DataFrame(columns=["text", "SVM_TFIDF"])

# `.values` drops the shuffled index so rows are written positionally.
results["text"] = test_rows["text"].values
results["annotation"] = test_rows["annotation"].values
results["sentiment"] = test_rows["sentiment"].values
results["SVM_TFIDF"] = y_pred

results.to_csv("./output/resultados.csv", index=False)

# SVM - word2vec

In [None]:
import pandas as pd
import pickle
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors

# Read the labelled corpus and split each document on whitespace.
data = pd.read_csv("./input/validation_sentiment.csv")
sentences = [document.split() for document in data['text']]

# Load the pre-trained word2vec vectors from their binary file.
model = KeyedVectors.load_word2vec_format(
    '../models/GoogleNews-vectors-negative300.bin',
    binary=True,
)


def get_document_vector(tokens, model, vector_size=300):
    """Return the mean embedding of the in-vocabulary tokens of a document.

    Args:
        tokens: Iterable of token strings making up one document.
        model: Keyed-vector lookup exposing ``key_to_index`` for membership
            tests and ``model[token]`` for vector retrieval.
        vector_size: Dimensionality of the embeddings. Defaults to 300, the
            size that used to be hard-coded, so existing two-argument calls
            behave as before.

    Returns:
        A float64 array of shape ``(vector_size,)`` holding the average of
        the vectors of all tokens found in ``model``; all zeros when no token
        is found (including an empty ``tokens``).
    """
    vector = np.zeros(vector_size)
    count = 0
    for token in tokens:
        if token in model.key_to_index:
            vector += model[token]
            count += 1
    # Skip the division when nothing matched to avoid 0/0.
    if count:
        vector /= count
    return vector


# One averaged embedding per document; labels come from `annotation`.
X = [get_document_vector(doc_tokens, model) for doc_tokens in sentences]
y = data['annotation']

# Hold out 20% of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# `fit` returns the estimator itself, so construction and training chain.
svm_classifier = SVC(probability=True).fit(X_train, y_train)

# Persist the trained classifier next to the notebook.
with open('SVM.pkl', 'wb') as pickle_out:
    pickle.dump(svm_classifier, pickle_out)

# Threshold the second probability column at 0.5 to obtain integer labels.
# NOTE(review): presumably `annotation` is encoded as 0/1 — confirm.
y_pred_proba = svm_classifier.predict_proba(X_test)
positive_class_proba = y_pred_proba[:, 1]
y_pred = (positive_class_proba > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia do modelo: {round(accuracy*100, 2)}%")

# Add this model's predictions as a new column of the shared results file,
# which must already exist at this point.
results = pd.read_csv("./output/resultados.csv").assign(SVM_W2V=y_pred)
results.to_csv("./output/resultados.csv", index=False)

# KNN - TFIDF

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the labelled corpus.
data = pd.read_csv("./input/validation_sentiment.csv")

# TF-IDF features over the raw text; labels come from `annotation`.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['text'])
y = data['annotation']

# Hold out 20% of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# `fit` returns the estimator itself, so construction and training chain.
knn_classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Threshold the second probability column at 0.5 to obtain integer labels.
# NOTE(review): presumably `annotation` is encoded as 0/1 — confirm.
y_pred_proba = knn_classifier.predict_proba(X_test)
positive_class_proba = y_pred_proba[:, 1]
y_pred = (positive_class_proba > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia do modelo: {round(accuracy*100, 2)}%")

# Add this model's predictions as a new column of the shared results file,
# which must already exist at this point.
results = pd.read_csv("./output/resultados.csv").assign(KNN_TFIDF=y_pred)
results.to_csv("./output/resultados.csv", index=False)

# KNN - word2vec

In [None]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Read the labelled corpus and split each document on whitespace.
data = pd.read_csv("./input/validation_sentiment.csv")
sentences = [document.split() for document in data['text']]

# Train 100-dimensional word2vec embeddings on the corpus itself.
# NOTE(review): no seed is passed, so embeddings may differ between runs —
# confirm whether reproducibility matters here.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
)


def get_document_vector(tokens, model, vector_size):
    """Average the embeddings of the tokens that ``model.wv`` knows.

    Args:
        tokens: Iterable of token strings making up one document.
        model: Object whose ``wv`` attribute supports ``token in wv`` and
            ``wv[token]``.
        vector_size: Length of the zero vector used as the starting sum.

    Returns:
        The element-wise mean of the matched vectors, or the all-zero vector
        of length ``vector_size`` when no token matches.
    """
    matched = [model.wv[token] for token in tokens if token in model.wv]
    # Left-to-right sum seeded with zeros, the same order as a manual loop.
    total = sum(matched, np.zeros(vector_size))
    if not matched:
        return total
    return total / len(matched)


# One averaged 100-dimensional embedding per document.
X = [get_document_vector(doc_tokens, model, 100) for doc_tokens in sentences]
y = data['annotation']

# Hold out 20% of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# `fit` returns the estimator itself, so construction and training chain.
knn_classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Threshold the second probability column at 0.5 to obtain integer labels.
# NOTE(review): presumably `annotation` is encoded as 0/1 — confirm.
y_pred_proba = knn_classifier.predict_proba(X_test)
positive_class_proba = y_pred_proba[:, 1]
y_pred = (positive_class_proba > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia do modelo: {round(accuracy*100, 2)}%")

# Add this model's predictions as a new column of the shared results file,
# which must already exist at this point.
results = pd.read_csv("./output/resultados.csv").assign(KNN_W2V=y_pred)
results.to_csv("./output/resultados.csv", index=False)

# Naive Bayes - TFIDF

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Read the labelled corpus.
data = pd.read_csv("./input/validation_sentiment.csv")

# TF-IDF features over the raw text; labels come from `annotation`.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['text'])
y = data['annotation']

# Hold out 20% of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# `fit` returns the estimator itself, so construction and training chain.
naive_bayes_classifier = MultinomialNB().fit(X_train, y_train)

# Threshold the second probability column at 0.5 to obtain integer labels.
# NOTE(review): presumably `annotation` is encoded as 0/1 — confirm.
y_pred_proba = naive_bayes_classifier.predict_proba(X_test)
positive_class_proba = y_pred_proba[:, 1]
y_pred = (positive_class_proba > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia do modelo: {round(accuracy*100, 2)}%")

# Add this model's predictions as a new column of the shared results file,
# which must already exist at this point.
results = pd.read_csv("./output/resultados.csv").assign(NV_TFIDF=y_pred)
results.to_csv("./output/resultados.csv", index=False)

# Naive Bayes - word2vec

In [None]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read the labelled corpus and split each document on whitespace.
data = pd.read_csv("./input/validation_sentiment.csv")
sentences = [document.split() for document in data['text']]

# Train 100-dimensional word2vec embeddings on the corpus itself.
# NOTE(review): no seed is passed, so embeddings may differ between runs —
# confirm whether reproducibility matters here.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
)


def average_word_vectors(words, model, num_features):
    """Return the mean embedding of the words present in ``model.wv``.

    Args:
        words: Iterable of word strings making up one document.
        model: Object whose ``wv`` attribute supports ``word in wv`` and
            ``wv[word]``.
        num_features: Length of the float64 zero vector used as the
            starting sum.

    Returns:
        The element-wise mean of the matched vectors, or the all-zero vector
        when no word matches.
    """
    running_sum = np.zeros((num_features,), dtype="float64")
    matched = 0

    for word in words:
        # Out-of-vocabulary words contribute nothing to the average.
        if word not in model.wv:
            continue
        running_sum = running_sum + model.wv[word]
        matched += 1

    if matched == 0:
        return running_sum
    return running_sum / matched


# One averaged 100-dimensional embedding per document.
X = [average_word_vectors(doc_words, model, 100) for doc_words in sentences]
y = data['annotation']

# Hold out 20% of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# NOTE(review): this cell is headed "Naive Bayes" and writes an "NV_W2V"
# column, yet the estimator trained here is LogisticRegression — confirm
# which model is intended.
# `fit` returns the estimator itself, so construction and training chain.
logistic_regression_model = LogisticRegression().fit(X_train, y_train)

# Threshold the second probability column at 0.5 to obtain integer labels.
# NOTE(review): presumably `annotation` is encoded as 0/1 — confirm.
y_pred_proba = logistic_regression_model.predict_proba(X_test)
positive_class_proba = y_pred_proba[:, 1]
y_pred = (positive_class_proba > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia do modelo: {round(accuracy*100, 2)}%")

# Add this model's predictions as a new column of the shared results file,
# which must already exist at this point.
results = pd.read_csv("./output/resultados.csv").assign(NV_W2V=y_pred)
results.to_csv("./output/resultados.csv", index=False)

# RandomForest - TFIDF

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the labelled corpus.
data = pd.read_csv("./input/validation_sentiment.csv")

# TF-IDF features over the raw text; labels come from `annotation`.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(data['text'])
y = data['annotation']

# Hold out 20% of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# `fit` returns the estimator itself, so construction and training chain.
random_forest_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Threshold the second probability column at 0.5 to obtain integer labels.
# NOTE(review): presumably `annotation` is encoded as 0/1 — confirm.
y_pred_proba = random_forest_classifier.predict_proba(X_test)
positive_class_proba = y_pred_proba[:, 1]
y_pred = (positive_class_proba > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia do modelo: {round(accuracy*100, 2)}%")

# Add this model's predictions as a new column of the shared results file,
# which must already exist at this point.
results = pd.read_csv("./output/resultados.csv").assign(RF_TFIDF=y_pred)
results.to_csv("./output/resultados.csv", index=False)

# RandomForest - word2vec

In [None]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the labelled corpus and split each document on whitespace.
data = pd.read_csv("./input/validation_sentiment.csv")
sentences = [document.split() for document in data['text']]

# Train 100-dimensional word2vec embeddings on the corpus itself.
# NOTE(review): no seed is passed, so embeddings may differ between runs —
# confirm whether reproducibility matters here.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
)


def average_word_vectors(words, model, num_features):
    """Compute the mean embedding over the words found in ``model.wv``.

    Args:
        words: Iterable of word strings making up one document.
        model: Object whose ``wv`` attribute supports ``word in wv`` and
            ``wv[word]``.
        num_features: Length of the float64 zero vector used as the
            starting sum.

    Returns:
        The element-wise mean of the matched vectors; the untouched zero
        vector when none of the words is in the vocabulary.
    """
    in_vocab = [word for word in words if word in model.wv]

    feature_vector = np.zeros((num_features,), dtype="float64")
    for word in in_vocab:
        feature_vector = feature_vector + model.wv[word]

    # Only divide when at least one word contributed to the sum.
    return feature_vector / len(in_vocab) if in_vocab else feature_vector


# One averaged 100-dimensional embedding per document.
X = [average_word_vectors(doc_words, model, 100) for doc_words in sentences]
y = data['annotation']

# Hold out 20% of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# `fit` returns the estimator itself, so construction and training chain.
random_forest_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Threshold the second probability column at 0.5 to obtain integer labels.
# NOTE(review): presumably `annotation` is encoded as 0/1 — confirm.
y_pred_proba = random_forest_classifier.predict_proba(X_test)
positive_class_proba = y_pred_proba[:, 1]
y_pred = (positive_class_proba > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia do modelo: {round(accuracy*100, 2)}%")

# Add this model's predictions as a new column of the shared results file,
# which must already exist at this point.
results = pd.read_csv("./output/resultados.csv").assign(RF_W2V=y_pred)
results.to_csv("./output/resultados.csv", index=False)