In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import metrics
import pandas as pd
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer
import warnings
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning

In [3]:
# Load the sentiment dataset from the working directory; the file is decoded
# as Latin-1 rather than pandas' default UTF-8.
data = pd.read_csv(
    'dataset_sentiment.csv',
    encoding='latin1',
)

In [4]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,id,sentiment,date,text,Unnamed: 4,Unnamed: 5
0,623495523,1,Mon Dec 01 20:46:01 +0000 2014,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...,,
1,623495527,1,Mon Dec 01 21:09:50 +0000 2014,@apple Contact sync between Yosemite and iOS8 ...,,
2,623495529,1,Mon Dec 01 21:35:14 +0000 2014,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...,,
3,623495536,1,Mon Dec 01 23:55:55 +0000 2014,"@Apple, For the love of GAWD, CENTER the '1'on...",,
4,623495537,1,Tue Dec 02 00:06:05 +0000 2014,i get the storage almost full notification lit...,,


In [8]:
# Drop the two trailing "Unnamed" columns: the preview above shows them empty
# for the displayed rows, so they carry no information for the model.
#
# The frame is reassigned instead of mutated with `inplace=True`, and
# `errors='ignore'` is passed, so this cell can be re-run safely: a second run
# on the already-trimmed frame is a no-op instead of raising KeyError.
data = data.drop(columns=['Unnamed: 4', 'Unnamed: 5'], errors='ignore')

In [9]:
# Preview the first five rows again to inspect the remaining columns.
data.head(n=5)

Unnamed: 0,id,sentiment,date,text
0,623495523,1,Mon Dec 01 20:46:01 +0000 2014,WTF MY BATTERY WAS 31% ONE SECOND AGO AND NOW ...
1,623495527,1,Mon Dec 01 21:09:50 +0000 2014,@apple Contact sync between Yosemite and iOS8 ...
2,623495529,1,Mon Dec 01 21:35:14 +0000 2014,WARNING IF YOU BUY AN IPHONE 5S UNLOCKED FROM ...
3,623495536,1,Mon Dec 01 23:55:55 +0000 2014,"@Apple, For the love of GAWD, CENTER the '1'on..."
4,623495537,1,Tue Dec 02 00:06:05 +0000 2014,i get the storage almost full notification lit...


In [10]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# split identical on every run.
tweet_texts = data['text']
sentiment_labels = data['sentiment']

train_data, test_data, train_labels, test_labels = train_test_split(
    tweet_texts,
    sentiment_labels,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Feature extraction.
# Every vectorizer is fitted on the training texts only and then applied,
# unchanged, to the test texts. Tuple elements are evaluated left to right,
# so each fit_transform() runs before the matching transform().

# Bag of words: raw term counts
count_vectorizer = CountVectorizer()
X_train_count, X_test_count = (
    count_vectorizer.fit_transform(train_data),
    count_vectorizer.transform(test_data),
)

# Bag of words: TF-IDF weighted terms
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(train_data),
    tfidf_vectorizer.transform(test_data),
)

# Word n-gram counts covering unigrams, bigrams and trigrams
ngram_vectorizer = CountVectorizer(ngram_range=(1, 3))
X_train_ngram, X_test_ngram = (
    ngram_vectorizer.fit_transform(train_data),
    ngram_vectorizer.transform(test_data),
)

# Classifiers to compare. Each one is fitted and scored by the evaluation
# loop that follows.
classifiers = [
    MultinomialNB(),
    # max_iter raised above the library default to give the solver more
    # iterations to converge on the sparse text features.
    LogisticRegression(max_iter=1000),
    # Seeded so the forest (bootstrap samples / feature sub-sampling) is the
    # same on every run; without it the reported scores change between runs
    # even though the train/test split itself is fixed with random_state=42.
    RandomForestClassifier(random_state=42),
    SVC(),
    Perceptron(),
]

# Fit every classifier on the raw-count bag-of-words features, score it on the
# held-out test set, and collect one row of metrics per model in `results`.
# Row layout: [name, accuracy, precision/recall/F1 (micro), precision/recall/F1 (macro)].
# NOTE(review): only X_train_count / X_test_count are evaluated here; the
# TF-IDF and n-gram matrices are not used by this loop - TODO confirm intent.
results = []

for classifier in classifiers:
    classifier_name = classifier.__class__.__name__

    # scikit-learn reports convergence / undefined-metric problems through the
    # `warnings` module rather than by raising, so `except ConvergenceWarning`
    # never runs under the default warning filters. Record the warnings
    # instead, so they can be printed next to the classifier that caused them
    # while its scores are still kept.
    with warnings.catch_warnings(record=True) as caught_warnings:
        # Make sure these two categories are always recorded, even if the same
        # warning was already shown once earlier in the session.
        warnings.simplefilter("always", category=ConvergenceWarning)
        warnings.simplefilter("always", category=UndefinedMetricWarning)

        # Training
        classifier.fit(X_train_count, train_labels)

        # Testing
        predictions = classifier.predict(X_test_count)

        # Evaluation metrics. zero_division=0 scores a class with no
        # predicted/true samples as 0 instead of emitting a warning.
        accuracy = metrics.accuracy_score(test_labels, predictions)
        precision_micro = metrics.precision_score(test_labels, predictions, average='micro', zero_division=0)
        recall_micro = metrics.recall_score(test_labels, predictions, average='micro', zero_division=0)
        f1_micro = metrics.f1_score(test_labels, predictions, average='micro', zero_division=0)

        precision_macro = metrics.precision_score(test_labels, predictions, average='macro', zero_division=0)
        recall_macro = metrics.recall_score(test_labels, predictions, average='macro', zero_division=0)
        f1_macro = metrics.f1_score(test_labels, predictions, average='macro', zero_division=0)

    # Report everything that was recorded while this classifier ran.
    for caught in caught_warnings:
        print(f"Warning: {classifier_name} - {caught.message}")

    results.append([classifier_name, accuracy, precision_micro, recall_micro, f1_micro,
                    precision_macro, recall_macro, f1_macro])

In [14]:
# Tabulate the per-classifier scores. The column order matches the order in
# which the values were appended to each row of `results`.
result_columns = [
    'Classifier',
    'Accuracy',
    'Precision (Micro)',
    'Recall (Micro)',
    'F1 Score (Micro)',
    'Precision (Macro)',
    'Recall (Macro)',
    'F1 Score (Macro)',
]
results_df = pd.DataFrame(results, columns=result_columns)
results_df

Unnamed: 0,Classifier,Accuracy,Precision (Micro),Recall (Micro),F1 Score (Micro),Precision (Macro),Recall (Macro),F1 Score (Macro)
0,MultinomialNB,0.726221,0.726221,0.726221,0.726221,0.525596,0.429674,0.42585
1,LogisticRegression,0.748072,0.748072,0.748072,0.748072,0.509881,0.451867,0.462568
2,RandomForestClassifier,0.750643,0.750643,0.750643,0.750643,0.593324,0.444814,0.459873
3,SVC,0.735219,0.735219,0.735219,0.735219,0.586433,0.418947,0.42197
4,Perceptron,0.733933,0.733933,0.733933,0.733933,0.545199,0.463601,0.485623
