In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

import time
from datetime import datetime

from IPython.core.display import HTML
import tba3102
import model_evaluation_utils as meu



# Inject CSS so <pre> output keeps its whitespace and is not wrapped, then let
# the tba3102 helper configure pandas (presumably display options — confirm in
# that module).
display(HTML("<style>pre { white-space: pre !important; }</style>"))
tba3102.set_default_pandas_options()

# NumPy's global RNG is seeded from the wall clock (whole seconds), so anything
# drawing from np.random directly differs from run to run.  The experiment
# itself uses the fixed `random_state` below; the trailing comment records the
# expression that can be swapped in to draw a fresh one.
np.random.seed(round(time.time()))
random_state = 1559518326  # np.random.randint(2**31-1)
print('random_state:', random_state)

random_state: 1559518326


In [2]:
def classify(model, train_features, train_label, test_features, test_label):
    """Fit ``model`` and print cross-validation and hold-out test metrics.

    Steps, in order:
      1. fit ``model`` on the training features/labels;
      2. run 5-fold cross-validation on the training data and print the
         per-fold scores and their mean;
      3. print ``model.score`` on the test data;
      4. predict the test data and pass the predictions to the
         ``model_evaluation_utils`` reporting helpers;
      5. print the confusion matrix.

    Args:
        model: estimator exposing ``fit``, ``score`` and ``predict``.
        train_features: feature matrix used for fitting and cross-validation.
        train_label: labels aligned with ``train_features``.
        test_features: feature matrix used for the hold-out evaluation.
        test_label: labels aligned with ``test_features``; its distinct values
            define the classes that are reported.

    Returns:
        None. All results are printed.
    """
    model.fit(train_features, train_label)

    # NOTE(review): cross_val_score is expected to fit its own per-fold copies
    # of the estimator, leaving the fit above intact for the test-set scoring
    # below — confirm against the scikit-learn documentation.
    cv_scores = cross_val_score(model, train_features, train_label, cv=5)
    cv_mean_score = np.mean(cv_scores)

    print('Training Accuracy (5-fold):', cv_scores)
    print('Mean Training Accuracy:', cv_mean_score)

    test_score = model.score(test_features, test_label)
    print('Testing Accuracy:', test_score)

    predictions = model.predict(test_features)

    # Sort the class labels: iterating a bare set of strings yields an order
    # that depends on per-process hash randomisation, which would make the
    # row/column order of the (unlabelled) confusion matrix printed below
    # change from one kernel session to the next.
    unique_classes = sorted(set(test_label))

    meu.get_metrics(true_labels=test_label, predicted_labels=predictions)

    meu.display_classification_report(true_labels=test_label, predicted_labels=predictions, classes=unique_classes)

    # Rows are true classes and columns predicted classes, both in the sorted
    # `unique_classes` order passed via `labels`.
    print(metrics.confusion_matrix(y_true=test_label, y_pred=predictions, labels=unique_classes))

In [3]:
# Record the wall-clock time at which the experiment cells start running.
print(f'Text processing started at {datetime.now()}')

Text processing started at 2023-04-04 20:41:44.534984


In [4]:
# Experiment grid: every cleaned dataset variant x {BOW-TF, BOW-TFIDF} features
# x {Logistic Regression, linear SVM}.  A single estimator instance of each
# kind is reused for all combinations; classify() calls fit() on it each time.

# Logistic Regression
lr = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=random_state, solver='lbfgs')

# Support Vector Machines
svm = LinearSVC(penalty='l2', max_iter=10000, C=1, random_state=random_state)

datasets = ['without_emoticon', 'with_emoticon', 'with_emoticon_nospelling_nolemma']
supervised_learning_models = {'Logistic Regression':lr, 'Support Vector Machines':svm}

for dataset in datasets:

    dataset_filename = 'training_16000_cleaned_' + dataset + '.csv'

    df = pd.read_csv('../data/' + dataset_filename, encoding='ISO-8859-1')

    # Map the numeric polarity codes to readable class names.  Series.replace
    # builds a new column instead of writing strings into the integer column
    # through .loc, which newer pandas flags as an incompatible-dtype
    # assignment.  Values other than 0 and 4 are left untouched.
    df['polarity'] = df['polarity'].replace({0: 'negative', 4: 'positive'})

    # 70/30 stratified split; the fixed random_state keeps the split identical
    # across runs.
    train_corpus, test_corpus, train_label, test_label = train_test_split(
        np.array(df['cleaned_tweet']),
        np.array(df['polarity']),
        test_size=0.30,
        shuffle=True,
        random_state=random_state,
        stratify=df['polarity'])

    # build BOW features (raw counts over unigrams and bigrams); the vocabulary
    # is learned from the training split only and then applied to the test split
    bow_tf = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))
    bow_tf_train_features = bow_tf.fit_transform(train_corpus)
    bow_tf_test_features = bow_tf.transform(test_corpus)

    # build TFIDF features (same n-gram range, fitted on the training split only)
    bow_tfidf = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2))
    bow_tfidf_train_features = bow_tfidf.fit_transform(train_corpus)
    bow_tfidf_test_features = bow_tfidf.transform(test_corpus)

    feature_engineering_models = {'BOW-TF':{'train':bow_tf_train_features, 'test':bow_tf_test_features},
                                  'BOW-TFIDF':{'train':bow_tfidf_train_features, 'test':bow_tfidf_test_features}}

    for fg_key, features in feature_engineering_models.items():

        for sl_key, model in supervised_learning_models.items():

            print('{} {} {}'.format(dataset, fg_key, sl_key))
            print('-' * 80)

            classify(model, features['train'], train_label, features['test'], test_label)

            print('=' * 80)

without_emoticon BOW-TF Logistic Regression
--------------------------------------------------------------------------------
Training Accuracy (5-fold): [0.73380974 0.73604288 0.74542206 0.73023671 0.73023671]
Mean Training Accuracy: 0.7351496203662349
Testing Accuracy: 0.7447384871848302
Accuracy: 0.7447
Precision: 0.7452
Recall: 0.7447
F1 Score: 0.7446
              precision    recall  f1-score   support

    negative       0.76      0.72      0.74      2399
    positive       0.73      0.77      0.75      2400

    accuracy                           0.74      4799
   macro avg       0.75      0.74      0.74      4799
weighted avg       0.75      0.74      0.74      4799

[[1735  664]
 [ 561 1839]]
without_emoticon BOW-TF Support Vector Machines
--------------------------------------------------------------------------------
Training Accuracy (5-fold): [0.72711032 0.7172845  0.74542206 0.72085753 0.7204109 ]
Mean Training Accuracy: 0.7262170611880304
Testing Accuracy: 0.731402375494

In [5]:
# Record the wall-clock time at which the experiment cells finished.
print(f'Text processing ended at {datetime.now()}')

Text processing ended at 2023-04-04 20:42:13.909208
