# Classification Models

## BOW Classifiers

In [64]:
import pandas as pd

# Pull both inputs from the project bucket: the bag-of-words feature table and
# the sampled data frame (its 'sentiment' column is used as the target below).
bow = pd.read_csv(
    'gs://ba820-project-files/bow_50000.csv'
)
sample_df = pd.read_csv(
    'gs://ba820-project-files/sample_50000.csv'
)

In [65]:
# Features are the bag-of-words table; the target is the sentiment label column.
X, y = bow, sample_df['sentiment']

In [66]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [67]:
# Sanity check: display the (rows, columns) shape of the training feature matrix.
X_train.shape

(40000, 31363)

In [68]:
# Sanity check: the training labels should have one entry per training row.
y_train.shape

(40000,)

### Minority Oversampling

In [6]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [7]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE on the training split only (seeded for repeatability).
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(
    X_train,
    y_train,
)

# The test split is not resampled; these aliases only give it matching names.
X_test_smote = X_test
y_test_smote = y_test

### LogReg without SMOTE

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score
from sklearn.metrics import confusion_matrix

# Logistic regression (C=0.01) fitted on the original, un-resampled training split.
model = LogisticRegression(max_iter=1000, C=0.01).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Balanced accuracy, plain accuracy, then the per-class report, one item per line.
print(
    f"Balanced accuracy:  {balanced_accuracy_score(y_test, y_pred)}",
    f"Accuracy: {accuracy_score(y_test, y_pred)}",
    classification_report(y_test, y_pred),
    sep="\n",
)

Balanced accuracy:  0.517361399720158
Accuracy: 0.7585
              precision    recall  f1-score   support

    negative       0.68      0.54      0.60      2200
     neutral       0.44      0.06      0.11      1151
    positive       0.78      0.95      0.86      6649

    accuracy                           0.76     10000
   macro avg       0.63      0.52      0.52     10000
weighted avg       0.72      0.76      0.72     10000



### LogReg with SMOTE

In [11]:
# Same logistic regression settings, but fitted on the SMOTE-resampled training data.
model = LogisticRegression(max_iter=1000, C=0.01).fit(X_train_smote, y_train_smote)
y_pred_smote = model.predict(X_test_smote)

# Evaluation is against the (non-resampled) test aliases.
print(
    f"Balanced accuracy:  {balanced_accuracy_score(y_test_smote, y_pred_smote)}",
    f"Accuracy: {accuracy_score(y_test_smote, y_pred_smote)}",
    classification_report(y_test_smote, y_pred_smote),
    sep="\n",
)

Balanced accuracy:  0.5561664602849078
Accuracy: 0.6792
              precision    recall  f1-score   support

    negative       0.59      0.63      0.61      2200
     neutral       0.20      0.27      0.23      1151
    positive       0.84      0.77      0.80      6649

    accuracy                           0.68     10000
   macro avg       0.54      0.56      0.55     10000
weighted avg       0.71      0.68      0.69     10000



### Naive Bayes without SMOTE

In [69]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score

# Multinomial Naive Bayes (alpha=1) fitted on the original training split.
model = MultinomialNB(alpha=1).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Balanced accuracy, plain accuracy, then the per-class report, one item per line.
print(
    f"Balanced accuracy:  {balanced_accuracy_score(y_test, y_pred)}",
    f"Accuracy: {accuracy_score(y_test, y_pred)}",
    classification_report(y_test, y_pred),
    sep="\n",
)

Balanced accuracy:  0.5495540055208382
Accuracy: 0.7387
              precision    recall  f1-score   support

    negative       0.62      0.60      0.61      2200
     neutral       0.32      0.17      0.22      1151
    positive       0.81      0.88      0.84      6649

    accuracy                           0.74     10000
   macro avg       0.58      0.55      0.56     10000
weighted avg       0.71      0.74      0.72     10000



### Naive Bayes with SMOTE

In [9]:
# Multinomial Naive Bayes (alpha=1) fitted on the SMOTE-resampled training data.
model = MultinomialNB(alpha=1).fit(X_train_smote, y_train_smote)
y_pred_smote = model.predict(X_test_smote)

# Evaluation is against the (non-resampled) test aliases.
print(
    f"Balanced accuracy: {balanced_accuracy_score(y_test_smote, y_pred_smote)}",
    f"Accuracy: {accuracy_score(y_test_smote, y_pred_smote)}",
    classification_report(y_test_smote, y_pred_smote),
    sep="\n",
)

Balanced accuracy: 0.5678622523924242
Accuracy: 0.6754
              precision    recall  f1-score   support

    negative       0.59      0.61      0.60      2200
     neutral       0.22      0.33      0.27      1151
    positive       0.84      0.75      0.79      6649

    accuracy                           0.68     10000
   macro avg       0.55      0.57      0.55     10000
weighted avg       0.71      0.68      0.69     10000



## GloVe-50 Classifiers

In [53]:
# Read the GloVe embedding table from the project bucket.
emd = pd.read_csv(
    'gs://ba820-project-files/GloVe_embeddings.csv',
    header=None,  # treat the first line as data, not as column names
)

In [54]:
# Features are the embedding table; the target is still the sentiment label column.
X, y = emd, sample_df['sentiment']

In [55]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

### Minority Oversampling

In [56]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE on the training split only (seeded for repeatability).
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(
    X_train,
    y_train,
)

# The test split is not resampled; these aliases only give it matching names.
X_test_smote = X_test
y_test_smote = y_test

### LogReg without SMOTE

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score
from sklearn.metrics import confusion_matrix

# Logistic regression (C=10) fitted on the original, un-resampled training split.
model = LogisticRegression(max_iter=1000, C=10).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Balanced accuracy, plain accuracy, then the per-class report, one item per line.
print(
    f"Balanced accuracy:  {balanced_accuracy_score(y_test, y_pred)}",
    f"Accuracy: {accuracy_score(y_test, y_pred)}",
    classification_report(y_test, y_pred),
    sep="\n",
)

Balanced accuracy:  0.4009661512097969
Accuracy: 0.6902
              precision    recall  f1-score   support

    negative       0.56      0.25      0.34      2200
     neutral       0.00      0.00      0.00      1151
    positive       0.70      0.96      0.81      6649

    accuracy                           0.69     10000
   macro avg       0.42      0.40      0.38     10000
weighted avg       0.59      0.69      0.61     10000



### LogReg with SMOTE

In [45]:
# Same logistic regression settings, but fitted on the SMOTE-resampled training data.
model = LogisticRegression(max_iter=1000, C=10).fit(X_train_smote, y_train_smote)
y_pred_smote = model.predict(X_test_smote)

# Evaluation is against the (non-resampled) test aliases.
print(
    f"Balanced accuracy:  {balanced_accuracy_score(y_test_smote, y_pred_smote)}",
    f"Accuracy: {accuracy_score(y_test_smote, y_pred_smote)}",
    classification_report(y_test_smote, y_pred_smote),
    sep="\n",
)

Balanced accuracy:  0.4897822283707856
Accuracy: 0.5182
              precision    recall  f1-score   support

    negative       0.40      0.56      0.47      2200
     neutral       0.17      0.38      0.23      1151
    positive       0.82      0.53      0.64      6649

    accuracy                           0.52     10000
   macro avg       0.46      0.49      0.45     10000
weighted avg       0.65      0.52      0.56     10000



### Naive Bayes without SMOTE

In [46]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score

# Fixed class priors handed to GaussianNB rather than estimated from the data.
# NOTE(review): presumably ordered negative, neutral, positive (sorted labels) — confirm.
priors = [0.22, 0.12, 0.66]

# Gaussian Naive Bayes fitted on the original training split with those priors.
model = GaussianNB(priors=priors).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Balanced accuracy, plain accuracy, then the per-class report, one item per line.
print(
    f"Balanced accuracy:  {balanced_accuracy_score(y_test, y_pred)}",
    f"Accuracy: {accuracy_score(y_test, y_pred)}",
    classification_report(y_test, y_pred),
    sep="\n",
)

Balanced accuracy:  0.4423744723533159
Accuracy: 0.613
              precision    recall  f1-score   support

    negative       0.40      0.36      0.38      2200
     neutral       0.19      0.20      0.20      1151
    positive       0.75      0.77      0.76      6649

    accuracy                           0.61     10000
   macro avg       0.45      0.44      0.44     10000
weighted avg       0.61      0.61      0.61     10000



### Naive Bayes with SMOTE

In [57]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score

# Gaussian Naive Bayes with default settings, fitted on the SMOTE-resampled training data.
model = GaussianNB().fit(X_train_smote, y_train_smote)
y_pred_smote = model.predict(X_test_smote)

# Evaluation is against the (non-resampled) test aliases.
print(
    f"Balanced accuracy:  {balanced_accuracy_score(y_test_smote, y_pred_smote)}",
    f"Accuracy: {accuracy_score(y_test_smote, y_pred_smote)}",
    classification_report(y_test_smote, y_pred_smote),
    sep="\n",
)

Balanced accuracy:  0.4256002420314789
Accuracy: 0.4023
              precision    recall  f1-score   support

    negative       0.40      0.32      0.35      2200
     neutral       0.14      0.55      0.22      1151
    positive       0.75      0.40      0.53      6649

    accuracy                           0.40     10000
   macro avg       0.43      0.43      0.37     10000
weighted avg       0.60      0.40      0.45     10000



## GloVe-300 Classifiers

In [58]:
# Read the 300-dimension GloVe embedding table from the project bucket.
emd = pd.read_csv(
    'gs://ba820-project-files/GloVe_embeddings_50000_300.csv',
    header=None,  # treat the first line as data, not as column names
)

In [59]:
# Features are the embedding table; the target is still the sentiment label column.
X, y = emd, sample_df['sentiment']

In [60]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

### Minority Oversampling

In [61]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE on the training split only (seeded for repeatability).
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(
    X_train,
    y_train,
)

# The test split is not resampled; these aliases only give it matching names.
X_test_smote = X_test
y_test_smote = y_test

### LogReg without SMOTE

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score
from sklearn.metrics import confusion_matrix

# Logistic regression (C=10) fitted on the original, un-resampled training split.
model = LogisticRegression(max_iter=1000, C=10).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Balanced accuracy, plain accuracy, then the per-class report, one item per line.
print(
    f"Balanced accuracy:  {balanced_accuracy_score(y_test, y_pred)}",
    f"Accuracy: {accuracy_score(y_test, y_pred)}",
    classification_report(y_test, y_pred),
    sep="\n",
)

Balanced accuracy:  0.4665031714621901
Accuracy: 0.7205
              precision    recall  f1-score   support

    negative       0.61      0.45      0.52      2200
     neutral       0.27      0.02      0.03      1151
    positive       0.75      0.93      0.83      6649

    accuracy                           0.72     10000
   macro avg       0.54      0.47      0.46     10000
weighted avg       0.66      0.72      0.67     10000



### LogReg with SMOTE

In [31]:
# Same logistic regression settings, but fitted on the SMOTE-resampled training data.
model = LogisticRegression(max_iter=1000, C=10).fit(X_train_smote, y_train_smote)
y_pred_smote = model.predict(X_test_smote)

# Evaluation is against the (non-resampled) test aliases.
print(
    f"Balanced accuracy:  {balanced_accuracy_score(y_test_smote, y_pred_smote)}",
    f"Accuracy: {accuracy_score(y_test_smote, y_pred_smote)}",
    classification_report(y_test_smote, y_pred_smote),
    sep="\n",
)

Balanced accuracy:  0.5545018614944484
Accuracy: 0.5875
              precision    recall  f1-score   support

    negative       0.48      0.61      0.54      2200
     neutral       0.20      0.45      0.28      1151
    positive       0.86      0.60      0.71      6649

    accuracy                           0.59     10000
   macro avg       0.52      0.55      0.51     10000
weighted avg       0.70      0.59      0.62     10000



### Naive Bayes without SMOTE

In [62]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score

# Gaussian Naive Bayes fitted on the original (un-resampled) training split.
# No `priors` argument is passed, so scikit-learn derives the class priors from
# the training labels. The model is constructed exactly as before; only the
# never-used hard-coded `priors` list is gone, so the metrics below are unchanged.
model = GaussianNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Balanced accuracy is reported first, then plain accuracy and the per-class report.
print(f"Balanced accuracy:  {balanced_accuracy_score(y_test, y_pred)}")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Balanced accuracy:  0.46228543506574704
Accuracy: 0.4595
              precision    recall  f1-score   support

    negative       0.40      0.36      0.38      2200
     neutral       0.15      0.55      0.24      1151
    positive       0.81      0.48      0.60      6649

    accuracy                           0.46     10000
   macro avg       0.45      0.46      0.41     10000
weighted avg       0.65      0.46      0.51     10000



### Naive Bayes with SMOTE

In [63]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score

# Gaussian Naive Bayes with default settings, fitted on the SMOTE-resampled training data.
model = GaussianNB().fit(X_train_smote, y_train_smote)
y_pred_smote = model.predict(X_test_smote)

# Evaluation is against the (non-resampled) test aliases.
print(
    f"Balanced accuracy:  {balanced_accuracy_score(y_test_smote, y_pred_smote)}",
    f"Accuracy: {accuracy_score(y_test_smote, y_pred_smote)}",
    classification_report(y_test_smote, y_pred_smote),
    sep="\n",
)

Balanced accuracy:  0.4185628281148344
Accuracy: 0.4067
              precision    recall  f1-score   support

    negative       0.42      0.28      0.34      2200
     neutral       0.14      0.55      0.22      1151
    positive       0.72      0.42      0.53      6649

    accuracy                           0.41     10000
   macro avg       0.43      0.42      0.36     10000
weighted avg       0.59      0.41      0.45     10000

