In [121]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [107]:
df = pd.read_csv("APsentiments.csv")

In [108]:
df.head()

Unnamed: 0,sentence,sentiment
0,It also increases carbon dioxide emissions whi...,neutral
1,We can already see this happening.\t,negative
2,The ecological disaster is a consequence of no...,postive
3,We may be dealing with an issue with a level o...,negative
4,Preventable chronic diseases are Australiaâ€™s...,negative


In [109]:
df.sentiment.value_counts()

neutral     336
negative    278
postive     185
Name: sentiment, dtype: int64

In [110]:
vectorizer = TfidfVectorizer(use_idf=True, norm="l2", stop_words="english", max_df=0.7)
X = vectorizer.fit_transform(df.sentence)
y = df.sentiment

In [111]:
X.shape, y.shape

((799, 4619), (799,))

In [112]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

## K-NN

In [104]:
score_max = 0                      # Score_max is a temoporay variable to store the max score 
for param in [1, 3, 10, 30]:
    model = KNeighborsClassifier(n_neighbors=param)
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print("k = {}: {}\n{:.3f}, {:.3f}\n".format(param, scores, scores.mean(), scores.std()))
    
    if scores.mean() > score_max:
        score_max = scores.mean()
        param_best = param         # Param_best is a temoporay variable to store the best parameter 
        
print("Highest score : {:.3f} when k = {}".format(score_max, param_best))

k = 1: [0.41071429 0.41964286 0.41071429 0.38392857 0.41441441]
0.408, 0.012

k = 3: [0.41071429 0.41071429 0.41071429 0.4375     0.41441441]
0.417, 0.010

k = 10: [0.44642857 0.48214286 0.36607143 0.4375     0.44144144]
0.435, 0.038

k = 30: [0.47321429 0.46428571 0.36607143 0.46428571 0.45945946]
0.445, 0.040

Highest score : 0.445 when k = 30


In [105]:
def train_test(X_train, X_test, y_train, y_test, classifier):
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    
    print("Train score: {:.2f}".format(classifier.score(X_train, y_train)))
    print("Test score: {:.2f}\n".format(classifier.score(X_test, y_test)))
    print("Classification report:\n{}".format(classification_report(y_test, pred, zero_division=0)))
    print(confusion_matrix(y_test,pred))
    
    return classifier

In [95]:
print("k = {}".format(param_best))
knn = KNeighborsClassifier(n_neighbors=param_best)
knn = train_test(X_train, X_test, y_train, y_test, knn)

k = 30
Train score: 0.49
Test score: 0.41

Classification report:
              precision    recall  f1-score   support

    negative       0.34      0.24      0.28        88
     neutral       0.43      0.73      0.54       105
     postive       0.00      0.00      0.00        47

    accuracy                           0.41       240
   macro avg       0.26      0.32      0.27       240
weighted avg       0.31      0.41      0.34       240

[[21 67  0]
 [28 77  0]
 [13 34  0]]


In [96]:
summary = {}
summary["k-NNs"] = round(knn.score(X_test, y_test), 3)

## Logistic Regression

In [38]:
lr = LogisticRegression()

In [39]:
scores = cross_val_score(lr, X_train, y_train, cv=5)
print("{}\n{:.3f}, {:.3f}".format(scores, scores.mean(), scores.std()))

[0.48214286 0.45535714 0.36607143 0.47321429 0.45945946]
0.447, 0.042


In [41]:
lr = train_test(X_train, X_test, y_train, y_test, lr)

Train score: 0.99
Test score: 0.45

Classification report:
              precision    recall  f1-score   support

    negative       0.43      0.23      0.30        88
     neutral       0.45      0.83      0.59       105
     postive       1.00      0.02      0.04        47

    accuracy                           0.45       240
   macro avg       0.63      0.36      0.31       240
weighted avg       0.55      0.45      0.37       240

[[20 68  0]
 [18 87  0]
 [ 9 37  1]]


In [48]:
summary["Logistic Regression"] = round(lr.score(X_test, y_test), 3)

## Multinomial NB

In [42]:
mnb = MultinomialNB()
mnb

MultinomialNB()

In [43]:
scores = cross_val_score(mnb, X_train, y_train, cv=5)
print("{}\n{:.3f}, {:.3f}".format(scores, scores.mean(), scores.std()))

[0.46428571 0.48214286 0.39285714 0.51785714 0.48648649]
0.469, 0.042


In [44]:
mnb = train_test(X_train, X_test, y_train, y_test, mnb)

Train score: 0.95
Test score: 0.45

Classification report:
              precision    recall  f1-score   support

    negative       0.43      0.25      0.32        88
     neutral       0.46      0.82      0.59       105
     postive       1.00      0.02      0.04        47

    accuracy                           0.45       240
   macro avg       0.63      0.36      0.32       240
weighted avg       0.55      0.45      0.38       240

[[22 66  0]
 [19 86  0]
 [10 36  1]]


In [49]:
summary["Multinomial Naive Bayes"] = round(mnb.score(X_test, y_test), 3)

## Modeling with Linear Support Vector Machines (SVMs)

In [51]:
svm = LinearSVC(C=1)
svm

LinearSVC(C=1)

In [52]:
svm = train_test(X_train, X_test, y_train, y_test, svm)

Train score: 1.00
Test score: 0.43

Classification report:
              precision    recall  f1-score   support

    negative       0.41      0.35      0.38        88
     neutral       0.44      0.64      0.52       105
     postive       0.36      0.11      0.16        47

    accuracy                           0.43       240
   macro avg       0.40      0.37      0.36       240
weighted avg       0.42      0.43      0.40       240

[[31 55  2]
 [31 67  7]
 [13 29  5]]


In [53]:
score_max = 0
for param in [0.01, 0.03, 0.1, 0.3, 1, 3, 10]:
    model = LinearSVC(C=param)
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print("C = {}: {}\n{:.3f}, {:.3f}\n".format(param, scores, scores.mean(), scores.std()))
    
    if scores.mean() > score_max:
        score_max = scores.mean()
        param_best = param
        
print("Highest score : {:.3f} when C = {}".format(score_max, param_best))

C = 0.01: [0.41071429 0.41071429 0.41071429 0.41964286 0.41441441]
0.413, 0.004

C = 0.03: [0.41071429 0.41964286 0.41071429 0.41071429 0.41441441]
0.413, 0.004

C = 0.1: [0.47321429 0.45535714 0.375      0.47321429 0.45945946]
0.447, 0.037

C = 0.3: [0.4375     0.49107143 0.375      0.49107143 0.45045045]
0.449, 0.043

C = 1: [0.48214286 0.44642857 0.33035714 0.48214286 0.43243243]
0.435, 0.056

C = 3: [0.49107143 0.44642857 0.34821429 0.46428571 0.41441441]
0.433, 0.049

C = 10: [0.48214286 0.42857143 0.375      0.4375     0.41441441]
0.428, 0.035

Highest score : 0.449 when C = 0.3


In [54]:
print("C = {}".format(param_best))
svm = LinearSVC(C=param_best)
svm = train_test(X_train, X_test, y_train, y_test, svm)

C = 0.3
Train score: 1.00
Test score: 0.45

Classification report:
              precision    recall  f1-score   support

    negative       0.45      0.31      0.36        88
     neutral       0.45      0.75      0.56       105
     postive       0.40      0.04      0.08        47

    accuracy                           0.45       240
   macro avg       0.43      0.37      0.34       240
weighted avg       0.44      0.45      0.40       240

[[27 61  0]
 [23 79  3]
 [10 35  2]]


In [55]:
summary["Linear SVMs"] = round(svm.score(X_test, y_test), 3)

## Modeling with Kernelized Support Vector Machines (KSVMs)

In [57]:
ksvm = SVC(C=1, kernel="rbf", gamma="scale")
ksvm

SVC(C=1)

In [58]:
ksvm = train_test(X_train, X_test, y_train, y_test, ksvm)

Train score: 1.00
Test score: 0.45

Classification report:
              precision    recall  f1-score   support

    negative       0.67      0.05      0.09        88
     neutral       0.45      1.00      0.62       105
     postive       0.00      0.00      0.00        47

    accuracy                           0.45       240
   macro avg       0.37      0.35      0.23       240
weighted avg       0.44      0.45      0.30       240

[[  4  84   0]
 [  0 105   0]
 [  2  45   0]]


In [59]:
score_max = 0
for param in [0.01, 0.03, 0.1, 0.3, 1, 3, 10]:
    model = SVC(C=param, kernel="rbf", gamma="scale")
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print("C = {}: {}\n{:.3f}, {:.3f}\n".format(param, scores, scores.mean(), scores.std()))
    
    if scores.mean() > score_max:
        score_max = scores.mean()
        param_best = param
        
print("Highest score : {:.3f} when C = {}".format(score_max, param_best))

C = 0.01: [0.41071429 0.41071429 0.41071429 0.41964286 0.41441441]
0.413, 0.004

C = 0.03: [0.41071429 0.41071429 0.41071429 0.41964286 0.41441441]
0.413, 0.004

C = 0.1: [0.41071429 0.41071429 0.41071429 0.41964286 0.41441441]
0.413, 0.004

C = 0.3: [0.41071429 0.41071429 0.41071429 0.41964286 0.41441441]
0.413, 0.004

C = 1: [0.41071429 0.4375     0.41964286 0.40178571 0.42342342]
0.419, 0.012

C = 3: [0.47321429 0.46428571 0.375      0.48214286 0.45045045]
0.449, 0.038

C = 10: [0.47321429 0.46428571 0.375      0.48214286 0.45045045]
0.449, 0.038

Highest score : 0.449 when C = 3


In [60]:
print("C = {}".format(param_best))
ksvm = SVC(C=param_best)
ksvm = train_test(X_train, X_test, y_train, y_test, ksvm)

C = 3
Train score: 1.00
Test score: 0.46

Classification report:
              precision    recall  f1-score   support

    negative       0.45      0.24      0.31        88
     neutral       0.46      0.84      0.59       105
     postive       1.00      0.02      0.04        47

    accuracy                           0.46       240
   macro avg       0.64      0.37      0.32       240
weighted avg       0.56      0.46      0.38       240

[[21 67  0]
 [17 88  0]
 [ 9 37  1]]


In [61]:
summary["Kernelized SVMs"] = round(ksvm.score(X_test, y_test), 3)

## Modeling with Neural Networks

In [64]:
mlp = MLPClassifier(hidden_layer_sizes=(10, ), activation="relu", random_state=0)
mlp

MLPClassifier(hidden_layer_sizes=(10,), random_state=0)

In [65]:
mlp = train_test(X_train, X_test, y_train, y_test, mlp)

Train score: 1.00
Test score: 0.44

Classification report:
              precision    recall  f1-score   support

    negative       0.40      0.39      0.40        88
     neutral       0.47      0.62      0.53       105
     postive       0.35      0.13      0.19        47

    accuracy                           0.44       240
   macro avg       0.41      0.38      0.37       240
weighted avg       0.42      0.44      0.41       240

[[34 49  5]
 [34 65  6]
 [16 25  6]]




In [66]:
score_max = 0
for param in [10, 30, 100]:
    model = MLPClassifier(hidden_layer_sizes=(param, ), activation="relu", random_state=0)
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print("hidden_layer_size = {}: {}\n{:.3f}, {:.3f}\n".format(param, scores, scores.mean(), scores.std()))
    
    if scores.mean() > score_max:
        score_max = scores.mean()
        param_best = param
        
print("Highest score : {:.3f} when hidden_layer_sizes = {}".format(score_max, param_best))



hidden_layer_size = 10: [0.5        0.47321429 0.30357143 0.42857143 0.45045045]
0.431, 0.068





hidden_layer_size = 30: [0.44642857 0.44642857 0.32142857 0.41964286 0.46846847]
0.420, 0.052

hidden_layer_size = 100: [0.46428571 0.45535714 0.33928571 0.41964286 0.45045045]
0.426, 0.046

Highest score : 0.431 when hidden_layer_sizes = 10


In [67]:
print("hidden_layer_size = {}".format(param_best))
mlp = MLPClassifier(hidden_layer_sizes=(param_best, ), random_state=0)
mlp = train_test(X_train, X_test, y_train, y_test, mlp)

hidden_layer_size = 10
Train score: 1.00
Test score: 0.44

Classification report:
              precision    recall  f1-score   support

    negative       0.40      0.39      0.40        88
     neutral       0.47      0.62      0.53       105
     postive       0.35      0.13      0.19        47

    accuracy                           0.44       240
   macro avg       0.41      0.38      0.37       240
weighted avg       0.42      0.44      0.41       240

[[34 49  5]
 [34 65  6]
 [16 25  6]]




In [68]:
summary["Neural Networks"] = round(mlp.score(X_test, y_test), 3)

## Decision Tree Classifier

In [115]:
dt = tree.DecisionTreeClassifier()
dt

DecisionTreeClassifier()

In [116]:
scores = cross_val_score(dt, X_train, y_train, cv=5)
print("{}\n{:.3f}, {:.3f}".format(scores, scores.mean(), scores.std()))

[0.33928571 0.41071429 0.38392857 0.44642857 0.42342342]
0.401, 0.037


In [117]:
dt = train_test(X_train, X_test, y_train, y_test, dt)

Train score: 1.00
Test score: 0.40

Classification report:
              precision    recall  f1-score   support

    negative       0.40      0.28      0.33        88
     neutral       0.43      0.61      0.50       105
     postive       0.22      0.13      0.16        47

    accuracy                           0.40       240
   macro avg       0.35      0.34      0.33       240
weighted avg       0.38      0.40      0.37       240

[[25 55  8]
 [28 64 13]
 [10 31  6]]


In [118]:
summary["Decision Tree"] = round(dt.score(X_test, y_test), 3)

## Random Forest

In [148]:
score_max = 0
for param in [1, 3, 10, 12, 14, 16, 18, 20]:
    model = RandomForestClassifier(max_depth=param)
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print("C = {}: {}\n{:.3f}, {:.3f}\n".format(param, scores, scores.mean(), scores.std()))
    
    if scores.mean() > score_max:
        score_max = scores.mean()
        param_best = param
        
print("Highest score : {:.3f} when C = {}".format(score_max, param_best))

C = 1: [0.41071429 0.41071429 0.41071429 0.41964286 0.41441441]
0.413, 0.004

C = 3: [0.41071429 0.41071429 0.41071429 0.41964286 0.41441441]
0.413, 0.004

C = 10: [0.41071429 0.42857143 0.41964286 0.41071429 0.42342342]
0.419, 0.007

C = 12: [0.42857143 0.41964286 0.41964286 0.41071429 0.42342342]
0.420, 0.006

C = 14: [0.39285714 0.42857143 0.41964286 0.4375     0.42342342]
0.420, 0.015

C = 16: [0.42857143 0.45535714 0.42857143 0.41964286 0.43243243]
0.433, 0.012

C = 18: [0.4375     0.40178571 0.42857143 0.44642857 0.43243243]
0.429, 0.015

C = 20: [0.41964286 0.42857143 0.39285714 0.4375     0.43243243]
0.422, 0.016

Highest score : 0.433 when C = 16


In [149]:
print("C = {}".format(param_best))
rf = RandomForestClassifier(max_depth=param_best)
rf = train_test(X_train, X_test, y_train, y_test, rf)

C = 16
Train score: 0.60
Test score: 0.46

Classification report:
              precision    recall  f1-score   support

    negative       0.57      0.09      0.16        88
     neutral       0.45      0.97      0.62       105
     postive       0.00      0.00      0.00        47

    accuracy                           0.46       240
   macro avg       0.34      0.35      0.26       240
weighted avg       0.41      0.46      0.33       240

[[  8  80   0]
 [  3 102   0]
 [  3  44   0]]


In [141]:
# rf = RandomForestClassifier(max_depth = 16, n_estimators=10)
# rf

RandomForestClassifier(max_depth=16, n_estimators=10)

In [142]:
# scores = cross_val_score(rf, X_train, y_train, cv=5)
# print("{}\n{:.3f}, {:.3f}".format(scores, scores.mean(), scores.std()))

[0.45535714 0.41964286 0.41071429 0.42857143 0.40540541]
0.424, 0.018


In [143]:
# rf = train_test(X_train, X_test, y_train, y_test, rf)

Train score: 0.61
Test score: 0.42

Classification report:
              precision    recall  f1-score   support

    negative       0.41      0.14      0.21        88
     neutral       0.43      0.85      0.57       105
     postive       0.00      0.00      0.00        47

    accuracy                           0.42       240
   macro avg       0.28      0.33      0.26       240
weighted avg       0.34      0.42      0.32       240

[[12 74  2]
 [14 89  2]
 [ 3 44  0]]


In [144]:
summary["Random Forest"] = round(rf.score(X_test, y_test), 3)

In [145]:
summary

{'Logistic Regression': 0.45,
 'Multinomial Naive Bayes': 0.454,
 'Linear SVMs': 0.45,
 'Kernelized SVMs': 0.458,
 'Neural Networks': 0.438,
 'k-NNs': 0.408,
 'Decision Tree': 0.396,
 'Random Forest': 0.421}

## New sentences

In [77]:
text1 = "This is amazing, climate change initiatives have created so many jobs!"
text2 = "I hate the bad idea of hotter temperatures and the horrible fact that ice caps are melting"
text3 = "Ice caps are melting faster each year"

In [78]:
new_texts = [text1, text2, text3]
X_new = vectorizer.transform(new_texts)

In [79]:
lr.predict(X_new)

array(['neutral', 'neutral', 'neutral'], dtype=object)

In [80]:
mnb.predict(X_new)

array(['neutral', 'negative', 'neutral'], dtype='<U8')

In [81]:
svm.predict(X_new)

array(['neutral', 'negative', 'neutral'], dtype=object)

In [82]:
ksvm.predict(X_new)

array(['neutral', 'negative', 'neutral'], dtype=object)

In [83]:
mlp.predict(X_new)

array(['postive', 'negative', 'neutral'], dtype='<U8')

In [120]:
dt.predict(X_new)

array(['postive', 'negative', 'negative'], dtype=object)

In [128]:
rf.predict(X_new)

array(['neutral', 'negative', 'neutral'], dtype=object)