In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

In [14]:
df = pd.read_csv("AP_JJ_SAsentiments.csv")

In [15]:
df.head()

Unnamed: 0,sentence,sentiment
0,It also increases carbon dioxide emissions whi...,neutral
1,We can already see this happening.\t,negative
2,The ecological disaster is a consequence of no...,positive
3,We may be dealing with an issue with a level o...,negative
4,Preventable chronic diseases are Australiaâ€™s...,negative


In [16]:
df.sentiment.value_counts()

neutral     1250
negative     681
positive     469
Name: sentiment, dtype: int64

In [17]:
vectorizer = TfidfVectorizer(use_idf=True, norm="l2", stop_words="english", max_df=0.7)
X = vectorizer.fit_transform(df.sentence)
y = df.sentiment

In [18]:
X.shape, y.shape

((2400, 8186), (2400,))

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

## K-NN

In [20]:
score_max = 0                      # Score_max is a temoporay variable to store the max score 
for param in [1, 3, 10, 30]:
    model = KNeighborsClassifier(n_neighbors=param)
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print("k = {}: {}\n{:.3f}, {:.3f}\n".format(param, scores, scores.mean(), scores.std()))
    
    if scores.mean() > score_max:
        score_max = scores.mean()
        param_best = param         # Param_best is a temoporay variable to store the best parameter 
        
print("Highest score : {:.3f} when k = {}".format(score_max, param_best))

k = 1: [0.51785714 0.51488095 0.52380952 0.51785714 0.52380952]
0.520, 0.004

k = 3: [0.29464286 0.52380952 0.52380952 0.52678571 0.52678571]
0.479, 0.092

k = 10: [0.48511905 0.49404762 0.52380952 0.52380952 0.38095238]
0.482, 0.053

k = 30: [0.51785714 0.52083333 0.52380952 0.5297619  0.47619048]
0.514, 0.019

Highest score : 0.520 when k = 1


In [21]:
def train_test(X_train, X_test, y_train, y_test, classifier):
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    
    print("Train score: {:.2f}".format(classifier.score(X_train, y_train)))
    print("Test score: {:.2f}\n".format(classifier.score(X_test, y_test)))
    print("Classification report:\n{}".format(classification_report(y_test, pred, zero_division=0)))
    print(confusion_matrix(y_test,pred))
    
    return classifier

In [22]:
print("k = {}".format(param_best))
knn = KNeighborsClassifier(n_neighbors=param_best)
knn = train_test(X_train, X_test, y_train, y_test, knn)

k = 1
Train score: 1.00
Test score: 0.51

Classification report:
              precision    recall  f1-score   support

    negative       0.29      0.01      0.02       218
     neutral       0.51      0.98      0.67       368
    positive       0.00      0.00      0.00       134

    accuracy                           0.51       720
   macro avg       0.27      0.33      0.23       720
weighted avg       0.35      0.51      0.35       720

[[  2 214   2]
 [  4 362   2]
 [  1 133   0]]


In [23]:
summary = {}
summary["k-NNs"] = round(knn.score(X_test, y_test), 3)

## Logistic Regression

In [24]:
lr = LogisticRegression()

In [25]:
scores = cross_val_score(lr, X_train, y_train, cv=5)
print("{}\n{:.3f}, {:.3f}".format(scores, scores.mean(), scores.std()))

[0.54464286 0.53571429 0.54464286 0.55654762 0.53869048]
0.544, 0.007


In [26]:
lr = train_test(X_train, X_test, y_train, y_test, lr)

Train score: 0.83
Test score: 0.53

Classification report:
              precision    recall  f1-score   support

    negative       0.54      0.13      0.21       218
     neutral       0.53      0.95      0.68       368
    positive       0.22      0.01      0.03       134

    accuracy                           0.53       720
   macro avg       0.43      0.36      0.30       720
weighted avg       0.47      0.53      0.41       720

[[ 28 184   6]
 [ 19 348   1]
 [  5 127   2]]


In [27]:
summary["Logistic Regression"] = round(lr.score(X_test, y_test), 3)

## Multinomial NB

In [28]:
mnb = MultinomialNB()
mnb

MultinomialNB()

In [29]:
scores = cross_val_score(mnb, X_train, y_train, cv=5)
print("{}\n{:.3f}, {:.3f}".format(scores, scores.mean(), scores.std()))

[0.52380952 0.53869048 0.5297619  0.54761905 0.54166667]
0.536, 0.009


In [30]:
mnb = train_test(X_train, X_test, y_train, y_test, mnb)

Train score: 0.70
Test score: 0.53

Classification report:
              precision    recall  f1-score   support

    negative       0.92      0.06      0.10       218
     neutral       0.52      1.00      0.68       368
    positive       0.00      0.00      0.00       134

    accuracy                           0.53       720
   macro avg       0.48      0.35      0.26       720
weighted avg       0.54      0.53      0.38       720

[[ 12 206   0]
 [  1 367   0]
 [  0 134   0]]


In [31]:
summary["Multinomial Naive Bayes"] = round(mnb.score(X_test, y_test), 3)

## Modeling with Linear Support Vector Machines (SVMs)

In [32]:
svm = LinearSVC(C=1)
svm

LinearSVC(C=1)

In [33]:
svm = train_test(X_train, X_test, y_train, y_test, svm)

Train score: 0.99
Test score: 0.50

Classification report:
              precision    recall  f1-score   support

    negative       0.48      0.27      0.34       218
     neutral       0.54      0.78      0.63       368
    positive       0.24      0.11      0.15       134

    accuracy                           0.50       720
   macro avg       0.42      0.39      0.38       720
weighted avg       0.46      0.50      0.46       720

[[ 58 143  17]
 [ 50 287  31]
 [ 13 106  15]]


In [34]:
score_max = 0
for param in [0.01, 0.03, 0.1, 0.3, 1, 3, 10]:
    model = LinearSVC(C=param)
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print("C = {}: {}\n{:.3f}, {:.3f}\n".format(param, scores, scores.mean(), scores.std()))
    
    if scores.mean() > score_max:
        score_max = scores.mean()
        param_best = param
        
print("Highest score : {:.3f} when C = {}".format(score_max, param_best))

C = 0.01: [0.52380952 0.52380952 0.52380952 0.52678571 0.52678571]
0.525, 0.001

C = 0.03: [0.52380952 0.52380952 0.52380952 0.52678571 0.52678571]
0.525, 0.001

C = 0.1: [0.5297619  0.53869048 0.53571429 0.53869048 0.53571429]
0.536, 0.003

C = 0.3: [0.54761905 0.55654762 0.54464286 0.57440476 0.52678571]
0.550, 0.016

C = 1: [0.5327381  0.5297619  0.55357143 0.56845238 0.50892857]
0.539, 0.021

C = 3: [0.53571429 0.52678571 0.51190476 0.55952381 0.50892857]
0.529, 0.018

C = 10: [0.51785714 0.53571429 0.51785714 0.54166667 0.48214286]
0.519, 0.021

Highest score : 0.550 when C = 0.3


In [35]:
print("C = {}".format(param_best))
svm = LinearSVC(C=param_best)
svm = train_test(X_train, X_test, y_train, y_test, svm)

C = 0.3
Train score: 0.95
Test score: 0.53

Classification report:
              precision    recall  f1-score   support

    negative       0.53      0.19      0.28       218
     neutral       0.54      0.90      0.67       368
    positive       0.20      0.03      0.05       134

    accuracy                           0.53       720
   macro avg       0.42      0.38      0.34       720
weighted avg       0.47      0.53      0.44       720

[[ 42 167   9]
 [ 28 333   7]
 [  9 121   4]]


In [36]:
summary["Linear SVMs"] = round(svm.score(X_test, y_test), 3)

## Modeling with Kernelized Support Vector Machines (KSVMs)

In [37]:
ksvm = SVC(C=1, kernel="rbf", gamma="scale")
ksvm

SVC(C=1)

In [38]:
ksvm = train_test(X_train, X_test, y_train, y_test, ksvm)

Train score: 0.99
Test score: 0.51

Classification report:
              precision    recall  f1-score   support

    negative       0.56      0.02      0.04       218
     neutral       0.51      0.99      0.68       368
    positive       0.00      0.00      0.00       134

    accuracy                           0.51       720
   macro avg       0.36      0.34      0.24       720
weighted avg       0.43      0.51      0.36       720

[[  5 212   1]
 [  3 365   0]
 [  1 133   0]]


In [39]:
score_max = 0
for param in [0.01, 0.03, 0.1, 0.3, 1, 3, 10]:
    model = SVC(C=param, kernel="rbf", gamma="scale")
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print("C = {}: {}\n{:.3f}, {:.3f}\n".format(param, scores, scores.mean(), scores.std()))
    
    if scores.mean() > score_max:
        score_max = scores.mean()
        param_best = param
        
print("Highest score : {:.3f} when C = {}".format(score_max, param_best))

C = 0.01: [0.52380952 0.52380952 0.52380952 0.52678571 0.52678571]
0.525, 0.001

C = 0.03: [0.52380952 0.52380952 0.52380952 0.52678571 0.52678571]
0.525, 0.001

C = 0.1: [0.52380952 0.52380952 0.52380952 0.52678571 0.52678571]
0.525, 0.001

C = 0.3: [0.52380952 0.52380952 0.52380952 0.52678571 0.52678571]
0.525, 0.001

C = 1: [0.52380952 0.52678571 0.52678571 0.5327381  0.5297619 ]
0.528, 0.003

C = 3: [0.53869048 0.54464286 0.54166667 0.55059524 0.53571429]
0.542, 0.005

C = 10: [0.53869048 0.54464286 0.54166667 0.55059524 0.53571429]
0.542, 0.005

Highest score : 0.542 when C = 3


In [40]:
print("C = {}".format(param_best))
ksvm = SVC(C=param_best)
ksvm = train_test(X_train, X_test, y_train, y_test, ksvm)

C = 3
Train score: 1.00
Test score: 0.52

Classification report:
              precision    recall  f1-score   support

    negative       0.53      0.13      0.21       218
     neutral       0.53      0.94      0.68       368
    positive       0.18      0.01      0.03       134

    accuracy                           0.52       720
   macro avg       0.41      0.36      0.30       720
weighted avg       0.46      0.52      0.41       720

[[ 28 184   6]
 [ 19 346   3]
 [  6 126   2]]


In [41]:
summary["Kernelized SVMs"] = round(ksvm.score(X_test, y_test), 3)

## Modeling with Neural Networks

In [64]:
mlp = MLPClassifier(hidden_layer_sizes=(10, ), activation="relu", random_state=0)
mlp

MLPClassifier(hidden_layer_sizes=(10,), random_state=0)

In [65]:
mlp = train_test(X_train, X_test, y_train, y_test, mlp)

Train score: 1.00
Test score: 0.44

Classification report:
              precision    recall  f1-score   support

    negative       0.40      0.39      0.40        88
     neutral       0.47      0.62      0.53       105
     postive       0.35      0.13      0.19        47

    accuracy                           0.44       240
   macro avg       0.41      0.38      0.37       240
weighted avg       0.42      0.44      0.41       240

[[34 49  5]
 [34 65  6]
 [16 25  6]]




In [66]:
score_max = 0
for param in [10, 30, 100]:
    model = MLPClassifier(hidden_layer_sizes=(param, ), activation="relu", random_state=0)
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print("hidden_layer_size = {}: {}\n{:.3f}, {:.3f}\n".format(param, scores, scores.mean(), scores.std()))
    
    if scores.mean() > score_max:
        score_max = scores.mean()
        param_best = param
        
print("Highest score : {:.3f} when hidden_layer_sizes = {}".format(score_max, param_best))



hidden_layer_size = 10: [0.5        0.47321429 0.30357143 0.42857143 0.45045045]
0.431, 0.068





hidden_layer_size = 30: [0.44642857 0.44642857 0.32142857 0.41964286 0.46846847]
0.420, 0.052

hidden_layer_size = 100: [0.46428571 0.45535714 0.33928571 0.41964286 0.45045045]
0.426, 0.046

Highest score : 0.431 when hidden_layer_sizes = 10


In [67]:
print("hidden_layer_size = {}".format(param_best))
mlp = MLPClassifier(hidden_layer_sizes=(param_best, ), random_state=0)
mlp = train_test(X_train, X_test, y_train, y_test, mlp)

hidden_layer_size = 10
Train score: 1.00
Test score: 0.44

Classification report:
              precision    recall  f1-score   support

    negative       0.40      0.39      0.40        88
     neutral       0.47      0.62      0.53       105
     postive       0.35      0.13      0.19        47

    accuracy                           0.44       240
   macro avg       0.41      0.38      0.37       240
weighted avg       0.42      0.44      0.41       240

[[34 49  5]
 [34 65  6]
 [16 25  6]]




In [68]:
summary["Neural Networks"] = round(mlp.score(X_test, y_test), 3)

## Decision Tree Classifier

In [42]:
dt = tree.DecisionTreeClassifier()
dt

DecisionTreeClassifier()

In [43]:
scores = cross_val_score(dt, X_train, y_train, cv=5)
print("{}\n{:.3f}, {:.3f}".format(scores, scores.mean(), scores.std()))

[0.49702381 0.49702381 0.5327381  0.52380952 0.49404762]
0.509, 0.016


In [44]:
dt = train_test(X_train, X_test, y_train, y_test, dt)

Train score: 1.00
Test score: 0.51

Classification report:
              precision    recall  f1-score   support

    negative       0.43      0.32      0.37       218
     neutral       0.58      0.74      0.65       368
    positive       0.24      0.17      0.20       134

    accuracy                           0.51       720
   macro avg       0.42      0.41      0.41       720
weighted avg       0.47      0.51      0.48       720

[[ 69 116  33]
 [ 58 272  38]
 [ 33  78  23]]


In [45]:
summary["Decision Tree"] = round(dt.score(X_test, y_test), 3)

## Random Forest

In [46]:
score_max = 0
for param in [1, 3, 10, 12, 14, 16, 18, 20]:
    model = RandomForestClassifier(max_depth=param)
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print("C = {}: {}\n{:.3f}, {:.3f}\n".format(param, scores, scores.mean(), scores.std()))
    
    if scores.mean() > score_max:
        score_max = scores.mean()
        param_best = param
        
print("Highest score : {:.3f} when C = {}".format(score_max, param_best))

C = 1: [0.52380952 0.52380952 0.52380952 0.52678571 0.52678571]
0.525, 0.001

C = 3: [0.52380952 0.52380952 0.52380952 0.52678571 0.52678571]
0.525, 0.001

C = 10: [0.52678571 0.52380952 0.52380952 0.52678571 0.5297619 ]
0.526, 0.002

C = 12: [0.52380952 0.52380952 0.52380952 0.52678571 0.52678571]
0.525, 0.001

C = 14: [0.52678571 0.52380952 0.52380952 0.52678571 0.5327381 ]
0.527, 0.003

C = 16: [0.52380952 0.52380952 0.52380952 0.52678571 0.5297619 ]
0.526, 0.002

C = 18: [0.52380952 0.52083333 0.52380952 0.5297619  0.5297619 ]
0.526, 0.004

C = 20: [0.52678571 0.52678571 0.52678571 0.5297619  0.5297619 ]
0.528, 0.001

Highest score : 0.528 when C = 20


In [47]:
print("C = {}".format(param_best))
rf = RandomForestClassifier(max_depth=param_best)
rf = train_test(X_train, X_test, y_train, y_test, rf)

C = 20
Train score: 0.57
Test score: 0.51

Classification report:
              precision    recall  f1-score   support

    negative       0.40      0.01      0.02       218
     neutral       0.51      0.99      0.68       368
    positive       0.00      0.00      0.00       134

    accuracy                           0.51       720
   macro avg       0.30      0.33      0.23       720
weighted avg       0.38      0.51      0.35       720

[[  2 216   0]
 [  2 366   0]
 [  1 133   0]]


In [141]:
# rf = RandomForestClassifier(max_depth = 16, n_estimators=10)
# rf

RandomForestClassifier(max_depth=16, n_estimators=10)

In [142]:
# scores = cross_val_score(rf, X_train, y_train, cv=5)
# print("{}\n{:.3f}, {:.3f}".format(scores, scores.mean(), scores.std()))

[0.45535714 0.41964286 0.41071429 0.42857143 0.40540541]
0.424, 0.018


In [143]:
# rf = train_test(X_train, X_test, y_train, y_test, rf)

Train score: 0.61
Test score: 0.42

Classification report:
              precision    recall  f1-score   support

    negative       0.41      0.14      0.21        88
     neutral       0.43      0.85      0.57       105
     postive       0.00      0.00      0.00        47

    accuracy                           0.42       240
   macro avg       0.28      0.33      0.26       240
weighted avg       0.34      0.42      0.32       240

[[12 74  2]
 [14 89  2]
 [ 3 44  0]]


In [48]:
summary["Random Forest"] = round(rf.score(X_test, y_test), 3)

In [49]:
summary

{'k-NNs': 0.506,
 'Logistic Regression': 0.525,
 'Multinomial Naive Bayes': 0.526,
 'Linear SVMs': 0.526,
 'Kernelized SVMs': 0.522,
 'Decision Tree': 0.506,
 'Random Forest': 0.511}

## New sentences

In [50]:
text1 = "This is amazing, climate change initiatives have created so many jobs!"
text2 = "I hate the bad idea of hotter temperatures and the horrible fact that ice caps are melting"
text3 = "Ice caps are melting faster each year"

In [51]:
new_texts = [text1, text2, text3]
X_new = vectorizer.transform(new_texts)

In [52]:
lr.predict(X_new)

array(['neutral', 'neutral', 'neutral'], dtype=object)

In [53]:
mnb.predict(X_new)

array(['neutral', 'neutral', 'neutral'], dtype='<U8')

In [54]:
svm.predict(X_new)

array(['neutral', 'neutral', 'neutral'], dtype=object)

In [55]:
ksvm.predict(X_new)

array(['neutral', 'neutral', 'neutral'], dtype=object)

In [56]:
mlp.predict(X_new)

NameError: name 'mlp' is not defined

In [57]:
dt.predict(X_new)

array(['neutral', 'positive', 'neutral'], dtype=object)

In [58]:
rf.predict(X_new)

array(['neutral', 'neutral', 'neutral'], dtype=object)

## Clustering

### K-means

In [59]:
df = df.drop_duplicates(["sentence"], keep="first").copy()

In [60]:
from nltk.corpus import stopwords
import string 

global_stopwords = stopwords.words("english")
local_stopwords = [c for c in string.punctuation] +\
                  ['’', '``', '…', '...', "''", '‘', '“', '”', "'m", "'re", "'s", "'ve", 'amp', 'https', "n't", 'rt', 
                   'covid19', 'coronavirus', 'covid19…', 'covid']

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(use_idf=True, norm="l2", stop_words=global_stopwords+local_stopwords, max_df=0.7)
X = vectorizer.fit_transform(df.sentence)

In [62]:
X.shape

(2375, 8319)

In [63]:
k = 10
kmeans = KMeans(n_clusters = k, random_state=0)
kmeans

KMeans(n_clusters=10, random_state=0)

In [64]:
%time kmeans.fit(X)

CPU times: user 8.9 s, sys: 22.4 ms, total: 8.92 s
Wall time: 1.22 s


KMeans(n_clusters=10, random_state=0)

In [65]:
kmeans.cluster_centers_

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00333934, 0.        , 0.00242854, ..., 0.        , 0.        ,
        0.        ],
       [0.00109347, 0.0004104 , 0.        , ..., 0.00083695, 0.00035438,
        0.        ],
       ...,
       [0.00333804, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [66]:
kmeans.cluster_centers_.shape

(10, 8319)

In [67]:
kmeans.labels_

array([3, 2, 4, ..., 3, 6, 0], dtype=int32)

In [68]:
df["cluster"] = kmeans.labels_

In [69]:
df[["sentence", "cluster"]]

Unnamed: 0,sentence,cluster
0,It also increases carbon dioxide emissions whi...,3
1,We can already see this happening.\t,2
2,The ecological disaster is a consequence of no...,4
3,We may be dealing with an issue with a level o...,2
4,Preventable chronic diseases are Australiaâ€™s...,0
...,...,...
2395,Earthâ€™s cornucopia of life has evolved over ...,0
2396,This is because they seem to be more effective...,2
2397,Are climate scientists saying,3
2398,[Understand new developments in science,6


In [70]:
df.cluster.value_counts()

2    989
4    251
3    236
5    229
0    225
1    140
6    121
8     72
9     59
7     53
Name: cluster, dtype: int64

In [71]:
counts = df.cluster.value_counts()

In [72]:
df[df.cluster == counts.idxmin()].sample(10, random_state=1)[["sentence", "cluster"]] #largest cluster

Unnamed: 0,sentence,cluster
1271,"For example, a failure in the water supply wil...",7
55,But the evidence on this is not clear and a re...,7
2170,I think if we were to pursue an â€œeither/orâ€...,7
1296,"More specifically, the first anniversaries of ...",7
1278,"At the same time, adaptation measures may bene...",7
1918,Research has also found that prolonged exposur...,7
1320,Climate change is the defining issue of our ti...,7
1461,"In other words, the waiting time for the recen...",7
1908,In time steps of 500 years,7
855,Iron ore and coal led the way.About the same t...,7


In [73]:
import nltk
df["words"] = df.sentence.apply(lambda x: nltk.word_tokenize(x))
df["tagged_words"] = df.words.apply(lambda x: nltk.pos_tag(x))

from collections import Counter

def get_counter(dataframe, stopwords=[]):
    counter = Counter()
    
    for l in dataframe.tagged_words:
        word_set = set()

        for t in l:
            word = t[0].lower()
            tag = t[1]

            if word not in stopwords:
                word_set.add(word)
            
        counter.update(word_set)
        
    return counter

In [54]:
counter_max = get_counter(df[df.cluster == counts.idxmax()], global_stopwords+local_stopwords)
counter_max.most_common(30)

[('example', 12),
 ('good', 9),
 ('may', 8),
 ('also', 7),
 ('much', 6),
 ('use', 6),
 ('areas', 5),
 ('problem', 5),
 ('view', 5),
 ('world', 5),
 ('provide', 5),
 ('federal', 4),
 ('australia', 4),
 ('know', 4),
 ('whether', 4),
 ('sea', 4),
 ('summer', 4),
 ('many', 4),
 ('could', 4),
 ('means', 4),
 ('canâ€™t', 4),
 ('people', 4),
 ('year', 4),
 ('like', 4),
 ('would', 4),
 ('made', 4),
 ('health', 3),
 ('conditions', 3),
 ('quickly', 3),
 ('since', 3)]

In [74]:
counter_min = get_counter(df[df.cluster == counts.idxmin()], global_stopwords+local_stopwords)
counter_min.most_common(30)

[('time', 49),
 ('climate', 7),
 ('change', 6),
 ('people', 5),
 ('first', 5),
 ('may', 4),
 ('years', 4),
 ('others', 3),
 ('look', 3),
 ('communities', 3),
 ('found', 3),
 ('high', 3),
 ('around', 3),
 ('â€', 3),
 ('carbon', 3),
 ('likely', 3),
 ('increased', 3),
 ('events', 2),
 ('challenges', 2),
 ('immediate', 2),
 ('still', 2),
 ('buildings', 2),
 ('whether', 2),
 ('environment', 2),
 ('clear', 2),
 ('effect', 2),
 ('fixed', 2),
 ('followed', 2),
 ('rainfall', 2),
 ('period', 2)]

### LDA Topic Modeling

In [75]:
global_stopwords = stopwords.words("english")
local_stopwords = [c for c in string.punctuation] +\
                  ['’', '``', '…', '...', "''", '‘', '“', '”', "'m", "'re", "'s", "'ve", 'amp', 'https', "n't", 'rt', 
                   'covid19', 'coronavirus', 'covid19…', 'covid', 'co', 'cases']

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(use_idf=True, norm="l2", stop_words=global_stopwords+local_stopwords, max_df=0.7)
X = vectorizer.fit_transform(df.sentence)

In [76]:
num_topics = 3

In [77]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

lda = LDA(n_components=num_topics, random_state=0)     # LDA uses randomness to get a probability distribution
lda

LatentDirichletAllocation(n_components=3, random_state=0)

In [78]:
%time lda.fit(X)

CPU times: user 3.8 s, sys: 0 ns, total: 3.8 s
Wall time: 3.81 s


LatentDirichletAllocation(n_components=3, random_state=0)

In [79]:
lda.components_

array([[2.32372288, 0.33490431, 0.33453344, ..., 1.15679968, 0.33415654,
        0.33405415],
       [0.92142793, 0.736381  , 0.33416588, ..., 0.33545675, 0.68191458,
        0.5974671 ],
       [1.16130358, 0.3346012 , 0.67129594, ..., 0.33548286, 0.33441315,
        0.33412453]])

In [80]:
lda.components_

array([[2.32372288, 0.33490431, 0.33453344, ..., 1.15679968, 0.33415654,
        0.33405415],
       [0.92142793, 0.736381  , 0.33416588, ..., 0.33545675, 0.68191458,
        0.5974671 ],
       [1.16130358, 0.3346012 , 0.67129594, ..., 0.33548286, 0.33441315,
        0.33412453]])

In [81]:
lda.components_.shape

(3, 8317)

In [82]:
def show_topics(model, feature_names, num_top_words):
    for topic_idx, topic_scores in enumerate(model.components_):
        print("***Topic {}:".format(topic_idx))
        print(" + ".join(["{:.2f} * {}".format(topic_scores[i], feature_names[i]) for i in topic_scores.argsort()[::-1][:num_top_words]]))
        print()

In [83]:
show_topics(lda, vectorizer.get_feature_names(), 10)

***Topic 0:
18.74 * climate + 14.35 * change + 12.48 * global + 10.34 * carbon + 9.03 * years + 9.01 * year + 8.93 * people + 8.60 * emissions + 8.57 * warming + 8.31 * many

***Topic 1:
23.11 * climate + 15.79 * change + 11.90 * ice + 9.19 * time + 8.44 * ocean + 8.21 * also + 7.88 * sea + 7.60 * world + 6.87 * may + 6.84 * emissions

***Topic 2:
11.51 * climate + 7.93 * change + 7.82 * carbon + 7.13 * one + 6.63 * water + 6.57 * species + 6.26 * could + 6.25 * energy + 6.22 * also + 6.21 * research



In [84]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [85]:
pyLDAvis.sklearn.prepare(lda, X, vectorizer)

  and should_run_async(code)
