In [1]:
import pandas as pd

import string
import re

import nltk
from nltk.corpus import words
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

---
### Data Loading

First let's read the dataset into a data frame and have a look what is there.

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/valentina-s/cse-stat-416-sp20/master/data/2020-04-30_Coronavirus_Tweets_small.csv')
data.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,text,is_quote,is_retweet,retweet_count,country_code,place_full_name,place_type,verified,lang
0,"Asegura sus beneficios, registra a tu esposa e...",False,False,0.0,,,,False,es
1,"#COVID19 | El Faro conversó con policías, un f...",False,False,11.0,,,,True,es
2,"Si ya era cuestionable la burocracia, lo es má...",False,False,1.0,,,,False,es
3,Las medidas de higiene ayudan a reducir la pro...,False,False,38.0,,,,True,es
4,Cubre tu nariz y boca al estornudar con el áng...,False,False,0.0,,,,False,es


---
### Text Preprocessing

First, we will do several text preprocessing steps. We will:
* limit to English language
* remove URL links
* make lower case
* remove pronunciation
* remove stopwords

In [3]:
# select tweets in English
text_en = data['text'][data['lang']=='en']

In [4]:
# remove URL links
text_en_lr = text_en.apply(lambda x: re.sub(r"https\S+", "", str(x)))

In [5]:
# make lower case
text_en_lr_lc = text_en_lr.apply(lambda x: x.lower())

In [6]:
# remove punctuation
text_en_lr_lc_pr = text_en_lr_lc.apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [7]:
# remove stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Akshat\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [8]:
stop_words = set(stopwords.words('english'))
stop_words.update(['#coronavirus', '#coronavirusoutbreak', '#coronavirusPandemic', '#covid19', '#covid_19', '#epitwitter', '#ihavecorona', 'amp', 'coronavirus', 'covid19','covid-19', 'covidー19'])

text_en_lr_lc_pr_sr = text_en_lr_lc_pr.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

### TF-IDF Matrix


In [9]:
# create TF-IDF matrix
vectorizer = TfidfVectorizer(max_df=0.95)  # ignore words with very high doc frequency
tf_idf = vectorizer.fit_transform(text_en_lr_lc_pr_sr)

# exctract also the words so that we know which feature corresponds to which word
feature_names = vectorizer.get_feature_names()

In [14]:
tf_idf.shape

(198579, 255511)

### Non-negative Matrix Decomposition for Topic Discovery

Next we will use the [NMF](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html) method from `scikit-learn` to extract the topics.

In [11]:
from sklearn.decomposition import NMF

Set up an NMF model with 5 components. So that we all get the same results, please pass these parameters `init = 'nndsvd'` and `random_state = 1`.

In [12]:
# define an NMF model with `n_components = 5`
nmf = NMF(n_components=5, init='nndsvd', random_state=1)

In [15]:
nmf.fit(tf_idf)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0,
  max_iter=200, n_components=5, random_state=1, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

The topics are stored within the object `nmf.components_`. Now you can find the weight of each word within a topic. It will be interesting to look at the words corresponding to each topic ordered by their heighest weight. Remember the words corresponding to each topic are stored in `feature_names`, while the weights are stored in `nmf.components_`. You can use the [`argsort()`](https://numpy.org/doc/stable/reference/generated/numpy.argsort.html) function to extract the indeces of the sorted words. Note that `argsort` sorts from lowest to heighest so you need to look at the last values for the ones with heighest weights. You can reverse a list/array with `[::-1]`.

Find the maximum weight of a word in the first topic, and the word which corresponds to it.


In [59]:
HZ = nmf.components_
print("First Topic:")
# max weight in first topic
print(HZ[0,HZ[0,:].argsort()[-1]])
# index of max weight and the word associated with it
print(HZ[0,:].argsort()[-1])
print(feature_names[174646])

First Topic:
2.1784190593311634
174646
people


Create a function `words_from_topic` to extract an ordered list of words in a topic (highest weight first).

In [69]:
def words_from_topic(topic, feature_names):
    ordered_words = []
    arr = topic[1:].argsort()[::-1]
    for ind in arr:
        ordered_words.append(feature_names[ind])
    return(ordered_words)

In [70]:
def print_top_words(components, feature_names, n_top_words):
    """ 
    print_top_words prints the first n_top_words for each topic in components
    """
    for topic_idx, topic in enumerate(components):
        ordered_words = words_from_topic(topic, feature_names)
        message = "Topic #%d: " % (topic_idx+1)
        message += ", ".join(ordered_words[:n_top_words])
        print(message)

In [71]:
print_top_words(nmf.components_, feature_names, 10)

Topic #1: peopl, lockdowm, gesund, homdbs, stavtjoumakaris, likable, onduos, timdraper, knottycommander, daxx98
Topic #2: casero, nevsinmengu, deathresulting, tot, confirmatory, reportdomesticabuse, numbencore, positions, reportout, todate
Topic #3: helord, sprea, apoyototalalgobiernodemexico, selfreported, symptomquestion, downlights, sooncoronavirus, identifiying, slovensko, dailing
Topic #4: uryt, chin, joielovesjax, trumbull, lesvos, druditraj, sayrealdonaldtrump, milling, realdonaldt, deathresulting
Topic #5: pandemiamundial, healtcareheroes, suppor, helord, crisilresearch, workerrelief, gloating, newroselandhosp, reacts, responsable


Next let's look at a specific tweet and the individual contributions of the topics. For that we need to look at the coordinates of the transformed original tf-idf features. That can be obtained through `nmf.fit_transform` method. 

In [72]:
tweets_projected = nmf.fit_transform(tf_idf)

In [81]:
tweets_projected.shape

(198579, 5)

In [74]:
text_en_lr_lc_pr_sr.iloc[0]

'attention seattle shoppers grocery stores working hard keep employees customers safe part help slow spread ☑️ limit trips ☑️ respect special shopping hours ☑️ follow socialdistance guidance stores wegotthisseattle'

In [77]:
tweets_projected[0, 2]

0.025972237275222576

In [75]:
text_en_lr_lc_pr_sr.iloc[1]

'microsoft sees digital reboot pandemic profits'