In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt


from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from wordcloud import WordCloud, STOPWORDS


In [3]:
import os

base_dir = "../topic_model/test/"

# Read every non-hidden regular file in base_dir into memory, one string per
# document. The filter only excludes dot-files (e.g. .DS_Store) and
# sub-directories; it does NOT check for a .txt extension.
all_docs = []

# os.listdir returns entries in arbitrary order; sort them so positional
# look-ups such as all_docs[5] are reproducible across runs and machines.
docs = sorted(os.listdir(base_dir))

for doc in docs:
    # os.path.join works whether or not base_dir ends with a path separator.
    doc_path = os.path.join(base_dir, doc)
    if doc.startswith('.') or not os.path.isfile(doc_path):
        continue
    with open(doc_path, "r") as file:
        text = file.read()
        all_docs.append(text)

# spot-check one loaded document (index 5, i.e. the sixth one) to be sure
all_docs[5]



In [4]:
# Bag-of-words vectorizer that drops scikit-learn's built-in English stop words.
cv = CountVectorizer(stop_words="english")

# Learn the vocabulary from the corpus and count every term in every document.
word_count_vector = cv.fit_transform(all_docs)

# Show the dimensions of the resulting count matrix as the cell output.
word_count_vector.shape

(106, 4445)

In [5]:
# Corpus-wide term totals: summing over axis 0 adds up each column of the
# count matrix, i.e. the occurrences of each word across all texts.
sum_words = word_count_vector.sum(axis=0)

# Pair every vocabulary word with its total (looked up through the word's
# column index) and order the pairs from most to least frequent.
words_freq = sorted(
    ((word, sum_words[0, idx]) for word, idx in cv.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

# display the ten most frequent words
words_freq[:10]

[('li', 1386),
 ('school', 693),
 ('hong', 460),
 ('said', 446),
 ('kong', 414),
 ('nashville', 384),
 ('says', 362),
 ('10', 339),
 ('people', 328),
 ('middle', 276)]

In [None]:
# Fit the IDF model on the word-count matrix computed earlier.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back to the old
# method only on versions that predate it.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One row per vocabulary term, holding the IDF weight learned for it.
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=feature_names, columns=["idf_weights"])

# Display the weights sorted ascending (lowest IDF first).
df_idf.sort_values(by=['idf_weights'])

In [None]:
# Keep the ascending-by-IDF ordering under its own name for the cells below,
# then preview its first 30 rows.
sorted_tfidf = df_idf.sort_values("idf_weights")
sorted_tfidf.head(30)

In [None]:
# Map each word to the reciprocal of its IDF weight so that words with a low
# IDF receive a large value in the frequency dict fed to the word cloud.
#
# Built straight from the column, without adding a 'word' column to
# sorted_tfidf or rebinding it to a NumPy array. The cell can therefore be
# re-run safely and sorted_tfidf stays a DataFrame.
d = {word: 1 / idf for word, idf in sorted_tfidf['idf_weights'].items()}


In [None]:
# Word cloud (IDF version): sized from the values stored in `d`, capped at 75 words.
Cloud = WordCloud(
    width=1000,
    height=600,
    background_color="white",
    max_words=75,
).generate_from_frequencies(d)

# Render the cloud on a large figure with the axes hidden and no padding.
plt.figure(figsize=(20, 20), facecolor=None)
plt.imshow(Cloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

In [None]:
# count version
count_d = {}
for a, x in words_freq:
    count_d[a] = x
    
Cloud_count = WordCloud(width=1000, height=600, background_color="white", max_words=75).generate_from_frequencies(count_d)


# Plotting the WordCloud                    
plt.figure(figsize = (20, 20), facecolor = None) 
plt.imshow(Cloud_count) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show()

In [7]:
def tfidf_generator(base_dir, encoding=None):
    """Compute per-term IDF weights for the documents stored in a directory.

    Despite the name, only the inverse-document-frequency part is returned:
    the function fits a ``TfidfTransformer`` on the corpus word counts and
    reports its ``idf_`` vector, not per-document TF-IDF scores.

    Parameters
    ----------
    base_dir : str or path-like
        Directory containing one document per file. Entries whose name starts
        with ``'.'`` and entries that are not regular files are skipped; no
        file-extension check is performed. A trailing path separator is
        optional.
    encoding : str, optional
        Text encoding passed to ``open``. The default ``None`` keeps the
        platform's default encoding.

    Returns
    -------
    pandas.DataFrame
        Indexed by vocabulary term, with a single ``idf_weights`` column,
        sorted by ascending IDF weight.

    Raises
    ------
    ValueError
        If ``base_dir`` contains no readable documents.
    """
    all_docs = []
    for doc in os.listdir(base_dir):
        # os.path.join works whether or not base_dir ends with a separator.
        doc_path = os.path.join(base_dir, doc)
        if doc.startswith('.') or not os.path.isfile(doc_path):
            continue
        with open(doc_path, "r", encoding=encoding) as file:
            all_docs.append(file.read())

    if not all_docs:
        # Fail early with a message that names the directory instead of
        # letting the vectorizer complain about an empty vocabulary.
        raise ValueError("no readable documents found in {!r}".format(base_dir))

    # Word counts per document, ignoring English stop words.
    cv = CountVectorizer(stop_words='english')
    word_count_vector = cv.fit_transform(all_docs)

    # Learn the IDF weights from the count matrix.
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count_vector)

    # CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0
    # and removed in 1.2. Prefer get_feature_names_out() and fall back to the
    # old method only on versions that predate it.
    if hasattr(cv, "get_feature_names_out"):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()

    df_idf = pd.DataFrame(tfidf_transformer.idf_, index=feature_names, columns=["idf_weights"])

    # Ascending order: lowest-IDF terms come first.
    sorted_tfidf = df_idf.sort_values(by='idf_weights')

    return sorted_tfidf

In [8]:
# IDF tables for the three news-source splits, computed in the order
# left-leaning, center, right-leaning.
left_lean, center, right_lean = map(
    tfidf_generator,
    (
        "../news_source_splits/left_leaning/",
        "../news_source_splits/center/",
        "../news_source_splits/right_leaning/",
    ),
)

In [9]:
# tfidf_generator returns its rows sorted by ascending idf_weights, so this
# previews the 30 lowest-IDF terms of the left-leaning split.
left_lean.head(30)

Unnamed: 0,idf_weights
said,1.209993
coronavirus,1.257701
people,1.400588
new,1.461218
pandemic,1.483609
time,1.49342
covid,1.499422
19,1.509285
health,1.676122
just,1.707378
