In [1]:
# !pip install openpyxl
# !pip install gensim
# !pip install scikit-learn

In [2]:
import os
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from collections import defaultdict
from gensim.corpora import dictionary 
from nltk.stem import WordNetLemmatizer
from gensim.models.ldamodel import LdaModel
from sklearn.cluster import MiniBatchKMeans
from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

Task 1

In [3]:
df = pd.read_excel("NewsCategorizer.xlsx", usecols=["category", "short_description"])
df.head()

Unnamed: 0,category,short_description
0,WELLNESS,Resting is part of training. I've confirmed wh...
1,WELLNESS,Think of talking to yourself as a tool to coac...
2,WELLNESS,The clock is ticking for the United States to ...
3,WELLNESS,"If you want to be busy, keep trying to be perf..."
4,WELLNESS,"First, the bad news: Soda bread, corned beef a..."


In [4]:
df["category"].unique()

array(['WELLNESS', 'POLITICS', 'ENTERTAINMENT', 'TRAVEL',
       'STYLE & BEAUTY', 'PARENTING', 'FOOD & DRINK', 'WORLD NEWS',
       'BUSINESS', 'SPORTS'], dtype=object)

In [5]:
len(df)

50000

In [6]:
df.isna().count()

category             50000
short_description    50000
dtype: int64

Task 2

In [7]:
copy_df = df.copy()

In [8]:
# Applying on basic preprocessing:
# 1. cleaning: extracting alphabets and 
# splitting into word list for each row;
# 2. to lowercase; 3. removing stopwords;
# and 4. Lemmatization
def clean_text(sentence: str) -> list:
    """cleaning text by extracting alphabets and
    then splitting into word list for each sentence

    Args:
        sentence (str): sentence string

    Returns:
        list: list of words.
    """
    pattern = re.compile(r"[A-Za-z]+")
    return re.findall(pattern=pattern, string=sentence)

def to_lowercase(word_list: list) -> list:
    """case changing of all contents in each list
    of words

    Args:
        word_list (list): list of words with alphabets only texts

    Returns:
        word_list (list): list of words with lowercase transformation
    """
    temp_list = []
    for i, word in enumerate(word_list):
        temp_list.append(word.lower())
    word_list = temp_list[:]
    return word_list

def remove_stopwords(list_of_words: list) -> list:
    """removing stop words from list of words by matching
    English stop words and extracting those out 
    
    Args:
        list_of_words (list): list of words with stop words

    Returns:
        list: stop word free list of words
    """
    stopword_list = stopwords.words("english")

    return [word for word in list_of_words if word not in stopword_list]

def extract_word_lemma(list_of_words: list) -> list:
    """lemmatization of words from list of words
    using NLTK WordNetLemmatizer class

    Args:
        list_of_list_words (list): list of words inside a list

    Returns:
        list: lemmatized list of list of words
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in list_of_words]

In [9]:
copy_df["clean_text"] = copy_df["short_description"].apply(lambda text: clean_text(text))
copy_df["clean_text"] = copy_df["clean_text"].apply(lambda word_list: to_lowercase(word_list))
copy_df["clean_text"] = copy_df["clean_text"].apply(lambda word_list: remove_stopwords(word_list))
copy_df["clean_text"] = copy_df["clean_text"].apply(lambda word_list: remove_stopwords(word_list))

In [10]:
copy_df.head()

Unnamed: 0,category,short_description,clean_text
0,WELLNESS,Resting is part of training. I've confirmed wh...,"[resting, part, training, confirmed, sort, alr..."
1,WELLNESS,Think of talking to yourself as a tool to coac...,"[think, talking, tool, coach, challenge, narra..."
2,WELLNESS,The clock is ticking for the United States to ...,"[clock, ticking, united, states, find, cure, t..."
3,WELLNESS,"If you want to be busy, keep trying to be perf...","[want, busy, keep, trying, perfect, want, happ..."
4,WELLNESS,"First, the bad news: Soda bread, corned beef a...","[first, bad, news, soda, bread, corned, beef, ..."


In [11]:
df = copy_df.copy()

Task 3

In [12]:
copy_df = df.copy()

In [13]:
# getting dictionary of indexes with corresponding
# word from "clean_text" column
word_dict = dictionary.Dictionary(copy_df["clean_text"])
for key, value in (word_dict.iteritems()):
    print("Key: ", key)
    print("Value: ", value)
    print()
    if key>10:
        break

Key:  0
Value:  already

Key:  1
Value:  also

Key:  2
Value:  built

Key:  3
Value:  confirmed

Key:  4
Value:  cross

Key:  5
Value:  days

Key:  6
Value:  five

Key:  7
Value:  foam

Key:  8
Value:  hard

Key:  9
Value:  knew

Key:  10
Value:  lots

Key:  11
Value:  part



In [14]:
# creating document to bag of words
copy_df["doc2bow"] = copy_df["clean_text"].apply(lambda word_list: word_dict.doc2bow(word_list))

In [15]:
copy_df["doc2bow"][0]

[(0, 1),
 (1, 1),
 (2, 2),
 (3, 2),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 2),
 (22, 1),
 (23, 1)]

In [16]:
# By K=10 means same as total number of topics
# available in our dataset
# and iterations = 200 means the parameter
# passes need to be set 200 to increase the
# probability
K = 10
iterations = 200
lda_model = LdaModel(
    corpus=copy_df["doc2bow"],
    id2word=word_dict,
    num_topics=K,
    passes=iterations
)

In [17]:
for topic in lda_model.print_topics():
    print(topic)

(0, '0.015*"like" + 0.014*"one" + 0.014*"time" + 0.013*"get" + 0.010*"make" + 0.010*"us" + 0.010*"way" + 0.009*"know" + 0.008*"good" + 0.008*"life"')
(1, '0.013*"check" + 0.011*"want" + 0.011*"style" + 0.011*"team" + 0.010*"government" + 0.010*"sure" + 0.006*"fans" + 0.006*"win" + 0.006*"economy" + 0.006*"games"')
(2, '0.009*"women" + 0.007*"school" + 0.007*"health" + 0.007*"found" + 0.007*"study" + 0.006*"market" + 0.006*"high" + 0.006*"university" + 0.006*"new" + 0.006*"business"')
(3, '0.010*"world" + 0.010*"people" + 0.008*"social" + 0.008*"one" + 0.007*"media" + 0.007*"facebook" + 0.007*"obama" + 0.007*"child" + 0.006*"killed" + 0.005*"president"')
(4, '0.023*"year" + 0.015*"years" + 0.014*"first" + 0.013*"two" + 0.012*"new" + 0.012*"last" + 0.011*"one" + 0.010*"old" + 0.009*"time" + 0.008*"world"')
(5, '0.011*"former" + 0.008*"said" + 0.008*"war" + 0.006*"financial" + 0.006*"party" + 0.006*"bowl" + 0.006*"super" + 0.005*"violence" + 0.005*"military" + 0.005*"deal"')
(6, '0.013*"f

In [18]:
df = copy_df.copy()

Task 4

In [19]:
copy_df = df.copy()

In [20]:
def calculate_tfidf(corpus: list) -> list:
    """calculating tf-idf of each list from given
    list of word corpus.

    from the list of words, calculate
    tf-idf matrix and return that matrix.

    Args:
        list_of_list_words (list): list of words

    Returns:
        list: each doc based top words after tf-idf calculation
    """
    # min_df = 2 is set to avoid 1 single string
    tfidf_vectorizer = TfidfVectorizer(min_df=2)
    return tfidf_vectorizer.fit_transform(corpus)

In [21]:
copy_df["tfidf_corpus"] = [" ".join(word_list) for word_list in copy_df["clean_text"]]
copy_df["tfidf_corpus"][:10]

0    resting part training confirmed sort already k...
1    think talking tool coach challenge narrate exp...
2    clock ticking united states find cure team wor...
3    want busy keep trying perfect want happy focus...
4    first bad news soda bread corned beef beer hig...
5    carey moss youbeauty com love rom coms love so...
6         nation general scored scale little bit score
7    also worth remembering water seaweed comes con...
8    look culture eating behavior certainly looks l...
9    fran ois marie arouet th century french author...
Name: tfidf_corpus, dtype: object

In [22]:
try:
    tfidf_matrix = calculate_tfidf(copy_df["tfidf_corpus"])
except Exception as e:
    print(e)

In [23]:
tfidf_arr = tfidf_matrix.toarray()

In [24]:
type(tfidf_arr)

numpy.ndarray

In [25]:
# kmeans = KMeans(n_clusters=10, max_iter=2, n_init=1)
kmeans = MiniBatchKMeans(n_clusters=10, max_iter=100, batch_size=8000)
kmeans.fit(tfidf_arr)

  super()._check_params_vs_input(X, default_n_init=3)


In [26]:
cluster_labels = kmeans.labels_

In [27]:
for i, label in enumerate(cluster_labels):
    print(f"Document {i+1} is in cluster label {label+1}")
    if i>15:
        break

Document 1 is in cluster label 8
Document 2 is in cluster label 10
Document 3 is in cluster label 10
Document 4 is in cluster label 10
Document 5 is in cluster label 7
Document 6 is in cluster label 3
Document 7 is in cluster label 10
Document 8 is in cluster label 10
Document 9 is in cluster label 1
Document 10 is in cluster label 10
Document 11 is in cluster label 7
Document 12 is in cluster label 10
Document 13 is in cluster label 7
Document 14 is in cluster label 6
Document 15 is in cluster label 10
Document 16 is in cluster label 10
Document 17 is in cluster label 10


Task 5

In [28]:
categories = df["category"].to_list()

In [29]:
categories[:10]

['WELLNESS',
 'WELLNESS',
 'WELLNESS',
 'WELLNESS',
 'WELLNESS',
 'WELLNESS',
 'WELLNESS',
 'WELLNESS',
 'WELLNESS',
 'WELLNESS']

In [30]:
ari_score_knn = adjusted_rand_score(categories, cluster_labels)
nmi_score_knn = normalized_mutual_info_score(categories, cluster_labels)

In [31]:
print(f"ARI Score: {ari_score_knn}")
print(f"NMI Score: {nmi_score_knn}")

ARI Score: 0.0029680657766448597
NMI Score: 0.01670519424450223


In [32]:
# calculating choerence score of 
# trained lda model
coherence_model_lda = CoherenceModel(
                                    model=lda_model,
                                    texts=copy_df["clean_text"],
                                    dictionary=word_dict,
                                    coherence="c_v")
coherence_score = coherence_model_lda.get_coherence()
coherence_score

0.35403221816380914

In [33]:
# now creating lda soft cluster
# for future reference constructing
# the list comprehension and commenting it
# lda_soft_clusters = max(lda_model[doc], key=lambda x: x[1])[0] for doc in copy_df["doc2bow"]
lda_categories = []
for i, doc in enumerate(copy_df["doc2bow"]):
    # generating topic with word proba.
    doc_topic = lda_model[doc]
    # finding out the dominant topic by inspecting
    # second element of the tuple which is probability
    # then if maximum proba. found, it is selected
    dominant_topic = max(doc_topic, key=lambda x: x[1])[0]
    lda_categories.append(dominant_topic)

In [34]:
# again calculating ari and nmi scores
ari_score_lda = adjusted_rand_score(categories, lda_categories)
nmi_score_lda = adjusted_rand_score(categories, lda_categories)

In [35]:
print(f"ARI Score: {ari_score_lda}")
print(f"NMI Score: {nmi_score_lda}")

ARI Score: 0.027586858783359265
NMI Score: 0.027586858783359265


Even thouh the coherence score is around: 0.3790 (approx), 
the LDA is not tuned in that way but it is still giving 
ARI and NMI score 0.0276 (approx) which is better 
than ARI score(0.0029) of KNN clustering (0.0167 approx) meaning, 
KNN is randomly assigning whereas LDA is finding very few but still some 
relations between the topics and the words to find the document to topics.