## Import library

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
dataset = pd.read_csv('./dataset/articles.csv', encoding='latin1')
dataset.head()

Unnamed: 0,Article,Title
0,Data analysis is the process of inspecting and...,Best Books to Learn Data Analysis
1,The performance of a machine learning algorith...,Assumptions of Machine Learning Algorithms
2,You must have seen the news divided into categ...,News Classification with Machine Learning
3,When there are only two classes in a classific...,Multiclass Classification Algorithms in Machin...
4,The Multinomial Naive Bayes is one of the vari...,Multinomial Naive Bayes in Machine Learning


## Data cleaning

In [6]:
dataset.isnull().sum()

Article    0
Title      0
dtype: int64

In [8]:
dataset.duplicated().sum()

1

In [9]:
dataset.drop_duplicates(inplace=True)

## Data preprocessing

In [11]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [13]:
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

dataset['Article'] = dataset['Article'].apply(preprocess_text)

## Data transformation

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [15]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(dataset['Article'].values)

## Topic Modeling

In [17]:
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X)

topic_modelling = lda.transform(X)

topic_labels = np.argmax(topic_modelling, axis=1)
dataset['topic_labels'] = topic_labels

In [18]:
dataset.head(10)

Unnamed: 0,Article,Title,topic_labels
0,data analysis process inspecting exploring dat...,Best Books to Learn Data Analysis,2
1,performance machine learning algorithm particu...,Assumptions of Machine Learning Algorithms,3
2,must seen news divided category go news websit...,News Classification with Machine Learning,1
3,two class classification problem problem binar...,Multiclass Classification Algorithms in Machin...,3
4,multinomial naive bayes one variant naive baye...,Multinomial Naive Bayes in Machine Learning,1
6,natural language processing nlp subfield artif...,Best Books to Learn NLP,1
7,using thirdparty application api manage functi...,Send Instagram Messages using Python,3
8,twitter one popular social medium apps people ...,Pfizer Vaccine Sentiment Analysis using Python,0
9,squid game currently one trending show netflix...,Squid Game Sentiment Analysis using Python,1
10,computer vision one field artificial intellige...,Best Books to Learn Computer Vision,4
