# Latent Dirichlet Allocation

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from google.colab import drive

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Mount Google Drive at /content/gdrive; the CSV path used later in this
# notebook lives under that mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Load the news-article dataset.
# The file's curly quotes/apostrophes are multi-byte UTF-8 sequences; decoding
# them as Latin-1 turns each one into several junk characters that then leak
# into the tokens. Decode as UTF-8 and substitute U+FFFD for any undecodable
# byte so a stray byte cannot abort the load.
data = pd.read_csv(
    "/content/gdrive/MyDrive/DSM COURSE/NOTEBOOK/NLP/news-article-categories.csv",
    encoding="utf-8",
    encoding_errors="replace",
)
print(data.head())

         category                                              title  \
0  ARTS & CULTURE  Modeling Agencies Enabled Sexual Predators For...   
1  ARTS & CULTURE  Actor Jeff Hiller Talks âBright Colors And B...   
2  ARTS & CULTURE  New Yorker Cover Puts Trump 'In The Hole' Afte...   
3  ARTS & CULTURE  Man Surprises Girlfriend By Drawing Them In Di...   
4  ARTS & CULTURE  This Artist Gives Renaissance-Style Sculptures...   

                                                body  
0  In October 2017, Carolyn Kramer received a dis...  
1  This week I talked with actor Jeff Hiller abou...  
2  The New Yorker is taking on President Donald T...  
3  Kellen Hickey, a 26-year-old who lives in Huds...  
4  Thereâs something about combining the tradit...  


In [None]:
def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmas.

    The text is lowercased, stripped of ASCII punctuation, tokenized with
    NLTK, filtered against NLTK's English stop-word list, and each remaining
    token is lemmatized with WordNet.

    Args:
        text: The raw document. Any value that is not a ``str`` is treated
            as an empty document.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    # Guard clause: only real strings are processed.
    if not isinstance(text, str):
        return ""

    # Lowercase and drop every character listed in string.punctuation.
    punctuation_table = str.maketrans('', '', string.punctuation)
    lowered = text.lower().translate(punctuation_table)

    raw_tokens = nltk.word_tokenize(lowered)
    english_stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # Skip stop words and lemmatize what is left in a single pass.
    lemmas = []
    for token in raw_tokens:
        if token in english_stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    return ' '.join(lemmas)

# Clean every article body and keep the result in a new 'Article' column.
data['Article'] = data['body'].map(preprocess_text)

In [None]:
# LDA is a generative model over word counts, so give it a raw term-count
# document-term matrix rather than TF-IDF weights (scikit-learn's own topic
# extraction example uses CountVectorizer for LDA).
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
x = vectorizer.fit_transform(data['Article'].values)

In [None]:
# Fit a 2-topic LDA model on the document-term matrix, then get the
# per-document topic scores from the fitted model.
lda = LatentDirichletAllocation(n_components=2, random_state=42)
topic_modelling = lda.fit(x).transform(x)

# Label each document with the index of its highest-scoring topic.
topic_labels = topic_modelling.argmax(axis=1)
data['topic_labels'] = topic_labels

In [None]:
# Show which topic ids were actually assigned.
data['topic_labels'].unique()

array([0, 1])

In [None]:
# Preview the first five rows, including the new topic_labels column.
data.head(n=5)

Unnamed: 0,text,cleaned_text,topic_labels
0,I love pizza and pasta.,love pizza pasta.,0
1,Pasta is an Italian dish loved by many.,pasta italian dish loved many.,0
2,Basketball and football are popular sports.,basketball football popular sports.,1
3,I enjoy watching sports like basketball.,enjoy watching sports like basketball.,1
4,Pizza and sports make for a great party!,pizza sports make great party!,0


# Latent Semantic Analysis

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from google.colab import drive

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
# Load the news-article dataset.
# The file's curly quotes/apostrophes are multi-byte UTF-8 sequences; decoding
# them as Latin-1 turns each one into several junk characters that then leak
# into the tokens. Decode as UTF-8 and substitute U+FFFD for any undecodable
# byte so a stray byte cannot abort the load.
data = pd.read_csv(
    "/content/gdrive/MyDrive/DSM COURSE/NOTEBOOK/NLP/news-article-categories.csv",
    encoding="utf-8",
    encoding_errors="replace",
)
print(data.head())

         category                                              title  \
0  ARTS & CULTURE  Modeling Agencies Enabled Sexual Predators For...   
1  ARTS & CULTURE  Actor Jeff Hiller Talks âBright Colors And B...   
2  ARTS & CULTURE  New Yorker Cover Puts Trump 'In The Hole' Afte...   
3  ARTS & CULTURE  Man Surprises Girlfriend By Drawing Them In Di...   
4  ARTS & CULTURE  This Artist Gives Renaissance-Style Sculptures...   

                                                body  
0  In October 2017, Carolyn Kramer received a dis...  
1  This week I talked with actor Jeff Hiller abou...  
2  The New Yorker is taking on President Donald T...  
3  Kellen Hickey, a 26-year-old who lives in Huds...  
4  Thereâs something about combining the tradit...  


In [None]:
def preprocess_text(text):
    """Clean one document for vectorization.

    The text is lowercased, stripped of ASCII punctuation, tokenized with
    NLTK, filtered against NLTK's English stop-word list, and each remaining
    token is lemmatized with WordNet.

    Args:
        text: The raw document. Any value that is not a ``str`` is treated
            as an empty document.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    # Check if input is not a string, return an empty string
    if not isinstance(text, str):
        return ""

    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize tokens
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word) for word in tokens]
    # Join tokens to form preprocessed text
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Clean every article body and keep the result in a new 'Article' column.
data['Article'] = data['body'].map(preprocess_text)

In [None]:
# Build the TF-IDF document-term matrix that the SVD step decomposes.
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(data['Article'].to_numpy())

In [None]:
# Fit a truncated SVD with 2 components (2 topics), then project the
# documents onto those components.
lsa = TruncatedSVD(n_components=2, random_state=42)
topic_modelling = lsa.fit(x).transform(x)

# Label each document with the index of its largest component score.
# NOTE(review): SVD scores can be negative, so a raw argmax may favor one
# component for nearly every document -- confirm this is the intended rule.
topic_labels = topic_modelling.argmax(axis=1)
data['topic_labels'] = topic_labels

In [None]:
# Preview the first five rows, including the new topic_labels column.
data.head(n=5)

Unnamed: 0,category,title,body,Article,topic_labels
0,ARTS & CULTURE,Modeling Agencies Enabled Sexual Predators For...,"In October 2017, Carolyn Kramer received a dis...",october 2017 carolyn kramer received disturbin...,0
1,ARTS & CULTURE,Actor Jeff Hiller Talks âBright Colors And B...,This week I talked with actor Jeff Hiller abou...,week talked actor jeff hiller hit broadway pla...,0
2,ARTS & CULTURE,New Yorker Cover Puts Trump 'In The Hole' Afte...,The New Yorker is taking on President Donald T...,new yorker taking president donald trump asked...,0
3,ARTS & CULTURE,Man Surprises Girlfriend By Drawing Them In Di...,"Kellen Hickey, a 26-year-old who lives in Huds...",kellen hickey 26yearold life hudson wisconsin ...,0
4,ARTS & CULTURE,This Artist Gives Renaissance-Style Sculptures...,Thereâs something about combining the tradit...,thereâs something combining traditional upti...,0
