<a href="https://colab.research.google.com/github/atikhasan007/Natural-Language-Processing./blob/main/%20Latent%20Semantic%20Analysis%20LSA%2C%20Truncated%20SVD%20Using%20Sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# TruncatedSVD (LSA)
# TruncatedSVD reduces high-dimensional TF-IDF vectors to low-dimensional semantic concepts.

# Useful for: Clustering, visualization, topic modeling, and more.

In [2]:
#imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD #Singular Value Decomposition (SVD)
# TruncatedSVD is a dimensionality reduction technique

In [3]:
# Toy corpus: ten short sentences touching on machine learning, NLP and
# AI/technology. Each list entry is treated as one document downstream.
documents = [
    "Machine learning is amazing",                                      # ML
    "Deep learning and neural networks are part of machine learning",  # ML
    "Natural language processing uses machine learning techniques",    # NLP + ML
    "AI includes machine learning and deep learning",                  # AI + ML
    "Language models are used in NLP",                                  # NLP
    "NLP stands for natural language processing",                       # NLP
    "Transformers are powerful models for NLP",                         # NLP
    "Deep learning is a subfield of machine learning",                  # ML
    "AI is the future of technology",                                   # AI / tech
    "Technology is evolving with AI and ML",                            # AI / tech
]

# Echo the corpus as the cell output.
documents


['Machine learning is amazing',
 'Deep learning and neural networks are part of machine learning',
 'Natural language processing uses machine learning techniques',
 'AI includes machine learning and deep learning',
 'Language models are used in NLP',
 'NLP stands for natural language processing',
 'Transformers are powerful models for NLP',
 'Deep learning is a subfield of machine learning',
 'AI is the future of technology',
 'Technology is evolving with AI and ML']

In [4]:
# Vectorization: turn the raw sentences into a sparse TF-IDF document-term
# matrix (one row per document, one column per vocabulary term).
# English stop words are dropped before the vocabulary is built.
vectorizer = TfidfVectorizer(
    stop_words="english",
)
x_tfidf = vectorizer.fit_transform(documents)

# Show the sparse-matrix summary (dtype, stored elements, shape).
x_tfidf


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 44 stored elements and shape (10, 24)>

In [6]:
# Apply TruncatedSVD (LSA): project the TF-IDF matrix onto 3 latent
# topics / components.
# TruncatedSVD uses a randomized solver by default, so random_state is fixed
# to make the decomposition reproducible across kernel restarts.
svd = TruncatedSVD(n_components=3, random_state=42)

# x_lsa has one row per document and one column per latent component.
x_lsa = svd.fit_transform(x_tfidf)
x_lsa


array([[ 0.65956933, -0.09041784, -0.12827574],
       [ 0.78214146, -0.13828809, -0.1275628 ],
       [ 0.50239136,  0.45905735, -0.05369877],
       [ 0.82050161, -0.16293862,  0.1350175 ],
       [ 0.08589637,  0.75004832,  0.09361561],
       [ 0.16308372,  0.75719271,  0.04364809],
       [ 0.03617842,  0.57171849,  0.09360164],
       [ 0.83275234, -0.14031668, -0.12579432],
       [ 0.11718935, -0.09569476,  0.84053105],
       [ 0.10557946, -0.09158977,  0.83617547]])

In [14]:
# Display results: for every latent component, print its five terms with the
# largest (signed) loading. Ranking uses the raw weight, so terms with large
# negative loadings are not listed.
terms = vectorizer.get_feature_names_out()

for topic_no, loadings in enumerate(svd.components_, start=1):
    # Pair each vocabulary term with its loading, highest weight first.
    ranked = sorted(zip(terms, loadings), key=lambda pair: pair[1], reverse=True)

    print(f"\nTopic {topic_no}")
    for term, weight in ranked[:5]:
        print(f"{term} ({weight:.4f})")





Topic 1
learning (0.7055)
machine (0.4330)
deep (0.3413)
amazing (0.1853)
subfield (0.1677)

Topic 2
nlp (0.4904)
language (0.4465)
models (0.3670)
natural (0.2955)
processing (0.2955)

Topic 3
technology (0.5796)
ai (0.5412)
future (0.3727)
evolving (0.3091)
ml (0.3091)
