## Practical 5: Implement LSA and Topic Model

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.decomposition import TruncatedSVD
# If nltk stop word is not downloaded
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADITI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## List of Sentences

In [13]:
# Toy corpus: five short sentences, each treated as one document.
sentences = [
    "I made cheese at home.",
    "I like home made cheese.",
    "Cheese made at home is tasty.",
    "I like cheese that is salty.",
    "I like cheese made at home.",
]
a1, a2, a3, a4, a5 = sentences

# One row per document, held in a single "documents" column.
df = pd.DataFrame({"documents": sentences})
df.head()


Unnamed: 0,documents
0,I made cheese at home.
1,I like home made cheese.
2,Cheese made at home is tasty.
3,I like cheese that is salty.
4,I like cheese made at home.


## Text Preprocessing

In [14]:
# Text clean-up; each step reads the result of the previous one.

# 1. Replace every character that is not a letter or '#' with a space.
#    regex=True must be explicit: pandas warned that the default would change,
#    and since pandas 2.0 str.replace treats the pattern as a literal string,
#    which would silently leave all punctuation in place.
df['clean_documents'] = df['documents'].str.replace("[^a-zA-Z#]", " ", regex=True)

# 2. Drop tokens of one or two characters (e.g. "I", "at", "is").
df['clean_documents'] = df['clean_documents'].fillna('').apply(
    lambda x: ' '.join([w for w in x.split() if len(w) > 2])
)

# 3. Lower-case everything so "Cheese" and "cheese" become the same term.
df['clean_documents'] = df['clean_documents'].fillna('').apply(lambda x: x.lower())

df.head()

  df['clean_documents'] = df['documents'].str.replace("[^a-zA-Z#]", " ")


Unnamed: 0,documents,clean_documents
0,I made cheese at home.,made cheese home
1,I like home made cheese.,like home made cheese
2,Cheese made at home is tasty.,cheese made home tasty
3,I like cheese that is salty.,like cheese that salty
4,I like cheese made at home.,like cheese made home


## Removing StopWords

In [15]:
# Load NLTK's English stop-word list; nltk.download is a no-op when the
# corpus is already present locally.
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADITI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Tokenizing the Sentences

In [16]:
# Split each cleaned document into a list of word tokens.
tokenized_doc = df['clean_documents'].fillna('').apply(lambda x: x.split())

# Remove every token that appears in the stop_words list.
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

# De-tokenization: re-join each token list into one space-separated string.
# Iterating over the Series walks the rows in positional order, so this works
# for any index; the previous tokenized_doc[i] lookup was label-based and only
# worked while df kept its default 0..n-1 index.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

# Overwrite the column with the stop-word-free text (list assignment is positional).
df['clean_documents'] = detokenized_doc


In [17]:
# Inspect the frame again: clean_documents was overwritten by the stop-word removal step.
df.head()

Unnamed: 0,documents,clean_documents
0,I made cheese at home.,made cheese home
1,I like home made cheese.,like home made cheese
2,Cheese made at home is tasty.,cheese made home tasty
3,I like cheese that is salty.,like cheese salty
4,I like cheese made at home.,like cheese made home


## TF - IDF Vector

In [18]:
# TF-IDF vector
# Build the document-term matrix X: one row per document, one column per term.
# NOTE: stop_words='english' applies scikit-learn's built-in English stop-word
# list on top of the NLTK filtering done earlier. The vocabulary printed further
# down has no 'made', although every cleaned document but one contains it --
# presumably removed by this second list; confirm if that is intended.
vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True)
X = vectorizer.fit_transform(df['clean_documents'])
# fit_transform returns a sparse matrix; densify it only for display.
X.toarray()


array([[0.64578193, 0.7635219 , 0.        , 0.        , 0.        ],
       [0.47818893, 0.56537308, 0.67207785, 0.        , 0.        ],
       [0.38342448, 0.45333103, 0.        , 0.        , 0.80465933],
       [0.36811741, 0.        , 0.51737618, 0.77253573, 0.        ],
       [0.47818893, 0.56537308, 0.67207785, 0.        , 0.        ]])

In [19]:
# X is the TF-IDF matrix that the SVD step factorizes: 5 documents x 5 terms.
# NOTE(review): the original note here read "A56 U(5,5). S()" -- presumably a
# reminder of the decomposition A = U * S * V^T; confirm with the author.
X.shape 

(5, 5)

In [20]:
# SVD represents documents and terms as vectors in a shared low-dimensional space.
# n_components=2 keeps two latent dimensions (named topic_1 / topic_2 later on);
# random_state is fixed so the randomized solver gives the same result on re-run.
svd_model = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=100, random_state=122)
# fit_transform both fits the model on X and returns each document's
# coordinates on the two components.
lsa = svd_model.fit_transform(X)


In [21]:
# Document-topic matrix: one row per document, one column per SVD component.
# Show floats at full precision for this and all later DataFrame displays.
pd.options.display.float_format = '{:,.16f}'.format

topic_columns = ["topic_1", "topic_2"]
topic_encoded_df = pd.DataFrame(lsa, columns=topic_columns)
topic_encoded_df["documents"] = df['clean_documents']

# Put the document text first, followed by its weight on each topic.
display(topic_encoded_df[["documents"] + topic_columns])


Unnamed: 0,documents,topic_1,topic_2
0,made cheese home,0.8494487055384483,-0.3259325837951392
1,like home made cheese,0.9539070278709918,0.1308116933912606
2,cheese made home tasty,0.6292275241977919,-0.5861192937481611
3,like cheese salty,0.5701597701726294,0.6947193346526543
4,like cheese made home,0.9539070278709918,0.1308116933912606


In [22]:
# Features or words used as features (the vocabulary learned by the vectorizer).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    # .tolist() keeps `dictionary` a plain Python list of str, as before.
    dictionary = vectorizer.get_feature_names_out().tolist()
else:
    dictionary = vectorizer.get_feature_names()

In [23]:
# Show the vocabulary terms returned by the fitted TF-IDF vectorizer.
dictionary

['cheese', 'home', 'like', 'salty', 'tasty']

In [24]:
# Term-topic matrix: rows are vocabulary terms, columns are the two topics.
# components_ holds one row per topic and one column per term, so transpose
# the array up front rather than transposing the finished DataFrame.
encoding_matrix = pd.DataFrame(
    svd_model.components_.T,
    index=dictionary,
    columns=["topic_1", "topic_2"],
)


In [25]:
# Show each term's weight on topic_1 and topic_2.
encoding_matrix

Unnamed: 0,topic_1,topic_2
cheese,0.586063017024338,-0.0562473562207643
home,0.6168517185743644,-0.379306810706278
like,0.4834363012966142,0.5537430136607275
salty,0.1350116704597163,0.555225477239354
tasty,0.1551943581336322,-0.4879097483308507
