In [153]:
# Toy corpus: five one-sentence documents, bound to a1..a5 in order.
a1, a2, a3, a4, a5 = (
    "Linear algebra is a useful lesson.",
    "Linear algebra and mathematics is basis of programming.",
    "Linear algebra is the branch of mathematics.",
    "Programming Python is easy!",
    "Python is a high-level language.",
)

<h2>Convert list of documents to DataFrame</h2>

In [154]:
import pandas as pd
import numpy as np
# Build the frame in a single step: one "documents" column holding the five sentences.
df = pd.DataFrame({"documents": [a1, a2, a3, a4, a5]})

<h2>Preprocessing</h2>
<p>We’ll apply four preprocessing steps to the data:</p>
<ul>
    <li>Remove all special characters from the text.</li>
    <li>Remove all words with fewer than 3 letters.</li>
    <li>Lowercase all characters.</li>
    <li>Remove stop words.</li>
</ul>


In [155]:
# Step 1: replace every character that is not an ASCII letter or '#' with a space.
# regex=True must be explicit: pandas >= 2.0 otherwise treats the pattern as a
# literal string (so nothing is removed), and pandas 1.x emits a FutureWarning.
df['clean_documents'] = df['documents'].str.replace("[^a-zA-Z#]", " ", regex=True)

# Step 2: drop words that have fewer than 3 letters.
df['clean_documents'] = df['clean_documents'].fillna('').apply(
    lambda text: ' '.join(word for word in text.split() if len(word) > 2)
)

# Step 3: lowercase all characters.
df['clean_documents'] = df['clean_documents'].fillna('').apply(lambda text: text.lower())

  df['clean_documents'] = df['documents'].str.replace("[^a-zA-Z#]", " ")


<h3>Removing Stop Words</h3>

In [156]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop-word list from NLTK. The set copy is used for the membership
# tests below so each lookup does not scan the whole list.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)

# Tokenization: split each cleaned document on whitespace.
tokenized_doc = df['clean_documents'].fillna('').apply(lambda text: text.split())

# Remove stop words from every token list.
tokenized_doc = tokenized_doc.apply(
    lambda tokens: [token for token in tokens if token not in stop_word_set]
)

# De-tokenization: iterate over the values themselves rather than looking up
# tokenized_doc[i], which only works when the index is the default 0..n-1.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]
df['clean_documents'] = detokenized_doc

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [157]:
# Show the original sentences next to their cleaned versions.
df

Unnamed: 0,documents,clean_documents
0,Linear algebra is a useful lesson.,linear algebra useful lesson
1,Linear algebra and mathematics is basis of pro...,linear algebra mathematics basis programming
2,Linear algebra is the branch of mathematics.,linear algebra branch mathematics
3,Programming Python is easy!,programming python easy
4,Python is a high-level language.,python high level language


<h3>Document-Term matrix</h3>

In [158]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF document-term matrix (one row per document, one column per term).
vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True)
X = vectorizer.fit_transform(df['clean_documents'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases; either
# way `dictionary` ends up as a plain list of term strings.
if hasattr(vectorizer, "get_feature_names_out"):
    dictionary = vectorizer.get_feature_names_out().tolist()
else:
    dictionary = vectorizer.get_feature_names()

# toarray() yields a regular ndarray (todense() returns the discouraged np.matrix);
# float16 is used here only to shorten the printed values.
print(X.toarray().astype(np.float16))
dictionary

[[0.3936 0.     0.     0.     0.     0.     0.5874 0.     0.3936 0.
  0.     0.     0.5874]
 [0.3745 0.559  0.     0.     0.     0.     0.     0.     0.3745 0.4512
  0.4512 0.     0.    ]
 [0.4197 0.     0.6265 0.     0.     0.     0.     0.     0.4197 0.5054
  0.     0.     0.    ]
 [0.     0.     0.     0.659  0.     0.     0.     0.     0.     0.
  0.5317 0.5317 0.    ]
 [0.     0.     0.     0.     0.5234 0.5234 0.     0.5234 0.     0.
  0.     0.4224 0.    ]]




['algebra',
 'basis',
 'branch',
 'easy',
 'high',
 'language',
 'lesson',
 'level',
 'linear',
 'mathematics',
 'programming',
 'python',
 'useful']

<h3>Singular Value Decomposition</h3>

In [159]:
from sklearn.decomposition import TruncatedSVD

# Truncated SVD maps the documents (rows of X) into a 2-component space.
# random_state is fixed for the randomized solver.
svd_model = TruncatedSVD(
    n_components=2,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
)
lsa = svd_model.fit_transform(X)



<h3>Check the Output</h3>

In [160]:
# Render floats with two decimals (and thousands separators) from here on.
pd.set_option("display.float_format", '{:,.2f}'.format)

# One row per document: its two LSA coordinates plus the original sentence.
topic_columns = ["topic_1", "topic_2"]
topic_encoded_df = pd.DataFrame(lsa, columns=topic_columns)
topic_encoded_df["documents"] = df['documents']
topic_encoded_df[["documents"] + topic_columns]



Unnamed: 0,documents,topic_1,topic_2
0,Linear algebra is a useful lesson.,0.63,-0.22
1,Linear algebra and mathematics is basis of pro...,0.84,0.05
2,Linear algebra is the branch of mathematics.,0.81,-0.2
3,Programming Python is easy!,0.27,0.77
4,Python is a high-level language.,0.07,0.74
