TF-IDF_LSA to all texts together. I found that the n-gram configuration 2,3 gave the most interesting results in the topics.

In [1]:
import os
import nltk
import pandas as pd

from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zipa669\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Zipa669\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Zipa669\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
# Function originally from: https://www.programcreek.com/python/?CodeExample=get%20wordnet%20pos
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def load_text_documents(folder_path):
    document_texts = []
    document_labels = []

    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".txt"):
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    text = f.read()
                text_without_caps = " ".join([word for word in text.split() if not word.isupper()])
                lemmatizer = WordNetLemmatizer()
                # Apply lemmatizer to each word in the nursery rhyme
                lemmitized_text = " ".join([lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in text.split()])
                document_texts.append(lemmitized_text)
                document_labels.append(os.path.basename(file[:-4]))
    
    return document_texts, document_labels

In [1]:
import os
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_and_save_to_csv(input_file_path, output_folder_path):
    with open(input_file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    text_without_caps = " ".join([word for word in text.split() if not word.isupper()])
    lemmatizer = WordNetLemmatizer()
    lemmitized_text = " ".join([lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in text_without_caps.split()])

    
    df = pd.DataFrame({'Lemmatized_Text': [lemmitized_text]})

    
    base_name = os.path.basename(input_file_path)
    file_name_without_extension = os.path.splitext(base_name)[0]

   
    output_csv_path = os.path.join(output_folder_path, f"{file_name_without_extension}.csv")

  
    df.to_csv(output_csv_path, index=False)


input_folder_path = '../Examples_from_class/cleaned_texts_'
output_folder_path = '../Examples_from_class/csv_files/'

for root, _, files in os.walk(input_folder_path):
    for file in files:
        if file.endswith(".txt"):
            input_file_path = os.path.join(root, file)
            lemmatize_and_save_to_csv(input_file_path, output_folder_path)


KeyboardInterrupt: 

In [5]:
folder_path = "../Examples_from_class/cleaned_texts_"
document_texts, document_labels = load_text_documents(folder_path)
print(f'loaded {len(document_labels)} documents')

loaded 8 documents


In [6]:
print(f'The first document is {document_labels[0]}, which goes:')
print(document_texts[0])


The first document is 1. King_James_bible, which goes:
book genesis in the begin god create the heaven and the earth and the earth be without form and void and darkness be upon the face of the deep and the spirit of god move upon the face of the water and god say let there be light and there be light and god saw the light that it be good and god divide the light from the darkness and god call the light day and the darkness he call night and the even and the morning be the first day and god say let there be firmament in the midst of the water and let it divide the water from the water and god make the firmament and divide the water which be under the firmament from the water which be above the firmament and it be so and god call the firmament heaven and the even and the morning be the second day and god say let the water under the heaven be gather together unto one place and let the dry land appear and it be so and god call the dry land earth and the gathering together of the water call

In [7]:
for i in range(8):  
    token_len = len(document_texts[i])
    print(f"Length of document {i}: {token_len} tokens")


Length of document 0: 3857032 tokens
Length of document 1: 779064 tokens
Length of document 2: 105294 tokens
Length of document 3: 3184557 tokens
Length of document 4: 3977590 tokens
Length of document 5: 2926839 tokens
Length of document 6: 585909 tokens
Length of document 7: 933684 tokens


Now lets use the `TfidfVectorizer` class to get our TF-IDF features for each document:

<a id='vectorizer'></a>

In [50]:



english_stop_words = stopwords.words('english')
religious_stop_words = ["hath", "ye", "thy", "lo", "thou", "thus", "name"]
stop_words = english_stop_words + religious_stop_words

In [61]:
vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=(2,3))
tf_idf = vectorizer.fit_transform(document_texts)
vocab = vectorizer.get_feature_names_out()
print(f'Our bag of words is a matrix of the shape and size {tf_idf.shape}')

Our bag of words is a matrix of the shape and size (8, 1789096)


In [67]:
tfidf_df = pd.DataFrame(tf_idf.toarray(), columns=vocab, index=document_labels)
tfidf_df

Unnamed: 0,aa aeb,aa aeb erat,aa conateats,aa conateats page,...,ẓá call,ẓá call unto,馮家福 feng,馮家福 feng jia
1. King_James_bible,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0
2. English-Quran-plain-text,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0
3. Bhagavad_Gita,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0
4. Tipitaka,0.000359,0.000359,0.000359,0.000359,...,0.0,0.0,0.0,0.0
5. Complete Jewish Bible,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0
6. Writings_of_Bahaullah,0.0,0.0,0.0,0.0,...,0.000586,0.000586,0.0,0.0
8. tao_te_ching,0.0,0.0,0.0,0.0,...,0.0,0.0,0.002058,0.002058
9. Kojiki_Japan_,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0


In [68]:
single_row_df = tfidf_df.iloc[0]
single_row_df = single_row_df.replace(0.0,None)
single_row_df = single_row_df.dropna()
single_row_df

aaron also                  0.000686
aaron also christ           0.000343
aaron also time             0.000343
aaron among                 0.000686
aaron among priest          0.000343
                              ...   
zurishaddai prince child    0.000343
zurishaddai sixth           0.000343
zurishaddai sixth day       0.000343
zuzims ham                  0.000343
zuzims ham emins            0.000343
Name: 1. King_James_bible, Length: 467014, dtype: object

In [69]:
tfidf_df = tfidf_df - tfidf_df.mean()

In [70]:
num_topics = 8
pd.options.display.max_columns=num_topics #Make sure we display them all
labels = ['topic{}'.format(i) for i in range(num_topics)] 

In [71]:
svd = TruncatedSVD(n_components = num_topics, n_iter = 500) #You can change n_iter: Higher numbers will take longer but may (or may not) give you better results
svd_topic_vectors = svd.fit_transform(tfidf_df.values)

In [72]:
topic_weights = pd.DataFrame(svd.components_.T, index=vocab, columns=labels)
topic_weights.sample(8)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7
together valuable article,-2.6e-05,-7.6e-05,-1e-05,9.6e-05,0.000212,2e-06,-3.4e-05,-5.476304e-09
become spacious darkness,-0.0001,-6.9e-05,-7.1e-05,0.000225,-0.0002,-3.1e-05,9e-06,6.751149e-08
see exposition,-0.000742,-0.000896,0.001854,-0.000751,-0.000518,0.000222,4.8e-05,-1.010798e-07
take vi im,-0.000129,-0.00038,-4.9e-05,0.00048,0.001062,1.2e-05,-0.000172,-5.634677e-08
jesus answer saith,0.000336,-1e-05,1e-05,5e-06,8e-06,-0.000159,0.000586,-6.990364e-09
po jaring taiping,-0.0001,-6.9e-05,-7.1e-05,0.000225,-0.0002,-3.1e-05,9e-06,6.751149e-08
explicitly make mention,0.000138,3.1e-05,-3e-06,-2.2e-05,-6.4e-05,0.000539,-5.4e-05,-1.970989e-08
however still sacrifice,-2.6e-05,-7.6e-05,-1e-05,9.6e-05,0.000212,2e-06,-3.4e-05,-5.476304e-09


And the most relevent words for each topic:

In [73]:
num_terms = 60
for i in range(num_topics):
    print("___topic " + str(i) + "___")
    topicName = "topic" + str(i)
    weightedlist = topic_weights.get(topicName).sort_values()[-num_terms:]
    print(weightedlist.index.values)

___topic 0___


['lord host' 'shall eat' 'god lord' 'house lord' 'haply may' 'cry unto'
 'burnt offering' 'lord world' 'one another' 'beside allah'
 'believe good work' 'messenger allah' 'allah lord' 'allah forgive'
 'way allah' 'unto people' 'every one' 'unto moses' 'allah knoweth'
 'allah ever' 'allah save' 'word lord' 'every man' 'wrong doer'
 'saith lord god' 'painful doom' 'spake unto' 'keep duty' 'believe allah'
 'moses say' 'ward evil' 'give unto' 'forgive merciful' 'day resurrection'
 'reveal unto' 'turn away' 'unto god' 'lord say' 'duty allah' 'lord shall'
 'unto say' 'shall come' 'thee muhammad' 'bahá lláh' 'say allah' 'qur án'
 'allah messenger' 'say lord' 'heaven earth' 'come pas' 'allah allah'
 'sura al' 'saith lord' 'unto allah' 'child israel' 'unto lord'
 'come unto' 'lord god' 'unto thee' 'say unto']
___topic 1___
['equal heart' 'set freefrom' 'fn sanskrit' 'devilish womb'
 'strive thereto' 'inward breath' 'adhiyajna lord' 'longarmed lord'
 'life within' 'self friend' 'thousand yugas' 

In [None]:
svd_topic_vectors_df = pd.DataFrame(svd_topic_vectors, index=document_labels, columns=labels)
svd_topic_vectors_df.sample(8)

In [49]:
svd_topic_vectors_df.sort_values(by=['topic0'], ascending=False)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7
6. Writings_of_Bahaullah,0.478647,-0.064647,-0.109365,-0.092132,-0.183442,-0.291702,-0.415482,-1.945281e-20
1. King_James_bible,0.42772,0.056243,0.069526,-0.046566,-0.090733,-0.329413,0.434591,9.804731999999999e-20
2. English-Quran-plain-text,0.3682,0.184425,0.09586,0.340857,0.625345,0.212058,-0.046822,2.0767849999999999e-19
3. Bhagavad_Gita,0.253281,-0.318507,-0.24217,-0.132399,-0.311955,0.510358,0.081064,7.555959e-20
5. Complete Jewish Bible,-0.230018,0.274922,0.706874,-0.202525,-0.217046,0.1298,-0.067139,2.434139e-20
9. Kojiki_Japan_,-0.406269,0.625031,-0.498734,-0.040388,-0.073061,-5.7e-05,0.007126,5.2889459999999996e-20
8. tao_te_ching,-0.43345,-0.41291,-0.058807,-0.471284,0.433957,-0.114951,0.008024,3.957335e-20
4. Tipitaka,-0.458111,-0.344556,0.036816,0.644438,-0.183064,-0.116092,-0.001362,-4.2562089999999995e-19
