# Data Preparation

In [33]:
import pandas as pd

### Mahabharata

In [34]:
# Common prefix (directory + "Book ") of every Mahabharata CSV file name.
path = "./Vedanta_Datasets/Mahabharata_English/Book "

# Books stored as several "Book N - Part K.csv" files, mapped to their part count.
# Every other book (1..18) lives in a single "Book N.csv" file.
multi_part_books = {12: 3, 13: 2}

# Build the ordered list of files up front so the read logic is written only once.
mahabharata_files = []
for i in range(1, 19):
    part_count = multi_part_books.get(i)
    if part_count is None:
        mahabharata_files.append(f"{path}{i}.csv")
    else:
        mahabharata_files.extend(
            [f"{path}{i} - Part {part}.csv" for part in range(1, part_count + 1)]
        )

# One DataFrame per successfully read file, in book/part order.
df_mahabharata = []
for file_path in mahabharata_files:
    try:
        mahabharata_df = pd.read_csv(file_path, encoding='latin1')
    except UnicodeDecodeError:
        # NOTE(review): Python's latin1 codec maps every byte value to a character, so
        # this handler is not expected to fire. If the files are really UTF-8, reading
        # them as latin1 would yield garbled text rather than an error - confirm the
        # source encoding.
        print(f"Error reading file {file_path}. Skipping this file.")
        continue
    df_mahabharata.append(mahabharata_df)

# Concatenate all DataFrames into a single DataFrame with a fresh 0..n-1 index
mahabharata_combined = pd.concat(df_mahabharata, ignore_index=True)

In [35]:
mahabharata_combined.shape

(11441, 6)

In [36]:
# Pull out the "Paragraph Text" column as a Series; it is the only column of
# mahabharata_combined that the later cells shown here use.
mahabharata_text = mahabharata_combined.loc[:, "Paragraph Text"]
mahabharata_text.tail()

11436    "I have thus indicated the ordinances, O forem...
11437    "In the Vedas, in the Ramayana, and in the sac...
11438    "I have thus, O chief of men, told everything ...
11439                  The End of the Svargarohanika-parva
11440    The Eighteen parvas of the Mahabharata are thu...
Name: Paragraph Text, dtype: object

In [37]:
mahabharata_text.shape #This is the final text of Mahabharata 

(11441,)

### Modern Books (Yogananda)

In [38]:
# Read the Yogananda CSV from the "Modern Books" folder and preview its first rows.
yogananda_csv = "./Vedanta_Datasets/Modern Books/Yogananda_AoY.csv"
yogananda = pd.read_csv(yogananda_csv)
yogananda.head()


Unnamed: 0,author,volume,topic,subtopic,url,text
0,Autobigraphy of a Yogi by Yogananda,1,My Parents and Early Life,My Parents and Early Life,https://www.gutenberg.org/cache/epub/7452/pg74...,MY PARENTS AND EARLY LIFE The characteristi...
1,Autobigraphy of a Yogi by Yogananda,1,My Parents and Early Life,My Parents and Early Life,https://www.gutenberg.org/cache/epub/7452/pg74...,The helpless humiliations of infancy are not b...
2,Autobigraphy of a Yogi by Yogananda,1,My Parents and Early Life,My Parents and Early Life,https://www.gutenberg.org/cache/epub/7452/pg74...,My far-reaching memories are not unique. Many ...
3,Autobigraphy of a Yogi by Yogananda,1,My Parents and Early Life,My Parents and Early Life,https://www.gutenberg.org/cache/epub/7452/pg74...,I was born in the last decade of the nineteent...
4,Autobigraphy of a Yogi by Yogananda,1,My Parents and Early Life,My Parents and Early Life,https://www.gutenberg.org/cache/epub/7452/pg74...,In Mother’s presence we tasted our earliest bi...


In [39]:
yogananda.shape

(1191, 6)

In [40]:
# Reduce the loaded frame to its "text" column. The name `yogananda` is rebound from
# a DataFrame to a Series here, so the selection is guarded: re-running this cell on
# the Series would otherwise look up "text" in the row index and raise KeyError.
if isinstance(yogananda, pd.DataFrame):
    yogananda = yogananda["text"]
yogananda.head()

0     MY PARENTS AND EARLY LIFE   The characteristi...
1    The helpless humiliations of infancy are not b...
2    My far-reaching memories are not unique. Many ...
3    I was born in the last decade of the nineteent...
4    In Mother’s presence we tasted our earliest bi...
Name: text, dtype: object

In [41]:
yogananda.shape

(1191,)

### Patanjali

In [42]:
# Read the Patanjali Yoga Sutras verses CSV and preview its first rows.
patanjali_csv = "./Vedanta_Datasets/Patanjali_Yoga_Sutras/Patanjali_Yoga_Sutras_Verses_English.csv"
patanjali = pd.read_csv(patanjali_csv)
patanjali.head()

Unnamed: 0,Chapter,Verse,Sanskrit,Word Meanings,Translation
0,1,1,अथ योगानुशासनम्,"अथ = now; योग = process of yoking, union; अनुश...","Now, the teachings of yoga are presented since..."
1,1,2,योगश्चित्तवृत्तिनिरोधः,"योग = process of yoking, union; चित्त = cons...",The purpose of yoga is to quieten the fluctuat...
2,1,3,तदा द्रष्टुः स्वरूपेऽवस्थानम्,"तदा = then; द्रष्टु = seer, witness, pure aw...","Once the state of Yoga is achieved, pure consc..."
3,1,4,वृत्तिसारूप्यमितरत्र,"वृत्ति = patterning, turnings, movements; सारू...","Till the state of Yoga is achieved, consciousn..."
4,1,5,वृत्तयः पञ्चतय्यः क्लिष्टाक्लिष्टाः,"वृत्तयः = patterning, turnings, movements; पञ्...",These mental patterns can be classified into f...


In [43]:
patanjali.shape

(195, 5)

In [44]:
# Reduce the loaded frame to its translation column. The column label really is
# "Translation " with a trailing space, so the key must keep that space.
# The name `patanjali` is rebound from a DataFrame to a Series here, so the selection
# is guarded: re-running this cell on the Series would otherwise raise KeyError.
if isinstance(patanjali, pd.DataFrame):
    patanjali = patanjali["Translation "]
patanjali.head()

0    Now, the teachings of yoga are presented since...
1    The purpose of yoga is to quieten the fluctuat...
2    Once the state of Yoga is achieved, pure consc...
3    Till the state of Yoga is achieved, consciousn...
4    These mental patterns can be classified into f...
Name: Translation , dtype: object

In [45]:
patanjali.shape #Final text dataset for Patanjali

(195,)

In [46]:
# Stack the three text Series end to end under a fresh 0..n-1 index.
text_parts = [mahabharata_text, yogananda, patanjali]
folders_done_3 = pd.concat(text_parts, ignore_index=True)

# Show the shape of the combined Series
folders_done_3.shape

(12827,)

### Upanishads

In [47]:
# Load the three Upanishad CSV files, one DataFrame each, in the order listed.
Upanishad_Katha, Upanishad_Mandukya, Upanishad_Isavasya = (
    pd.read_csv(upanishad_csv)
    for upanishad_csv in (
        "./Vedanta_Datasets/Upanishads/Upanishad_Katha.csv",
        "./Vedanta_Datasets/Upanishads/Upanishad_Mandukya.csv",
        "./Vedanta_Datasets/Upanishads/Upanishad_Isavasya.csv",
    )
)

In [48]:
# Print the (rows, columns) shape of each Upanishad frame, one per line.
for upanishad_df in (Upanishad_Katha, Upanishad_Mandukya, Upanishad_Isavasya):
    print(upanishad_df.shape)

(119, 4)
(12, 4)
(18, 4)


In [49]:
# Print the column index of each Upanishad frame to compare their headers.
for upanishad_df in (Upanishad_Katha, Upanishad_Mandukya, Upanishad_Isavasya):
    print(upanishad_df.columns)

Index(['Chapter/Valli', 'Verse', 'Sanskrit', 'Translation'], dtype='object')
Index(['Chapter', 'Verse', 'Sanskrit', 'Translation'], dtype='object')
Index(['Chapter', 'Verse', 'Sanskrit', 'Translation'], dtype='object')


In [50]:
# Katha labels its chapter column 'Chapter/Valli'; rename it to 'Chapter' so all
# three Upanishad frames share the same column names.
katha_column_fix = {'Chapter/Valli': 'Chapter'}
Upanishad_Katha = Upanishad_Katha.rename(columns=katha_column_fix)

In [51]:
# Print each frame's column index again to check that the headers now match.
for upanishad_df in (Upanishad_Katha, Upanishad_Mandukya, Upanishad_Isavasya):
    print(upanishad_df.columns)

Index(['Chapter', 'Verse', 'Sanskrit', 'Translation'], dtype='object')
Index(['Chapter', 'Verse', 'Sanskrit', 'Translation'], dtype='object')
Index(['Chapter', 'Verse', 'Sanskrit', 'Translation'], dtype='object')


In [52]:
# Stack the three Upanishad frames under a fresh 0..n-1 index, then keep only the
# 'Translation' column as a Series.
upanishads_list = [Upanishad_Katha, Upanishad_Mandukya, Upanishad_Isavasya]
upanishads_concatenated = pd.concat(upanishads_list, ignore_index=True)['Translation']

In [53]:
upanishads_concatenated.shape

(149,)

In [54]:
# Append the Upanishad translations to the text gathered so far. Despite its name,
# the result is a Series (both inputs are Series), re-indexed 0..n-1.
corpus_parts = [folders_done_3, upanishads_concatenated]
four_folders_dataframe = pd.concat(corpus_parts, ignore_index=True)


AttributeError: 'Series' object has no attribute 'columns'

In [None]:
# Print the shape first, then describe(), head() and tail(), each preceded by a
# separator line.
separator = '----------------------------------------------------------->'
print(four_folders_dataframe.shape)
for summarize in (
    four_folders_dataframe.describe,
    four_folders_dataframe.head,
    four_folders_dataframe.tail,
):
    print(separator)
    print(summarize())

(12976,)
----------------------------------------------------------->
count                          12976
unique                         12741
top       (Sambhava Parva continued)
freq                              76
dtype: object
----------------------------------------------------------->
0    Om! Having bowed down to Narayana and Nara, th...
1    Ugrasrava, the son of Lomaharshana, surnamed S...
2    Accomplished in speech, Sauti, thus questioned...
3    "Sauti said, 'Having heard the diverse sacred ...
4    "The Rishi replied, 'The Purana, first promulg...
dtype: object
----------------------------------------------------------->
12971    He who knows these two, the Unmanifested and t...
12972    The face of truth (Brahman in the solar orb) i...
12973    O thou who art nourisher, the solitary travell...
12974    Let (my) vital force now attain the (all perva...
12975    O Fire! O God! Knowing, as thou do, all our de...
dtype: object


In [None]:
import os
# Directory and file name for the exported text; joined into one path below.
output_dir, output_file = 'data', 'text.csv'
output_path = os.path.join(output_dir, output_file)

# The export itself is left commented out, so this cell only computes the path.
#four_folders_dataframe.to_csv(output_path, index=False)
