In [None]:
pip install nltk

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the NLTK data packages this script relies on.
nltk.download('punkt')
# Newer NLTK releases make word_tokenize load its model from 'punkt_tab'
# instead of 'punkt'; without it tokenization raises LookupError. Older
# releases that do not know this id just report it and carry on.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Shared NLP helpers used by the text-processing functions.
lemmatizer = WordNetLemmatizer()  # dictionary-based word normalizer
stemmer = PorterStemmer()  # rule-based suffix stripper
stop_words = set(stopwords.words('english'))  # set for fast membership tests

# Source data: one row per verse translation.
df = pd.read_csv('updated_quran_translation.csv')

def clean_text(text):
    """Strip non-letters, lowercase the text, and drop stopwords.

    Args:
        text: The string to clean.

    Returns:
        The surviving lowercase tokens joined by single spaces.
    """
    # Replace (rather than delete) anything that is not an ASCII letter or
    # whitespace. Deleting fused words that were separated only by
    # punctuation ("heaven-and-earth" -> "heavenandearth") and collapsed
    # contractions/possessives ("don't" -> "dont", "God's" -> "gods") so they
    # could no longer match a stopword. With a space, the pieces become
    # separate tokens that the stopword filter below can inspect on their own.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Lowercase so tokens compare equal to the lowercase stopword entries.
    text = text.lower()

    # Tokenize and keep only tokens that are not in the stopword set.
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]

    return ' '.join(words)

def stem_text(text):
    """Return `text` with every token replaced by its Porter stem.

    The text is split with `word_tokenize`, each token is passed through the
    module-level `stemmer`, and the results are joined with single spaces.
    """
    return ' '.join(map(stemmer.stem, word_tokenize(text)))

def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma.

    The text is split with `word_tokenize`, each token is passed through the
    module-level `lemmatizer` using its default settings, and the results are
    joined with single spaces.
    """
    tokens = word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


# Clean every translation. Missing cells are mapped to an empty string:
# str() of a missing value is the text "nan", which would otherwise pass
# through cleaning as a real word and end up in every derived column.
df['Cleaned Text'] = df['Verse Translation'].apply(
    lambda x: '' if pd.isna(x) else clean_text(str(x))
)


# Derive both normalized variants from the cleaned text.
df['Stemmed Text'] = df['Cleaned Text'].apply(lambda x: stem_text(str(x)))  # stemming
df['Lemmatized Text'] = df['Cleaned Text'].apply(lambda x: lemmatize_text(str(x)))  # lemmatization

# Persist the original columns plus the three new ones, without the index.
df.to_csv('processed_quran_translation.csv', index=False)

print("Text cleaning, tokenization, and stemming/lemmatization completed successfully.")
