In [1]:
import os
import re
import spacy
import nltk
from nltk.stem import PorterStemmer
from string import punctuation
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Load the spaCy pipeline named "en_core_web_sm". clean_text() runs text
# through it and reads `doc.sents` to split the text into sentences.
nlp = spacy.load("en_core_web_sm")

# Create a single Porter stemmer instance that is reused for every token.
stemmer = PorterStemmer()

def clean_text(text):
    """Strip links and boilerplate from ``text`` and return its stemmed tokens.

    Processing steps:
      1. Remove every whitespace-delimited run that starts with ``http``
         and every literal occurrence of ``External links``.
      2. Split the remaining text into sentences with the module-level
         spaCy pipeline ``nlp``; sentences of one character or less
         (after stripping) are dropped.
      3. Tokenize each sentence with ``nltk.word_tokenize``, keep only
         tokens that are purely alphanumeric and contain no digit, and
         stem each kept token with the module-level ``stemmer``.

    Args:
        text: The raw text to clean (a ``str``).

    Returns:
        The stemmed tokens joined by exactly one space. Sentences that
        lose all of their tokens contribute nothing, so the result never
        contains doubled, leading or trailing spaces. Returns ``''`` when
        no token survives.
    """
    # `http\S+` also matches `https://...`, so a single pass removes both
    # schemes; a separate `https\S+` pass could never find anything.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'External links', '', text)

    # Sentence segmentation is delegated to spaCy.
    doc = nlp(text)

    stemmed_tokens = []
    for sent in doc.sents:
        sentence = sent.text.strip()
        # Skip empty or single-character "sentences".
        if len(sentence) <= 1:
            continue
        for token in nltk.word_tokenize(sentence):
            # isalnum() rejects punctuation; the digit check rejects tokens
            # such as "2nd" that are alphanumeric but contain numbers.
            if token.isalnum() and not any(char.isdigit() for char in token):
                stemmed_tokens.append(stemmer.stem(token))

    # Collecting tokens in one flat list (instead of joining per sentence)
    # avoids empty pieces that would otherwise produce doubled spaces.
    return ' '.join(stemmed_tokens)

# Root folder whose .txt files are cleaned.
directory = 'Dataset'

# Walk the tree and replace the content of every .txt file with the output
# of clean_text().
# WARNING: files are overwritten in place, so the raw text is lost and a
# second run of this cell feeds already-cleaned text through clean_text()
# again. Keep an untouched copy of the dataset if the raw text is needed.
for root, dirs, files in os.walk(directory):
    for filename in files:
        # Only plain-text files are processed.
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(root, filename)

        # Read the content of the file. Any read problem is reported and the
        # file is skipped, so one bad file cannot abort the whole traversal.
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
            print(f"File read successfully: {file_path}")
        except FileNotFoundError:
            print(f"File not found: {file_path}")
            continue
        except (OSError, UnicodeDecodeError) as e:
            # OSError covers permission and other I/O failures;
            # UnicodeDecodeError (a ValueError) covers files that are not
            # valid UTF-8.
            print(f"Failed to read file: {file_path}, Error: {e}")
            continue

        # Clean and stem text
        cleaned_text = clean_text(text)

        # Output cleaned text to the same file (overwrite). Failures are
        # reported but do not stop the traversal.
        try:
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(cleaned_text)
            print(f"File written successfully: {file_path}")
        except Exception as e:
            print(f"Failed to write to file: {file_path}, Error: {e}")

File read successfully: Dataset\Circulatory_System\Aorta.txt
File written successfully: Dataset\Circulatory_System\Aorta.txt
File read successfully: Dataset\Circulatory_System\Arterial_tree.txt
File written successfully: Dataset\Circulatory_System\Arterial_tree.txt
File read successfully: Dataset\Circulatory_System\Artery.txt
File written successfully: Dataset\Circulatory_System\Artery.txt
File read successfully: Dataset\Circulatory_System\Atrium_(heart).txt
File written successfully: Dataset\Circulatory_System\Atrium_(heart).txt
File read successfully: Dataset\Circulatory_System\Blood.txt
File written successfully: Dataset\Circulatory_System\Blood.txt
File read successfully: Dataset\Circulatory_System\Blood_plasma.txt
File written successfully: Dataset\Circulatory_System\Blood_plasma.txt
File read successfully: Dataset\Circulatory_System\Blood_pressure.txt
File written successfully: Dataset\Circulatory_System\Blood_pressure.txt
File read successfully: Dataset\Circulatory_System\Blood_

In [None]:
# Sanity-check cell: defines a second cleaning function, applies it to
# `cleaned_text` and prints the result.
# This cell uses `re`, `nlp` and `cleaned_text` without defining them, so the
# previous cell must have been run first.

import nltk
from nltk.stem import PorterStemmer
from string import punctuation

# Re-create the Porter stemmer used by clean_and_stem_text() below.
stemmer = PorterStemmer()

def clean_and_stem_text(text):
    """Return ``text`` with links removed and every kept word stemmed.

    Every whitespace-delimited run starting with ``http`` is deleted, the
    rest is split into sentences by the module-level spaCy pipeline
    ``nlp``, and each sentence longer than one character is tokenized with
    ``nltk.word_tokenize``. Only purely alphanumeric tokens without any
    digit are kept and passed through the module-level ``stemmer``.

    Args:
        text: The text to process (a ``str``).

    Returns:
        One string: the stemmed words of each sentence joined by a space,
        and the sentences themselves joined by a space. A sentence whose
        tokens are all filtered out still contributes an empty piece, so
        the result may contain consecutive spaces.
    """
    def _is_plain_word(token):
        # Alphanumeric only, and no digit anywhere in the token.
        return token.isalnum() and not any(ch.isdigit() for ch in token)

    without_links = re.sub(r'http\S+', '', text)
    parsed = nlp(without_links)

    pieces = []
    for span in parsed.sents:
        stripped = span.text.strip()
        # Ignore empty or single-character sentences.
        if len(stripped) <= 1:
            continue
        plain_words = filter(_is_plain_word, nltk.word_tokenize(stripped))
        pieces.append(' '.join(map(stemmer.stem, plain_words)))

    return ' '.join(pieces)

# Process the text left in `cleaned_text` by the previous cell and show it
# under a heading.
# NOTE(review): `cleaned_text` is already the clean_text() output for the last
# file handled by the traversal loop, so it is cleaned and stemmed a second
# time here; confirm that this is intended.
cleaned_stemmed_text = clean_and_stem_text(cleaned_text)
print("Cleaned/Stemmed Text:", cleaned_stemmed_text, sep="\n")