In [19]:
import re
import spacy

# Load spacy model
nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Process text with spacy
    doc = nlp(text)
    
    # Extract sentences
    sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 1]
    
    # Join sentences to form the cleaned text
    cleaned_text = ' '.join(sentences)
    return cleaned_text

# Define the file path
file_path = 'Bone_marrow.txt'

# Read the content of the file
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    print("File read successfully.")
except FileNotFoundError:
    print("File not found. Please check the file path.")
    text = ""

# Clean text
cleaned_text = clean_text(text)

# Output 
print("Cleaned Text:")
print(cleaned_text)


File read successfully.
Cleaned Text:
Bone marrow
A section of bone marrow tissue
(Prussian blue-stained)
Details
System Hematopoietic system,
immune system,[1] lymphatic
system
Identifiers
Latin
medulla ossium
MeSH
D001853 (
nih.gov/record/ui?ui=D00185
3)
TA98
A13.1.01.001 (
r.ch/Public/EntryPage/TA98%
20Tree/Entity%20TA98%20E
N/13.1.01.001%20Entity%20T
A98%20EN.htm)
TA2
388 (
atomy.org/?id=388)
FMA
9608 (
logy.org/ontologies/FMA/?p=cl
asses&conceptid=
F%2Fpurl.org%2Fsig%2Fon
t%2Ffma%2Ffma9608)
Anatomical terminology
Bone marrow
Bone marrow is a semi-solid tissue found within the spongy
(also known as cancellous) portions of bones.[2] In birds and
mammals, bone marrow is the primary site of new blood cell
production 
(or 
haematopoiesis).[3] 
It 
is 
composed 
of
hematopoietic cells, marrow adipose tissue, and supportive
stromal cells. In adult humans, bone marrow is primarily
located in the ribs, vertebrae, sternum, and bones of the
pelvis.[4] Bone marrow comprises approximately 5% o

In [21]:
import nltk
from nltk.stem import PorterStemmer
from string import punctuation

stemmer = PorterStemmer()

def clean_and_stem_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Process text with spacy
    doc = nlp(text)
    
    # Extract sentences
    sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 1]
    
    # Stem each word
    stemmed_sentences = []
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        # Remove punctuation and tokens containing numbers
        words = [word for word in words if word.isalnum() and not any(char.isdigit() for char in word)]
        stemmed_words = [stemmer.stem(word) for word in words]
        stemmed_sentences.append(' '.join(stemmed_words))
    
    # Join sentences 
    cleaned_stemmed_text = ' '.join(stemmed_sentences)
    return cleaned_stemmed_text

cleaned_stemmed_text = clean_and_stem_text(cleaned_text)

print("Cleaned/Stemmed Text:")
print(cleaned_stemmed_text)

Cleaned/Stemmed Text:
bone marrow a section of bone marrow tissu prussian detail system hematopoiet system immun system lymphat system identifi latin medulla ossium mesh fma ass f t anatom terminolog bone marrow bone marrow is a tissu found within the spongi also known as cancel portion of bone in bird and mammal bone marrow is the primari site of new blood cell product or haematopoiesi it is compos of hematopoiet cell marrow adipos tissu and support stromal cell in adult human bone marrow is primarili locat in the rib vertebra sternum and bone of the pelvi bone marrow compris approxim of total bodi mass in healthi adult human such that a man weigh kg lb will have around kg lb of bone marrow human marrow produc approxim billion blood cell per day which join the system circul via permeabl vasculatur sinusoid within the medullari caviti all type of hematopoiet cell includ both myeloid and lymphoid lineag are creat in bone marrow howev lymphoid cell must migrat to other lymphoid organ thy

In [None]:
# Define the output file path (optional)
# output_file_path = 'cleaned_word.txt'

# Save the cleaned text to the file (optional)
# with open(output_file_path, 'w', encoding='utf-8') as file:
#     file.write(cleaned_text)