Using "glob" module to get the file names in a list from .txt and .docx files​

For each file, removed punctuation and stop words​

Produced a single .dat file containing the name of each file in quotes, a colon, then a list of words separated by commas​. The list of words per file should be unique for that file. Do not include URLs or phone numbers. Words should be made lowercase. 

Example output:

"File 1.txt" : word1, word2, word3, word7​ "name of file.docx" : word8, word2, word1, word10​ "another file.doc" : word1, word12, word6​

In [80]:
import glob
import os
import string
from nltk.corpus import stopwords #access to stopwords dataset in the NLTK'
from docx import Document 

#download stopwords if not there 
#https://stackoverflow.com/questions/41610543/corpora-stopwords-not-found-when-import-nltk-library 
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aminaelashry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [81]:
#folder containing the files
files = "./week_10_txt_and_docx"

#get all text file names using glob
file_names = glob.glob(os.path.join(files, "*.txt")) +  glob.glob(os.path.join(files, "*.docx")) #https://www.geeksforgeeks.org/python-os-path-join-method/ 
print(file_names)

['./week_10_txt_and_docx/random_text.txt', './week_10_txt_and_docx/how_rubber_goods_are_made.txt', './week_10_txt_and_docx/52256-0.txt', './week_10_txt_and_docx/pg43994.txt', './week_10_txt_and_docx/most_boring_part2.txt', './week_10_txt_and_docx/blind_text.txt', './week_10_txt_and_docx/pg14895.txt', './week_10_txt_and_docx/53031-0.txt', './week_10_txt_and_docx/58108-0.txt', './week_10_txt_and_docx/pg12814.txt', './week_10_txt_and_docx/smiley_the_bunny.txt', './week_10_txt_and_docx/most_boring_ever.txt', './week_10_txt_and_docx/dr_yawn.txt', './week_10_txt_and_docx/week_10_document1.docx', './week_10_txt_and_docx/week_10_document2.docx']


In [82]:
#define stop words in English and punctuation removal

#creates a set containing all English stop words from NLTK 
stop_words = set(stopwords.words('english'))

#creates a translation table that maps all punctuation characters to None
punctuation_table = str.maketrans('', '', string.punctuation)

In [83]:
#https://stackoverflow.com/questions/25228106/how-to-extract-text-from-an-existing-docx-file-using-python-docx
#function to read content from .docx files 
def read_docx(file_path):
    doc = Document(file_path)
    return "\n".join([para.text for para in doc.paragraphs])

In [84]:
#function to clean text (remove punctuation, stopwords, URLs, phone numbers, and convert to lowercase)
#https://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate 
#https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python
#https://stackoverflow.com/questions/18082130/python-regex-to-remove-all-words-which-contains-number 
#https://www.geeksforgeeks.org/removing-stop-words-nltk-python/ 

def clean_text(text):
    #remove punctuation
    text = text.translate(punctuation_table)
    
    #remove URLs
    text = ' '.join([word for word in text.split() if not word.startswith('http')])
    
    #remove phone numbers (simple regex pattern)
    text = ' '.join([word for word in text.split() if not word.isdigit() and not (word.startswith('+') and len(word) > 1)])
    
    #convert to lowercase and remove stopwords
    words = [word.lower() for word in text.split() if word.lower() not in stop_words]

    return list(set(words))

In [85]:
#dictionary to store results for each file
file_word_dict = {}

#processing each .txt and docx file
for file_path in file_names:
    # Read the file content based on the file extension
    if file_path.endswith(".txt"):
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            text = file.read()
    elif file_path.endswith(".docx"):
        text = read_docx(file_path)
    else:
        continue  # Skip any non-txt/docx files, if found

    #clean the text and store results
    cleaned_words = clean_text(text)
    file_name = os.path.basename(file_path)
    file_word_dict[file_name] = cleaned_words

print(file_word_dict)



In [86]:
#store the output in .dat format
output_lines = []
for file_name, words in file_word_dict.items():
    line = f'"{file_name}" : {", ".join(words)}'
    output_lines.append(line)

print(output_lines)



In [87]:
#https://stackoverflow.com/questions/69989331/how-to-write-in-a-dat-file-in-python
#write output to a .dat file
output_file_path = "./cleaned_words_output.dat"
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    output_file.write('\n'.join(output_lines))

output_file_path

'./cleaned_words_output.dat'