# Milestone 1

### Importing Needed Libraries

In [None]:
import pdfplumber 
import fitz 
import os 
import re 
import json 
import pandas as pd
import nltk 
import matplotlib.pyplot as plt
import seaborn as sns
import nlp
from wordcloud import WordCloud
from collections import Counter
import spacy
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')


### Connecting the code to the files in the repo's PDF folder

In [None]:
# Folder (relative to the working directory) that holds the source PDFs.
pdf_dir = 'PDF files/'
# os.listdir returns entries in arbitrary order, so sort them to keep the
# menu numbering stable between runs; match the extension case-insensitively
# so files named "*.PDF" are listed too.
pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith('.pdf'))

### Defining functions for extraction, preprocessing, tokenizing, and removing stop words

In [None]:
#Text Extraction
def extract_text(pdf_path):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The text of all pages joined by newlines, with leading and
        trailing whitespace stripped. Empty string for a PDF with no text.
    """
    # The context manager closes the document even if a page fails to parse,
    # so the file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]
    return "\n".join(page_texts).strip()

#Cleaning and removing noise
def clean_text(text):
    """Normalize raw extracted text ahead of sentence tokenization.

    Removes every character that is not an ASCII letter, a digit, whitespace
    or sentence-ending punctuation (``.``, ``?``, ``!``), collapses each run
    of whitespace into a single space, and lower-cases the result.

    Args:
        text: Raw text, e.g. as extracted from a PDF.

    Returns:
        The cleaned, lower-cased text without leading or trailing whitespace.
    """
    # Strip noise first (while keeping whitespace) so that deleting a symbol
    # that stood between two spaces cannot leave a double space behind.
    text = re.sub(r'[^a-zA-Z0-9.?!\s]', '', text)
    # Collapse spaces, tabs and newlines into single spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

#tokenizing into sentences 
def tokenize_text(text):
    """Split text into sentences with NLTK's ``sent_tokenize``.

    Args:
        text: The text to split.

    Returns:
        A list of sentences, each stripped of surrounding whitespace;
        sentences that are empty after stripping are left out.
    """
    sentences = []
    for raw_sentence in sent_tokenize(text):
        sentence = raw_sentence.strip()
        if sentence:
            sentences.append(sentence)
    return sentences

#removing stop words in each sentence
def removing_stopwords(sentences):
    """Drop English stop words from every sentence.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one string per input sentence, made of the remaining
        tokens joined by single spaces. Stop-word matching is
        case-insensitive.
    """
    stop_words = set(stopwords.words('english'))

    def _without_stopwords(sentence):
        # Tokenize, keep only tokens whose lower-cased form is not a stop word.
        kept_tokens = [
            token
            for token in word_tokenize(sentence)
            if token.lower() not in stop_words
        ]
        return ' '.join(kept_tokens)

    return [_without_stopwords(sentence) for sentence in sentences]

### Listing all available uploaded PDF files; the user enters the number of the file they want
### Note: this will be changed later so the user can upload their own PDF file

In [None]:
# Show a 1-based menu of the discovered PDFs.
print('\nðŸ“‚ Available PDFs:')
for i, pdf in enumerate(pdf_files, 1):
    print(f'{i} : {pdf}')

# Convert the 1-based menu number into a 0-based list index.
choice = int(input("\nEnter the number of the PDF you want to extract text from: ")) - 1
if not 0 <= choice < len(pdf_files):
    # Fail here with a clear message instead of leaving `text` undefined,
    # which would only surface later as a NameError.
    raise ValueError(
        f"Invalid selection {choice + 1}: expected a number between 1 and {len(pdf_files)}"
    )

PDF_FILE = pdf_files[choice]
PDF_PATH = os.path.join(pdf_dir, PDF_FILE)
print(f"\nâœ… Extracting text from: {PDF_FILE}")
text = extract_text(PDF_PATH)
print(f'\n {text}')


### Preprocessing the text

In [None]:
# Normalize the raw text, split it into sentences, then drop stop words.
cleaned_text = clean_text(text)
# Tokenize the cleaned string (not the clean_text function object).
tokenized_text = tokenize_text(cleaned_text)
filtered_text = ' '.join(removing_stopwords(tokenized_text))


### Basic Statistics

In [None]:
# Tokens of the stop-word-filtered text; still includes punctuation tokens.
words = word_tokenize(filtered_text)

# Count only alphanumeric tokens so that '.', '?' and '!' are not reported
# as words (the same criterion the frequency plot uses).
word_tokens = [token for token in words if token.isalnum()]

total_words = len(word_tokens)
unique_words = len(set(word_tokens))
total_sent = len(tokenized_text)
print("\nðŸ“Š Basic Text Statistics:")
print(f"ðŸ”¹ Total Words: {total_words}")
print(f"ðŸ”¹ Unique Words: {unique_words}")
print(f"ðŸ”¹ Total Sentences: {total_sent}")

## Visualization Time

### 20 most frequent words

In [None]:
# Keep purely alphanumeric tokens, lower-cased, and count their occurrences.
words = list(map(str.lower, filter(str.isalnum, words)))
words_count = Counter(words)
most_common_words = words_count.most_common(20)

# Split the (word, count) pairs into the two bar-chart axes.
top_labels = [label for label, _ in most_common_words]
top_counts = [frequency for _, frequency in most_common_words]

plt.figure(figsize=(12, 6))
sns.barplot(x=top_labels, y=top_counts)
plt.title("Top 20 Most Common Words")
plt.xlabel("Words")
plt.ylabel("Frequency")
plt.xticks(rotation=45)  # tilt the labels so long words do not overlap
plt.show()

### WordCloud

In [None]:
# Build a word cloud from the stop-word-filtered text on a white canvas.
cloud_options = {"width": 800, "height": 400, "background_color": "white"}
wordcloud = WordCloud(**cloud_options).generate(filtered_text)

# Show the rendered cloud as an image without axis ticks.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Word Cloud")
plt.axis("off")
plt.show()

## Saving extracted text into .json file

In [None]:
def save_to_json(data, filename):
    """Write *data* as indented JSON into the ``Extracted text`` folder.

    The folder is created in the current working directory if it does not
    exist yet. Non-ASCII characters are written as-is (UTF-8).

    Args:
        data: Any JSON-serializable object.
        filename: Name of the file to create inside ``Extracted text``;
            used exactly as given (no extension is appended).
    """
    output_dir = 'Extracted text'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    file_path = os.path.join(output_dir, filename)
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

    print(f"âœ… Filtered text saved as {file_path}")

In [None]:
# Name the output after the PDF that was actually selected and give it a
# .json extension, instead of a hard-coded title with no extension.
save_to_json(filtered_text, os.path.splitext(PDF_FILE)[0] + '.json')