# Document Parsing and Summarisation

Document parsing is the process of converting a document, such as a Microsoft Word DOCX file or a PDF, into a structured format that can be easily processed by a computer. Document summarisation is the process of extracting the most important information from a document. This can be done by extracting the most important sentences or paragraphs from the document.

In [None]:
%pip install nltk openai python-dotenv python-docx

In [None]:
from openai import OpenAI
import os
import dotenv

# Load variables from a .env file into the process environment.
dotenv.load_dotenv()

# Read the OpenAI API key from the environment (None when it is not set).
api_key = os.environ.get('OPENAI_API_KEY')

# Client object used by the later cells to send chat-completion requests.
openai = OpenAI(api_key=api_key)

In [None]:
import docx

# Function to extract text from a docx file
def get_text_from_docx(filename: str) -> str:
    """
    Read a Word document and return the text of its paragraphs.

    Args:
    filename (str): Path to the Word document.

    Returns:
    str: The text of each paragraph in the document, joined by newlines.
    """
    document = docx.Document(filename)

    # One line of output per paragraph, in document order.
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

# Load the sample document and print its extracted text.
text = get_text_from_docx('Government of Valoria.docx')
print(text)

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest

def analyse(text: str, num_sentences: int = 3) -> list:
    """
    Pick the highest-scoring sentences of a text (extractive summary).

    Each word that is neither an English stopword nor a punctuation token is
    counted across the whole text. A sentence's score is the sum of the
    counts of the counted words it contains.

    Args:
    text (str): The text to analyse. It is split into paragraphs on newlines
        before sentence tokenisation.
    num_sentences (int): Maximum number of sentences to return.

    Returns:
    list: Up to ``num_sentences`` sentences (str), highest score first.
        Sentences containing no counted word are never returned.

    Note:
    Relies on NLTK's sentence tokenizer and stopwords corpus being
    available locally (presumably via ``nltk.download`` - confirm for the
    target environment).
    """
    from collections import Counter

    # Tokenise paragraph by paragraph so a sentence never spans a line break.
    sentences = []
    for para in text.split('\n'):
        sentences.extend(sent_tokenize(para))

    # Word frequencies over the whole text, ignoring stopwords and punctuation.
    stop_words = set(stopwords.words('english')).union(punctuation)
    word_freq = Counter(
        word for word in word_tokenize(text.lower()) if word not in stop_words
    )

    # Score each DISTINCT sentence once. Scores are keyed by sentence text, so
    # iterating over repeated occurrences would add the same score again and
    # inflate the rank of any sentence that appears more than once.
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    sentence_scores = {}
    for sentence in dict.fromkeys(sentences):
        counts = [
            word_freq[word]
            for word in word_tokenize(sentence.lower())
            if word in word_freq
        ]
        if counts:
            sentence_scores[sentence] = sum(counts)

    return nlargest(num_sentences, sentence_scores, key=sentence_scores.get)

# Print the top-ranked sentences as a numbered list starting at 1.
for index, sentence in enumerate(analyse(text)):
    print(f"{index + 1}. {sentence}")

In [None]:
def summarise(text: str, model: str = "gpt-3.5-turbo") -> str:
    """
    Summarise the input text into three key points using a chat model.

    Args:
    text (str): The text to summarise.
    model (str): Name of the chat-completion model to request.
        Defaults to "gpt-3.5-turbo".

    Returns:
    str: The model's reply. If the request fails for any reason, the error
        message is returned instead of raising, so the result should not be
        assumed to be a summary without checking.
    """
    messages = [
        {
            "role": "system",
            "content": "You will summarise submitted text into 3 main key points."
        },
        {
            "role": "user",
            "content": text
        }
    ]

    try:
        response = openai.chat.completions.create(
            model=model,
            messages=messages,
        )

        # The reply content is optional in the API response; normalise a
        # missing value to an empty string so the return type is always str.
        return response.choices[0].message.content or ""

    except Exception as e:
        # Deliberate best-effort: report the failure as text rather than
        # interrupting the notebook run.
        return str(e)

# Summarise the extracted document text and print the result
# (summarise returns the error message as text if the request failed).
print("Summarised text:", summarise(text))