In [3]:
import hashlib
import html
import os
import re
from io import BytesIO

import docx
import nltk
import openai
import PyPDF2
import streamlit as st
from nltk.tokenize import sent_tokenize

In [4]:
# Ensure the NLTK 'punkt' tokenizer data is available.
# Look for a local copy first so the download server is only contacted when
# the data is actually missing, rather than on every execution of this cell.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

def extract_text_docx(file):
    """Return the text of a DOCX document, one paragraph per line.

    `file` is handed unchanged to `docx.Document`. Every paragraph of the
    document contributes one line, so empty paragraphs show up as blank lines
    in the result.
    """
    document = docx.Document(file)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

def extract_text_pdf(file):
    """Return the text of a PDF, with each page's text followed by a newline.

    `file` is handed unchanged to `PyPDF2.PdfReader`. Pages whose
    `extract_text()` result is empty or None contribute nothing to the output.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    extracted_pages = (page.extract_text() for page in pdf_reader.pages)
    return "".join(content + "\n" for content in extracted_pages if content)

def split_into_clauses(text):
    """Split contract text into clauses using line-leading legal numbering.

    A clause starts at a line that begins with a number such as "1", "1.",
    "1.1" or "1.1.1." followed by whitespace. Each returned item is the
    clause number followed by the clause text (which may span several lines).
    Text that appears before the first numbered clause is not returned.

    If the text contains no numbered clauses at all, it is split into its
    non-empty lines instead.

    Args:
        text: The full document text.

    Returns:
        A list of clause strings, in document order.
    """
    # (?:\A|\n)  - the number must open a line, including the very first line
    # \.?        - accept the common "1." style with a trailing dot
    # (?=\s)     - require whitespace after the number without consuming it,
    #              so a heading that is immediately followed by another
    #              numbered line does not hide that next line
    clause_pattern = re.compile(r'(?:\A|\n)(\d+(?:\.\d+)*\.?)(?=\s)')
    headings = list(clause_pattern.finditer(text))

    combined_clauses = []
    for position, heading in enumerate(headings):
        # A clause body runs up to the start of the next heading (or the end of the text).
        if position + 1 < len(headings):
            body_end = headings[position + 1].start()
        else:
            body_end = len(text)
        clause_text = text[heading.end():body_end].strip()
        combined_clauses.append(f"{heading.group(1)} {clause_text}".strip())

    # If no numbering is found, split by paragraphs
    if not combined_clauses:
        combined_clauses = [para.strip() for para in text.split('\n') if para.strip()]

    return combined_clauses

def analyze_clause(msa_clause, vendor_clause, model="gpt-3.5-turbo-instruct"):
    """Ask an OpenAI completions model to compare two contract clauses.

    The API key is read from the OPENAI_API_KEY environment variable. Failures
    are reported as a human-readable string instead of being raised, so the
    caller can show the message in place of the analysis.

    Args:
        msa_clause: Clause text from the Master Service Agreement.
        vendor_clause: Clause text from the vendor contract.
        model: Name of the completions-endpoint model to use.

    Returns:
        The model's analysis with surrounding whitespace removed, or an
        explanatory message if the API key is missing or the request fails.
    """
    openai_api_key = os.getenv('OPENAI_API_KEY')
    if not openai_api_key:
        return "OpenAI API key not found. Please set the OPENAI_API_KEY environment variable."

    prompt = (
        "You are an experienced attorney with expertise in contract law. "
        "Analyze the following two contract clauses and identify any discrepancies or deviations. "
        "Explain the potential legal implications of these discrepancies.\n\n"
        f"Master Service Agreement (MSA) Clause:\n{msa_clause}\n\n"
        f"Vendor Contract Clause:\n{vendor_clause}\n\n"
        "Analysis:"
    )
    try:
        openai.api_key = openai_api_key
        # openai>=1.0 removed `openai.Completion`; the same endpoint is exposed
        # there as `openai.completions`. Pick whichever the installed SDK has.
        if hasattr(openai, "OpenAI"):
            create_completion = openai.completions.create
        else:
            create_completion = openai.Completion.create
        response = create_completion(
            model=model,  # `model=` is accepted by both SDK generations
            prompt=prompt,
            max_tokens=300,
            n=1,
            stop=None,
            temperature=0.2,
        )
        return response.choices[0].text.strip()
    except Exception as e:
        # Deliberately broad: any SDK/network failure is surfaced as text.
        return f"Error generating analysis: {str(e)}"


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abhishekshah/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Streamlit App

# MIME types reported by the uploader for the two supported formats.
_PDF_MIME = "application/pdf"
_DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

# st.session_state slot that remembers the most recent comparison.
_COMPARISON_CACHE_KEY = "comparison_cache"

# Relative widths of the MSA / vendor / analysis columns.
_COLUMN_WIDTHS = [2, 2, 5]


def _extract_uploaded_text(uploaded_file):
    """Return the text of an uploaded PDF or DOCX, or None for any other type."""
    if uploaded_file.type == _PDF_MIME:
        extractor = extract_text_pdf
    elif uploaded_file.type == _DOCX_MIME:
        extractor = extract_text_docx
    else:
        return None
    # Rewind in case the upload buffer has already been read.
    uploaded_file.seek(0)
    return extractor(uploaded_file)


def _file_digest(uploaded_file):
    """Return a SHA-256 hex digest of the uploaded file's bytes (used as a cache key)."""
    return hashlib.sha256(uploaded_file.getvalue()).hexdigest()


def _compare_clauses(msa_clauses, vendor_clauses):
    """Pair clauses by position and build one result record per pair.

    Clauses present in both documents are sent to `analyze_clause`; a clause
    present in only one document gets a fixed "missing" explanation.
    """
    # For demonstration, assume clauses are aligned by numbering
    # In practice, more sophisticated alignment may be necessary
    discrepancies = []
    for i in range(max(len(msa_clauses), len(vendor_clauses))):
        msa_clause = msa_clauses[i] if i < len(msa_clauses) else ""
        vendor_clause = vendor_clauses[i] if i < len(vendor_clauses) else ""

        if msa_clause and vendor_clause:
            discrepancies.append({
                "Clause Number": f"Clause {i+1}",
                "MSA Clause": msa_clause,
                "Vendor Clause": vendor_clause,
                "Analysis": analyze_clause(msa_clause, vendor_clause)
            })
        elif msa_clause:
            discrepancies.append({
                "Clause Number": f"Clause {i+1}",
                "MSA Clause": msa_clause,
                "Vendor Clause": "Missing in Vendor Contract",
                "Analysis": "The vendor contract is missing this clause. This could lead to potential gaps in obligations and rights."
            })
        elif vendor_clause:
            discrepancies.append({
                "Clause Number": f"Clause {i+1}",
                "MSA Clause": "Missing in MSA",
                "Vendor Clause": vendor_clause,
                "Analysis": "The MSA is missing this clause. This could introduce unexpected obligations or rights."
            })
    return discrepancies


def _generate_html_report(discrepancies):
    """Render the comparison records as a standalone HTML table.

    All record values are HTML-escaped: they come from uploaded documents and
    model output, so they may contain characters such as '<' or '&'.
    """
    columns = ("Clause Number", "MSA Clause", "Vendor Clause", "Analysis")
    rows = []
    for record in discrepancies:
        cells = "".join(
            f"<td>{html.escape(str(record[column]))}</td>" for column in columns
        )
        rows.append(f"        <tr>{cells}</tr>")

    return "\n".join([
        "<html>",
        "<head>",
        '    <meta charset="utf-8">',
        "    <style>",
        "        table {width: 100%; border-collapse: collapse;}",
        "        th, td {border: 1px solid #ddd; padding: 8px;}",
        "        th {background-color: #f2f2f2;}",
        "        s {color: red;}",
        "    </style>",
        "</head>",
        "<body>",
        "    <h2>Contract Comparison Report</h2>",
        "    <table>",
        "        <tr>"
        "<th>Clause Number</th>"
        "<th>MSA Clause</th>"
        "<th>Vendor Contract Clause</th>"
        "<th>Analysis &amp; Discrepancies</th>"
        "</tr>",
        *rows,
        "    </table>",
        "</body>",
        "</html>",
    ])


def main():
    """Streamlit entry point: upload two contracts, compare them clause by clause, show and export the result.

    The comparison is stored in `st.session_state` keyed by the content of the
    two uploads, so interacting with the page (for example downloading the
    report) does not repeat the per-clause API calls. The "Re-run analysis"
    button forces a fresh comparison.
    """
    st.set_page_config(page_title="Semantic Contract Comparator with AI", layout="wide")
    st.title("Master Service Agreement (MSA) vs. Vendor Contract Comparator with Semantic Analysis")

    st.markdown("""
    **Instructions**:
    1. Upload your Master Service Agreement (MSA) and the Vendor's Contract in PDF or DOCX formats.
    2. The application will segment the documents into clauses and perform a semantic comparison.
    3. Discrepancies will be highlighted with AI-generated explanations of potential legal implications.
    """)

    msa_file = st.file_uploader("Upload Master Service Agreement (MSA)", type=["pdf", "docx"])
    vendor_file = st.file_uploader("Upload Vendor Contract", type=["pdf", "docx"])

    if not (msa_file and vendor_file):
        return

    rerun_requested = st.button("Re-run analysis")
    cache_key = (_file_digest(msa_file), _file_digest(vendor_file))
    cached = st.session_state.get(_COMPARISON_CACHE_KEY)

    if rerun_requested or cached is None or cached["key"] != cache_key:
        with st.spinner('Processing documents and analyzing clauses...'):
            msa_text = _extract_uploaded_text(msa_file)
            if msa_text is None:
                st.error("Unsupported MSA file format.")
                return

            vendor_text = _extract_uploaded_text(vendor_file)
            if vendor_text is None:
                st.error("Unsupported Vendor Contract file format.")
                return

            discrepancies = _compare_clauses(
                split_into_clauses(msa_text),
                split_into_clauses(vendor_text),
            )
        st.session_state[_COMPARISON_CACHE_KEY] = {
            "key": cache_key,
            "discrepancies": discrepancies,
        }
    else:
        discrepancies = cached["discrepancies"]

    if not discrepancies:
        st.warning("No text could be extracted from the uploaded documents.")
        return

    st.success('Semantic Comparison Complete!')

    # Display Results
    st.header("Comparison Results")

    msa_header, vendor_header, analysis_header = st.columns(_COLUMN_WIDTHS)
    msa_header.subheader("Master Service Agreement (MSA) Clause")
    vendor_header.subheader("Vendor Contract Clause")
    analysis_header.subheader("Analysis & Discrepancies")

    # A fresh row of columns per clause keeps the three cells of one clause
    # side by side even when their texts differ in length.
    for discrepancy in discrepancies:
        msa_col, vendor_col, analysis_col = st.columns(_COLUMN_WIDTHS)
        msa_col.markdown(f"**{discrepancy['Clause Number']}**\n\n{discrepancy['MSA Clause']}")
        vendor_col.markdown(f"**{discrepancy['Clause Number']}**\n\n{discrepancy['Vendor Clause']}")
        analysis_col.markdown(f"**{discrepancy['Analysis']}**")
        st.markdown("---")

    # Rendered directly rather than behind st.button: a button is only True
    # for the single rerun after its click, so a nested download button
    # would vanish on the next interaction.
    st.download_button(
        label="Download Comparison Report",
        data=_generate_html_report(discrepancies),
        file_name="comparison_report.html",
        mime="text/html"
    )

# Launch the app only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()