In [1]:
import spacy
import random
from spacy.training import Example
from spacy.training import offsets_to_biluo_tags
import re
import ast
import json
from sklearn.metrics import precision_score, recall_score, f1_score
from fuzzywuzzy import fuzz


# Load the pre-trained SpaCy model
nlp = spacy.load('en_core_web_sm')

# Register every custom entity label that the annotated examples and the
# citation rules refer to. 'CODE' (the reporter / section reference) is used
# by the training annotations and by the "CODE" in entities checks, so it is
# registered explicitly here together with the other labels.
# add_label is safe to call for a label the component already knows.
ner = nlp.get_pipe('ner')
ner.add_label('LAW')
ner.add_label('CASE')
ner.add_label('COURT')
ner.add_label('CODE')


# Vocabulary of tax, religious-organization and statutory terms.
# is_near_keyword() compares each entry against a sentence with
# fuzz.partial_ratio; a sentence that matches none of them is never treated
# as a citation.
# NOTE(review): "tax-exempt" and "Tax-exempt" differ only in case —
# presumably both are listed because the fuzzy comparison is case-sensitive;
# confirm before de-duplicating.
keywords = [
    "religious", 
    "tax-exempt", 
    "IRS", 
    "commercial activity", 
    "Code", 
    "Regulation", 
    "Statute", 
    "Provision", 
    "Section", 
    "Subsection", 
    "Title", 
    "Clause", 
    "Article", 
    "Amendment",
    "Church",
    "Religion",
    "Faith",
    "Worship",
    "Congregation",
    "Minister",
    "Pastor",
    "Synagogue",
    "Mosque",
    "Temple",
    "Deduction",
    "Exemption",
    "Revenue",
    "Income",
    "Taxpayer",
    "Taxable",
    "Tax-exempt",
    "Taxation",
    "IRS Code",
    "Tax shelter"
]


# Minimum fuzz.partial_ratio score (compared with >=) at which a keyword is
# considered present in a sentence.
fuzzy_threshold = 80  

def is_near_keyword(sentence):
    """Return True if any entry of ``keywords`` fuzzily matches ``sentence``.

    A keyword matches when ``fuzz.partial_ratio(keyword, sentence)`` reaches
    ``fuzzy_threshold``. Evaluation stops at the first matching keyword.
    """
    return any(
        fuzz.partial_ratio(term, sentence) >= fuzzy_threshold
        for term in keywords
    )


def read_file(file_path, encoding=None):
    """Return the entire contents of a text file as a single string.

    Args:
        file_path: Path of the file to read.
        encoding: Optional text encoding passed to ``open``. The default
            ``None`` keeps the platform default encoding, exactly as before.
            Pass ``"utf-8"`` explicitly for files containing non-ASCII
            characters such as the section sign.

    Returns:
        The decoded file contents.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()


def calculate_f1_score(correct_citations, predicted_citations):
    """Compute precision, recall and F1 for positionally paired binary labels.

    Args:
        correct_citations: Sequence of gold labels, where ``1`` marks a
            citation and anything else marks a non-citation.
        predicted_citations: Sequence of predictions, indexed in parallel
            with ``correct_citations``. Any value other than ``1`` (including
            ``0`` and ``None``) is treated as a negative prediction, so a
            missing prediction for a gold citation counts as a false negative
            instead of being silently dropped.

    Returns:
        A ``(precision, recall, f1_score)`` tuple. Each metric is ``0`` when
        its denominator is zero.

    Raises:
        IndexError: If ``predicted_citations`` is shorter than
            ``correct_citations``.
    """
    tp = fp = fn = 0

    for index, actual in enumerate(correct_citations):
        actual_positive = actual == 1
        predicted_positive = predicted_citations[index] == 1

        if actual_positive and predicted_positive:
            tp += 1
        elif predicted_positive:
            fp += 1
        elif actual_positive:
            # Covers predictions of 0 as well as None / other non-1 values.
            fn += 1

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

    return precision, recall, f1_score


def calculate_f1_score_text(correct_citations, predicted_citations):
    """Compute precision, recall and F1 by exact membership of citation texts.

    Every expected citation found in ``predicted_citations`` is a true
    positive and every one not found is a false negative. Every predicted
    citation absent from ``correct_citations`` is a false positive.

    Returns:
        A ``(precision, recall, f1_score)`` tuple. Each metric is ``0`` when
        its denominator is zero.
    """
    found_flags = [expected in predicted_citations for expected in correct_citations]
    hits = sum(found_flags)
    misses = len(found_flags) - hits
    spurious = sum(
        1 for candidate in predicted_citations if candidate not in correct_citations
    )

    predicted_total = hits + spurious
    expected_total = hits + misses

    if predicted_total > 0:
        precision = hits / predicted_total
    else:
        precision = 0

    if expected_total > 0:
        recall = hits / expected_total
    else:
        recall = 0

    if precision + recall > 0:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0

    return precision, recall, f1_score



def find_citations(text):
    """Return the lines of ``text`` that are classified as legal citations.

    Each non-blank line is stripped of surrounding whitespace and passed to
    ``is_citation``; lines for which it returns a truthy value are collected
    in their original order.

    The function works on the ``text`` argument itself rather than re-reading
    a fixed file, and keeps its working list local, so repeated calls do not
    share state.

    Args:
        text: Full document text with one candidate sentence per line.

    Returns:
        A list of the stripped lines identified as citations.
    """
    citations = []

    # Split on "\n" only: this matches how a text-mode file is iterated
    # after newline translation, so a document read from disk yields the
    # same lines as reading the file line by line.
    for raw_line in text.split("\n"):
        sentence = raw_line.strip()
        if not sentence:
            continue  # Skip blank lines
        if is_citation(sentence):
            citations.append(sentence)

    return citations


def is_citation(sentence):
    """Classify a sentence as a legal citation (1) or not (0).

    A sentence is a citation when all of the following hold:

    * it fuzzily matches a domain keyword (``is_near_keyword``);
    * the NER model finds a ``CODE`` entity in it;
    * if it contains the case marker ``v.`` it also has a ``CASE`` entity,
      otherwise it also has a ``LAW`` entity.

    Args:
        sentence: The sentence text to classify.

    Returns:
        ``1`` if the sentence is a citation, ``0`` otherwise. The result is
        always an int, including when no keyword matches.
    """
    # Cheap keyword screen first: without a keyword the answer is 0
    # regardless of the entities, so the NLP pipeline need not run.
    if not is_near_keyword(sentence):
        return 0

    labels = {ent.label_ for ent in nlp(sentence).ents}
    if "CODE" not in labels:
        return 0

    # "v." marks a case citation; anything else is treated as a statute.
    required_label = "CASE" if re.search(r"\bv\.", sentence) else "LAW"
    return 1 if required_label in labels else 0

# Annotated training examples. Each entry is a 3-tuple:
#   (text, {"entities": [(start, end, label), ...]}, is_citation_label)
# where start/end are character offsets into `text` (end exclusive) and
# is_citation_label is the expected 1/0 output of is_citation() for the text.
# NOTE(review): span boundaries are not uniform — e.g. the RUNYON example's
# CASE span (0, 18) includes the trailing comma while the DOE example's
# (0, 10) does not; confirm the offsets align with the intended entities.
train_data = [

    ("IRC § 501 defines religious organizations.", {"entities": [(0, 5, "LAW"),(6,9,"CODE")]},1),
    ("Bob Jones University v. Simon is a landmark case.", {"entities": [(0, 29, "CASE")]},0),
    ("Mormon Church v. United States, 136 U.S. 1 (1890).(Case on religious organization funding)",{"entities":[(0,31,"CASE"),(32,42,"CODE")]},1),
    ("RUNYON v. McCRARY, 427 U.S. 160 96 S. Ct. 258 (1976).",{"entities": [(0, 18, "CASE"),(19,31,"CODE"),(32,45,"COURT")]},0),
    ("SMITH v. JONES, 427 U.S. 160 97 S. Ct. 259 (1977).",{"entities": [(0, 14, "CASE"),(16,28,"CODE"),(29,42,"COURT")]},0),
    ("DOE v. ROE, 427 U.S. 160 98 S. Ct. 260 (1978).",{"entities": [(0, 10, "CASE"),(12,24,"CODE"),(25,38,"COURT")]},0),
    
    ("Hermitage Ministries Inc. v. Commissioner (73 T.C. 1106 (1979)) (Case on religious organizations and fundraising activities)",{"entities":[(0,41,"CASE"),(43,55,"COURT")]},1),
    ("Texas Heart Hospital of St. Luke's Episcopal Health Charities, Inc. v. United States (978 F.2d 280 (5th Cir. 1992)) (Case on religious hospitals and tax exemption)",{"entities":[(0,84,"CASE"),(86,98,"CODE"),(100,108,"COURT")]},1),
    ("Johnson v. Rogers, 120 F.3d 5 (1st Cir. 1997) (Case on tax exemption for religious institutions)",{"entities":[(0,17,"CASE"),(19,29,"CODE"),(31,39,"COURT")]},1),
    ("Evans v. United States, 23 F.3d 304 (4th Cir. 1994) (Case on IRS regulations)",{"entities":[(0,22,"CASE"),(24,35,"CODE"),(37,45,"COURT")]},1),
    ("Smith v. Commissioner, 15 T.C. 1 (1950) (Case on tax deductions for charitable contributions)",{"entities":[(0,21,"CASE"),(23,32,"CODE")]},1),
    ("IRS Revenue Ruling 70-549, 1970-2 C.B. 103 (1970) (Ruling on tax-exempt status of religious organizations)",{"entities":[(0,3,"LAW"),(4,11,"LAW"),(34,42,"CODE")]},1),
    ("IRC § 501(c)(3) (Provision for tax exemption of religious, charitable, and certain other organizations)",{"entities":[(0,5,"LAW"),(6,15,"CODE")]},1),
    ("Fraternal Order of Eagles v. United States, 15 F.3d 1097 (9th Cir. 1994) (Case on tax-exempt status of fraternal organizations)",{"entities":[(0,42,"CASE"),(44,56,"CODE"),(58,66,"COURT")]},1),
    ("John Doe v. Commissioner, 123 T.C. 456 (2004) (Case on anonymity of donors to religious organizations)",{"entities":[(0,24,"CASE"),(26,38,"CODE")]},1),
    ("St. Paul's Lutheran Church v. Commissioner, 82 T.C. 371 (1984) (Case on tax treatment of church property)",{"entities":[(0,42,"CASE"),(44,55,"CODE")]},1),
    ("IRC § 513 (Definition of unrelated business taxable income for tax-exempt organizations)",{"entities":[(0,5,"LAW"),(6,9,"CODE")]},1),
    ("Treasury Regulation § 1.501(c)(3)-1 (Regulation defining requirements for tax-exempt status under IRC § 501(c)(3))",{"entities":[(0,21,"LAW"),(22,35,"CODE")]},1),
    ("Citizens for the Abatement of Aircraft Noise, Inc. v. County of Orange, 126 F.3d 1294 (9th Cir. 1997) (Case on tax exemption for noise abatement organizations)",{"entities":[(0,70,"CASE"),(72,85,"CODE"),(87,95,"COURT")]},1),
    ("IRC § 508 (Provision for exemption from filing annual information returns for certain organizations)",{"entities":[(0,5,"LAW"),(6,9,"CODE")]},1),
    ("IRS Revenue Procedure 2018-5, 2018-1 I.R.B. 233 (2018) (Procedure for applying for reinstatement of tax-exempt status)",{"entities":[(0,3,"LAW"),(4,11,"LAW"),(37,47,"CODE")]},1),
    ("Revenue Ruling 74-162, 1974-1 C.B. 250 (1974) (Ruling on tax treatment of religious proselytizing literature)",{"entities":[(0,7,"LAW"),(30,38,"CODE")]},1),
    ("Church of Hope v. Internal Revenue Service (D.C. Md. 2023).",{"entities":[(0,42,"CASE"),(44,52,"CODE")]},1)
    # Add more annotated examples based on the provided citations
]


# Split the training data into batches of 3.
# The batches are built once, so batch membership is fixed for the whole run;
# the per-epoch shuffle below only changes the order in which they are visited.
batch_size = 3
batches = [train_data[i:i+batch_size] for i in range(0, len(train_data), batch_size)]

print("For training data set")
print()
# Iterate over each batch and train the model
epoch_num = 5
for epoch in range(epoch_num):  # Number of epochs
    random.shuffle(batches)
    for batch_num, batch in enumerate(batches, 1):
        # Unpack the (text, annotation, is_citation_label) tuples of this batch.
        sentences = [item[0] for item in batch]
        annotations = [item[1] for item in batch]
        is_citation_labels = [item[2] for item in batch]

        # Predict is_citation for each sentence.
        # This runs BEFORE the update below, so the reported scores reflect
        # the model as it was prior to training on this batch.
        predicted_is_citation = [is_citation(sent) for sent in sentences]

        # Calculate F1 score
        precision, recall, f1 = calculate_f1_score(is_citation_labels, predicted_is_citation)

        # Print epoch and batch numbers before printing precision, recall, and F1-score
        print(f"Epoch: {epoch + 1}, Batch: {batch_num}")
        print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

        # Update the model for each example in the batch.
        # NOTE(review): nlp.update is called with a single-example list each
        # time, with no explicit optimizer or losses argument — confirm this
        # per-example update is intended rather than one update per batch.
        for text, annotation in zip(sentences, annotations):
            example = Example.from_dict(nlp.make_doc(text), annotation)
            nlp.update([example])

# Save the fine-tuned model
nlp.to_disk('fine_tuned_model')






# Start from an empty module-level `sentences` list; the training loop above
# rebinds this name to the sentences of its last batch.
sentences = []

file_path = 'testcase.txt'  # Update with your file path
text = read_file(file_path)

# Find citations in the text
citations = find_citations(text)

# Print the identified citations
print()
print("Predicted Citations : ")
for citation in citations:
    print(citation)


file_path1 = "corr_output.txt"

# Read the expected citations, one per line. Blank lines are skipped so that
# an empty or trailing line is not counted as an expected citation, which
# would show up as a spurious false negative in the scores below.
with open(file_path1, "r") as file:
    correct_citations = [line.strip() for line in file if line.strip()]


print()
print("Actual Citations : ")
for cit in correct_citations:
    print(cit)


# Bind the result to `f1` rather than `f1_score` so the `f1_score` function
# imported from sklearn.metrics is not overwritten by a float.
precision, recall, f1 = calculate_f1_score_text(correct_citations, citations)

print()
print("For test data set")
print()
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)










For training data set

Epoch: 1, Batch: 1
Precision: 0, Recall: 0.0, F1 Score: 0
Epoch: 1, Batch: 2
Precision: 0, Recall: 0.0, F1 Score: 0
Epoch: 1, Batch: 3
Precision: 0, Recall: 0.0, F1 Score: 0
Epoch: 1, Batch: 4
Precision: 0, Recall: 0.0, F1 Score: 0
Epoch: 1, Batch: 5
Precision: 0, Recall: 0.0, F1 Score: 0
Epoch: 1, Batch: 6
Precision: 0, Recall: 0.0, F1 Score: 0
Epoch: 1, Batch: 7
Precision: 0, Recall: 0.0, F1 Score: 0
Epoch: 1, Batch: 8
Precision: 0, Recall: 0, F1 Score: 0
Epoch: 2, Batch: 1
Precision: 0, Recall: 0.0, F1 Score: 0
Epoch: 2, Batch: 2
Precision: 0, Recall: 0.0, F1 Score: 0
Epoch: 2, Batch: 3
Precision: 0, Recall: 0.0, F1 Score: 0
Epoch: 2, Batch: 4
Precision: 0, Recall: 0.0, F1 Score: 0
Epoch: 2, Batch: 5
Precision: 0, Recall: 0.0, F1 Score: 0
Epoch: 2, Batch: 6
Precision: 0, Recall: 0.0, F1 Score: 0
Epoch: 2, Batch: 7
Precision: 0, Recall: 0.0, F1 Score: 0
Epoch: 2, Batch: 8
Precision: 0, Recall: 0, F1 Score: 0
Epoch: 3, Batch: 1
Precision: 1.0, Recall: 0.5, F1 Sc