**HAVING KEYBERT WITH REGULAR EXTRACTION**


For 'First API' extracting following - query, startDate, endDate, Range, typeOfWork

In [1]:
pip install keybert



In [4]:
from keybert import KeyBERT
import re
import dateutil.parser
import spacy
import json

# Sample text
doc = """CCTV installation having Goods and Services from 2023-03-10 to 2023-04-20 cost from $5000 to $7000 """

# Load spaCy's English language model
nlp = spacy.load("en_core_web_sm")

# Initialize KeyBERT model
kw_model = KeyBERT()

# Extract keywords
keywords = kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words=None)

# Initialize lists to store extracted dates and prices
dates = []
prices = []

# Regular expressions for extracting date patterns (yyyy-mm-dd), codes, and prices
date_pattern = r'\d{4}-\d{2}-\d{2}'
price_pattern = r'\$\d+(?:,\d{3})*(?:\.\d{2})?'  # Matches currency values (e.g., $5,000.00 or $5000)

# Filter keywords based on length and exclusion patterns (dates and prices)
filtered_keywords = [keyword for keyword, _ in keywords if len(keyword.split()) > 1 and not (re.search(date_pattern, keyword) or re.search(price_pattern, keyword))]

# Extract dates using dateutil
date_strings = re.findall(date_pattern, doc)

# Assuming the first date in the list is the start date and the second date is the end date
if len(date_strings) >= 2:
    start_date = dateutil.parser.parse(date_strings[0]).strftime('%Y-%m-%d')
    end_date = dateutil.parser.parse(date_strings[1]).strftime('%Y-%m-%d')

# Extract prices using regular expressions
price_strings = re.findall(price_pattern, doc)
# Remove '$' from prices and store them in the 'prices' list
for price_str in price_strings:
    price = int(price_str.replace('$', '').replace(',', ''))
    prices.append(price)

# Convert the text to lowercase for case-insensitive matching
doc_lower = doc.lower()

# Initialize type of work as an empty string
work_type = ""

# Check if "Goods and Services" appears in the user input
if "goods and services" in doc_lower:
    work_type = "Goods and Services"

# Check if "Works" appears in the user input
elif "works" in doc_lower:
    work_type = "Works"

# Create a dictionary to store the extracted information
result_dict = {
    "query": filtered_keywords,
    "startDate": start_date,
    "endDate": end_date,
    "Range": prices,
    "typeOfWork": work_type
}

# Convert the dictionary to a JSON string
result_json = json.dumps(result_dict, indent=4)

# Print the JSON string
print(result_json)

{
    "query": [
        "cctv installation",
        "cost from",
        "20 cost"
    ],
    "startDate": "2023-03-10",
    "endDate": "2023-04-20",
    "Range": [
        5000,
        7000
    ],
    "typeOfWork": "Goods and Services"
}
