In [1]:
!pip install accelerate
import accelerate
!pip install sentencepiece
import sentencepiece
!pip install transformers
import transformers
!pip install torch
import torch

Collecting accelerate
  Downloading accelerate-0.24.0-py3-none-any.whl (260 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/261.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/261.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from accelerate)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, accelerate
Successfully installed accelerate-0.24.0 huggingface-hub-0.18.0
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [2]:
from transformers import DistilBertTokenizer, DistilBertModel
import re
import dateutil.parser
import spacy

# Sample text
doc = """
    CCTV installation cctv  having Goods and Services from 2023-03-10 to 2023-04-20 cost from $5000 to $7000"""

# Load spaCy's English language model
nlp = spacy.load("en_core_web_sm")

# Load DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Tokenize and analyze the text using DistilBERT
inputs = tokenizer(doc, return_tensors='pt', padding=True, truncation=True, max_length=512)
with torch.no_grad():
    outputs = model(**inputs)

# Get the hidden states from DistilBERT
hidden_states = outputs.last_hidden_state

# Compute the mean of the hidden states for each token to get keyword scores
keyword_scores = hidden_states.mean(dim=1).squeeze(0)

# Get the top N keywords (e.g., top 10)
num_keywords = 10
top_keywords = torch.topk(keyword_scores, num_keywords).indices.tolist()

# Extract keywords from the top keyword indices
keywords = [tokenizer.decode(token) for token in top_keywords]

# Initialize lists to store extracted dates and prices
dates = []
prices = []

# Regular expressions for extracting date patterns (yyyy-mm-dd) and prices
date_pattern = r'\d{4}-\d{2}-\d{2}'
price_pattern = r'\$\d+(?:,\d{3})*(?:\.\d{2})?'  # Matches currency values (e.g., $5,000.00 or $5000)

# Extract dates using dateutil
date_strings = re.findall(date_pattern, doc)
for date_str in date_strings:
    date = dateutil.parser.parse(date_str)
    dates.append(date.strftime('%Y-%m-%d'))

# Extract prices using regular expressions
price_strings = re.findall(price_pattern, doc)
prices = [float(price_str.replace('$', '').replace(',', '')) for price_str in price_strings]

# Convert the text to lowercase for case-insensitive matching
doc_lower = doc.lower()

# Initialize type of work as an empty string
work_type = ""

# Check if "Goods and Services" appears in the user input
if "goods and services" in doc_lower:
    work_type = "Goods and Services"

# Check if "Works" appears in the user input
elif "works" in doc_lower:
    work_type = "Works"

# Print the extracted keywords, dates, prices, and type of work
print("Extracted Keywords:")
for kw in keywords:
    print(kw)

print("\nExtracted Dates:")
for date in dates:
    print(date)

print("\nExtracted Prices:")
for price in prices:
    print(price)

print("\nType of Work:", work_type)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Extracted Keywords:
[ u n u s e d 5 4 ]
[ u n u s e d 2 2 0 ]
[ u n u s e d 6 1 5 ]
[ u n u s e d 5 9 9 ]
[ u n u s e d 4 5 ]
[ u n u s e d 6 6 ]
[ u n u s e d 3 4 7 ]
[ u n u s e d 6 1 6 ]
[ u n u s e d 7 1 0 ]
[ u n u s e d 6 5 1 ]

Extracted Dates:
2023-03-10
2023-04-20

Extracted Prices:
5000.0
7000.0

Type of Work: Goods and Services
