In [12]:
import spacy
from spacy import displacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Load spaCy model
nlp_model = spacy.load("en_core_web_sm")

def process_text(input_text):
    doc = nlp_model(input_text)
    refined_tokens = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(refined_tokens), doc

# Example text
sample_text = "Rahul wakes up early every day. He goes to college in the morning and comes back at 3 pm. At present, Rahul is outside. He has to buy the snacks for all of us."

# Process text
processed_text, doc_structure = process_text(sample_text)
print("Processed Text:", processed_text)

def extract_tasks(document):
    extracted_tasks = []
    for sentence in document.sents:
        for word in sentence:
            if word.pos_ == "VERB" and word.lemma_ in ["buy", "clean", "review", "schedule"]:
                task_details = {"action": word.lemma_, "assigned_to": None, "due_date": None}
                
                for dependency in word.children:
                    if dependency.dep_ == "nsubj":
                        task_details["assigned_to"] = dependency.text
                    elif dependency.dep_ == "prep" and dependency.text == "by":
                        task_details["due_date"] = next(dependency.rights, None).text if next(dependency.rights, None) else None
                
                extracted_tasks.append(task_details)
    return extracted_tasks

# Extract tasks
tasks_identified = extract_tasks(doc_structure)
print("Extracted Tasks:", tasks_identified)

def classify_tasks(task_list):
    vectorizer = CountVectorizer()
    text_matrix = vectorizer.fit_transform(task_list)
    lda_model = LatentDirichletAllocation(n_components=3, random_state=42)
    lda_model.fit(text_matrix)
    return lda_model.transform(text_matrix)

# Extract task actions
task_descriptions = [task["action"] for task in tasks_identified]

# Classify tasks
task_categories = classify_tasks(task_descriptions)
print("Task Categories:", task_categories)

def format_output(tasks, categories):
    structured_output = []
    for task_item, category_prob in zip(tasks, categories):
        structured_output.append({
            "task": task_item["action"],
            "assigned_to": task_item["assigned_to"],
            "due_date": task_item["due_date"],
            "category": category_prob.argmax()
        })
    return structured_output

# Generate structured output
final_output = format_output(tasks_identified, task_categories)
print("Final Structured Output:", final_output)

# Test with another example
test_input = "John needs to clean the room by 5 pm today. Sarah has to review the report by tomorrow."
processed_text, doc_structure = process_text(test_input)
tasks_identified = extract_tasks(doc_structure)
task_descriptions = [task["action"] for task in tasks_identified]
task_categories = classify_tasks(task_descriptions)
final_output = format_output(tasks_identified, task_categories)
print("Test Case Output:", final_output)


Processed Text: rahul wakes early day goes college morning comes 3 pm present rahul outside buy snacks
Extracted Tasks: [{'action': 'buy', 'assigned_to': None, 'due_date': None}]
Task Categories: [[0.33333333 0.33333333 0.33333333]]
Final Structured Output: [{'task': 'buy', 'assigned_to': None, 'due_date': None, 'category': 0}]
Test Case Output: [{'task': 'clean', 'assigned_to': None, 'due_date': 'pm', 'category': 2}, {'task': 'review', 'assigned_to': None, 'due_date': 'tomorrow', 'category': 1}]


In [9]:
!python -m spacy download en_core_web_sm


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - ------------------------------------- 0.5/12.8 MB 541.6 kB/s eta 0:00:23
     - ------------------------------------- 0.5/12.8 MB 541.6 kB/s eta 0:00:23
     -- ------------------------------------ 0.8/12.8 MB 559.5 kB/s eta 0:00:22
     -- ------------------------------------ 0.8/12.8 MB 559.5 kB/s eta 0:00:22
     -- ------------------------------------ 0.8/12.8 MB 559.5 kB/s eta

In [7]:
pip install seaborn


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pandas>=1.2 (from seaborn)
  Downloading pandas-2.2.3-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas>=1.2->seaborn)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas>=1.2->seaborn)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Downloading pandas-2.2.3-cp310-cp310-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
    --------------------------------------- 0.3/11.6 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.6 MB 985.5 kB/s eta 0:00:12
   -- ------------------------------------- 0.8/11.6 MB 1.1 MB/s eta 0:00:11
   --- ------------

In [2]:
pip install spacy


Note: you may need to restart the kernel to use updated packages.Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting spacy
  Downloading spacy-3.8.4-cp310-cp310-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.12-cp310-cp310-win_amd64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp310-cp310-win_amd64.whl.metadata (8.8 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp310-cp310-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.4-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading 

In [5]:
pip install nltk


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp310-cp310-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 985.5 kB/s eta 0:00:01
   -------------------- ------------------- 0.8/1.5 MB 1.0 MB/s eta 0:00:01
   -------------------- ------------------- 0.8/1.5 MB 1.0 MB/s eta 0:00:01
   --------------------------- ------------ 1.0/1.5 MB 967.3 kB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.5 MB 944.7 kB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 904.2 kB/s eta 0:00:00
Downloading regex-2