Importing required libraries and preprocessing the text

In [27]:
import re
import spacy
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim import corpora, models
from gensim.models import LdaModel
from nltk.tokenize import sent_tokenize




# Download necessary NLP datasets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Load NLP models
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

# Preprocessing Function
def preprocess_text(text):
    """Cleans text by removing punctuation, tokenizing sentences, and removing stopwords."""
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Removing punctuation
    sentences = sent_tokenize(text)  # Tokenizing sentences
    clean_sentences = []

    for sent in sentences:
        tokens = [word.lower() for word in sent.split() if word.lower() not in stop_words]
        clean_sentences.append(" ".join(tokens))  # Fix: Missing space in join()

    return clean_sentences




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\anush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Task Indentification

In [53]:
def identical_task(sentences):
    """Identifies sentences containing action-related keywords."""
    task_keywords = {"must", "do", "have to", "need to", "needs to", "should", "required to", 
                     "scheduled to", "is due", "due", "complete", "submit", "finish", "pay"}
    
    identified_tasks = []
    
    for sent in sentences:
        doc = nlp(sent)
        verbs = {token.lemma_ for token in doc if token.pos_ == "VERB"}
        
        # If sentence has an action keyword or verb, consider it a task
        if any(kw in sent for kw in task_keywords) or verbs:
            identified_tasks.append(sent)
    
    return identified_tasks

Finding assigned person and their deadline 

In [None]:
def extractEntities(task_sentences):
    """Extracts person responsible and deadline from sentences."""
    extracted_data = []

    for sent in task_sentences:
        doc = nlp(sent)
        person = None
        deadline = None

        for ent in doc.ents:
            if ent.label_ == "PERSON":
                person = ent.text
            if ent.label_ in ["DATE", "TIME"]:
                deadline = ent.text

        extracted_data.append({"Task": sent, "Assigned to": person, "Deadline": deadline})

    return extracted_data

Categorizing dynamically using topic modeling (LDA)

In [None]:
def categorizeTask(task_sentences):
    """Uses LDA topic modeling to categorize tasks."""
    tokenized_tasks = [sent.split() for sent in task_sentences]
    dictionary = corpora.Dictionary(tokenized_tasks)
    corpus = [dictionary.doc2bow(text) for text in tokenized_tasks]

    # Train LDA model
    lda_model = LdaModel(corpus, num_topics=3, id2word=dictionary, passes=10)

    # Assign topics to tasks
    categorized_tasks = []
    for i, task in enumerate(task_sentences):
        topic = max(lda_model[corpus[i]], key=lambda x: x[1])[0]
        categorized_tasks.append({"Task": task, "Category": f"Category {topic}"})

    return categorized_tasks

Running  the Pipeline

In [None]:
def extractAndCategorizeTask(text):
    """Pipeline: preprocess -> extract tasks -> extract entities -> categorize tasks."""
    preprocessed_sentences = preprocess_text(text)
    task_sentences = identical_task(preprocessed_sentences)
    extracted_entities = extractEntities(task_sentences)
    categorized_tasks = categorizeTask(task_sentences)

    # Merge extracted tasks with categories
    final_output = []
    for i, task in enumerate(extracted_entities):
        task["Category"] = categorized_tasks[i]["Category"]
        final_output.append(task)

    return final_output

In [61]:
input_text = """
 "Rahul wakes early every day, goes to college in the morning, and comes back at 3 pm.",
    "At present, Rahul is outside buying snacks for us.",
    "John must submit the assignment by tomorrow evening.",
    "Sarah needs to pay rent by the 1st of next month."
"""

# Run Function
tasks = extractAndCategorizeTask(input_text)
print(tasks)

[{'Task': 'rahul wakes early every day goes college morning comes back 3 pm present rahul outside buying snacks us john must submit assignment tomorrow evening sarah needs pay rent 1st next month', 'Assigned to': 'sarah', 'Deadline': 'early every day', 'Category': 'Category 1'}]
