In [1]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')

# Example data
biography = """
John Smith is a professor of Computer Science at XYZ University. He received his Ph.D. in Computer Science from ABC University in 2005. Prior to joining XYZ University, he worked as a research scientist at DEF Labs. His research interests include machine learning, natural language processing, and data mining. Professor Smith is a member of the Association for Computing Machinery (ACM) and has received several awards for his contributions to the field.
"""

In [3]:
# Define the categories you want to extract
categories = ['education', 'professional', 'interests', 'affiliation', 'awards']

# Train a text classifier
training_data = [
    ("John Smith received his Ph.D. in Computer Science from ABC University in 2005.", "education"),
    ("Prior to joining XYZ University, he worked as a research scientist at DEF Labs.", "professional"),
    ("His research interests include machine learning, natural language processing, and data mining.", "interests"),
    ("Professor Smith is a member of the Association for Computing Machinery (ACM).", "affiliation"),
    ("He has received several awards for his contributions to the field.", "awards")
]

In [4]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform([text for text, _ in training_data])
y_train = [label for _, label in training_data]

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Process the biography and extract information
doc = nlp(biography)
sentences = [sent.text for sent in doc.sents]

for sentence in sentences:
    X_test = vectorizer.transform([sentence])
    predicted_category = classifier.predict(X_test)[0]
    if predicted_category in categories:
        entities = [(ent.text, ent.label_) for ent in nlp(sentence).ents]
        print("Category:", predicted_category)
        print("Entities:", entities)
        print()

Category: affiliation
Entities: [('John Smith', 'PERSON'), ('Computer Science', 'ORG'), ('XYZ University', 'ORG')]

Category: education
Entities: [('Ph.D. in Computer Science', 'WORK_OF_ART'), ('ABC University', 'ORG'), ('2005', 'DATE')]

Category: professional
Entities: [('XYZ University', 'ORG'), ('DEF Labs', 'ORG')]

Category: interests
Entities: []

Category: affiliation
Entities: [('Smith', 'PERSON'), ('the Association for Computing Machinery', 'ORG'), ('ACM', 'ORG')]



In [5]:
from nltk.tokenize import sent_tokenize

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Example data
biography = """
 He received his Ph.D. in Computer Science from ABC University in 2005. Prior to joining XYZ University, he worked as a research scientist at DEF Labs. His research interests include machine learning, natural language processing, and data mining. John Smith is a professor of Computer Science at XYZ University.  Professor Smith is a member of the Association for Computing Machinery (ACM) and has received several awards for his contributions to the field.
"""

# Define the categories you want to extract
categories = ['education', 'professional', 'interests', 'affiliation', 'awards']

# Train a text classifier
training_data = [
    ("John Smith received his Ph.D. in Computer Science from ABC University in 2005.", "education"),
    ("Prior to joining XYZ University, he worked as a research scientist at DEF Labs.", "professional"),
    ("His research interests include machine learning, natural language processing, and data mining.", "interests"),
    ("Professor Smith is a member of the Association for Computing Machinery (ACM).", "affiliation"),
    ("He has received several awards for his contributions to the field.", "awards")
]

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform([text for text, _ in training_data])
y_train = [label for _, label in training_data]

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Process the biography and classify each sentence
sentences = [sentence.strip() for sentence in biography.split('.') if sentence.strip()]
#sentences = sent_tokenize(biography)


for sentence in sentences:
    X_test = vectorizer.transform([sentence])
    predicted_category = classifier.predict(X_test)[0]
    if predicted_category in categories:
        print("Category:", predicted_category)
        print("Sentence:", sentence)
        print()


Category: education
Sentence: He received his Ph

Category: interests
Sentence: D

Category: education
Sentence: in Computer Science from ABC University in 2005

Category: professional
Sentence: Prior to joining XYZ University, he worked as a research scientist at DEF Labs

Category: interests
Sentence: His research interests include machine learning, natural language processing, and data mining

Category: affiliation
Sentence: John Smith is a professor of Computer Science at XYZ University

Category: affiliation
Sentence: Professor Smith is a member of the Association for Computing Machinery (ACM) and has received several awards for his contributions to the field



In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Example data
biography = """
 John Smith is a professor of Computer Science at XYZ University. He received his Ph.D. in Computer Science from ABC University in 2005. Prior to joining XYZ University, he worked as a research scientist at DEF Labs. His research interests include machine learning, natural language processing, and data mining. Professor Smith is a member of the Association for Computing Machinery (ACM) and has received several awards for his contributions to the field.
"""

# Define the categories you want to extract
categories = ['education', 'professional', 'interests', 'affiliation', 'awards']

# Train a text classifier
training_data = [
    ("John Smith received his Ph.D. in Computer Science from ABC University in 2005.", "education"),
    ("Prior to joining XYZ University, he worked as a research scientist at DEF Labs.", "professional"),
    ("His research interests include machine learning, natural language processing, and data mining.", "interests"),
    ("Professor Smith is a member of the Association for Computing Machinery (ACM).", "affiliation"),
    ("He has received several awards for his contributions to the field.", "awards")
]

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform([text for text, _ in training_data])
y_train = [label for _, label in training_data]

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Process the biography and classify each sentence
sentences = [sentence.strip() for sentence in biography.split('.') if sentence.strip()]

for sentence in sentences:
    X_test = vectorizer.transform([sentence])
    predicted_category = classifier.predict(X_test)[0]
    if predicted_category in categories:
        print("Category:", predicted_category)
        print("Sentence:", sentence)
        print()


Category: affiliation
Sentence: John Smith is a professor of Computer Science at XYZ University

Category: education
Sentence: He received his Ph

Category: interests
Sentence: D

Category: education
Sentence: in Computer Science from ABC University in 2005

Category: professional
Sentence: Prior to joining XYZ University, he worked as a research scientist at DEF Labs

Category: interests
Sentence: His research interests include machine learning, natural language processing, and data mining

Category: affiliation
Sentence: Professor Smith is a member of the Association for Computing Machinery (ACM) and has received several awards for his contributions to the field

