In [0]:
# Load the required libraries
import os

import dataiku
import pandas as pd
import torch
from dataiku import pandasutils as pdu
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Show full column contents instead of truncating long strings (e.g. tweet text).
# This must come after `import pandas as pd`; calling it first raises NameError
# on a fresh kernel.
pd.set_option('display.max_colwidth', None)


In [0]:
# Open the "enhanced_tweets_informations" Dataiku dataset and read it into `df`.
enhanced_tweets_informations = dataiku.Dataset("enhanced_tweets_informations")
df = enhanced_tweets_informations.get_dataframe()


In [0]:
# Directory taken from the HF_HOME environment variable (None when it is unset).
hf_transformers_home_dir = os.environ.get("HF_HOME")


In [0]:
# Show the value read from HF_HOME (prints "None" if the variable is not set).
print(hf_transformers_home_dir)

In [0]:

# Read recipe inputs.
# NOTE: this repeats the dataset load done in an earlier cell, so `df` is rebuilt here.
enhanced_tweets_informations = dataiku.Dataset("enhanced_tweets_informations")
df = enhanced_tweets_informations.get_dataframe()


# Compute recipe outputs from inputs
# Name of the Hugging Face model used for NER; passed to perform_ner_inference below.
model_name = "dslim/bert-base-NER"

def perform_ner_inference(model_name, input_df):
    """
    Run named-entity recognition over the "text" column of a dataframe using a Hugging Face model.

    :param model_name: The name of the Hugging Face model to use for NER.
    :param input_df: The input dataframe; it must contain a "text" column. It is not modified.
    :return: pd.DataFrame. A copy of ``input_df`` with an added "predicted_labels" column that
        holds, for each row, the output of the token-classification pipeline for that row's text
        (an empty list when the text is missing, not a string, or blank).
    :raises KeyError: If ``input_df`` has no "text" column.
    """
    if "text" not in input_df.columns:
        raise KeyError('input_df must contain a "text" column')

    # Load the pre-trained tokenizer and model; downloads are cached under the
    # module-level hf_transformers_home_dir.
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=hf_transformers_home_dir)
    model = AutoModelForTokenClassification.from_pretrained(model_name, cache_dir=hf_transformers_home_dir)

    # Load the token classification pipeline
    token_classification_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="first") # pass device=0 if using gpu

    # Perform token classification on each row of the dataframe that was passed in
    # (not on a notebook-level global), so the function works for any caller.
    predicted_labels = []
    for text in input_df["text"]:
        if isinstance(text, str) and text.strip():
            predicted_labels.append(token_classification_pipeline(text))
        else:
            # Missing (NaN/None) or blank text: nothing to tag, and the pipeline
            # would fail on non-string input.
            predicted_labels.append([])

    # Attach the results to a copy so the caller's dataframe is left untouched.
    scored_df = input_df.copy()
    scored_df["predicted_labels"] = predicted_labels

    return scored_df

# Score the loaded tweets with the selected model, then preview the first rows.
document_scored_df = perform_ner_inference(model_name=model_name, input_df=df)

document_scored_df.head()

In [0]:

tweets_NER_with_Python_code_df = document_scored_df # The output is the NER-scored dataframe itself (same object, no copy)


# Write recipe outputs: store the scored dataframe in the "tweets_NER_with_Python_code" dataset.
tweets_NER_with_Python_code = dataiku.Dataset("tweets_NER_with_Python_code")
tweets_NER_with_Python_code.write_with_schema(tweets_NER_with_Python_code_df)


## Too long to load

In [0]:
import nltk
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.chunk import conlltags2tree, tree2conlltags

# Each NLTK call used by extract_entities needs its own data package; without
# them a clean environment raises LookupError. The "_tab"/"_eng" names match the
# resource naming already used here for the chunker.
nltk.download('punkt_tab')                       # word_tokenize
nltk.download('averaged_perceptron_tagger_eng')  # pos_tag
nltk.download('maxent_ne_chunker_tab')           # ne_chunk
nltk.download('words')                           # ne_chunk

# Function to extract named entities from text
def extract_entities(text):
    """
    Tokenize ``text``, POS-tag the tokens, and run NLTK's named-entity chunker on them.

    :param text: The string to analyse.
    :return: The result of ``ne_chunk`` applied to the POS-tagged tokens of ``text``.
    """
    return ne_chunk(pos_tag(word_tokenize(text)))

# Keep a reproducible sample of at most 10 rows. Capping n at len(df) avoids the
# ValueError that sample() raises when asked for more rows than the frame has.
df = df.sample(n=min(10, len(df)), random_state=42)

# Apply the function to the text column in your DataFrame
df['named_entities'] = df['text'].apply(extract_entities)


In [0]:
# Preview the first rows of the sampled frame, including the new "named_entities" column.
df.head()