In [2]:
def run_tests(func, test_cases):
    for i, args in enumerate(test_cases):
        result = func(*args)
        print(f"Case {i+1}: {args} -> {result}")

In [3]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Ensure you have downloaded the necessary NLTK data
nltk.download('vader_lexicon')

def sentiment_analysis(text):
    """Analyzes sentiment of text.
    Args:
        text (str): Text to analyze
    Returns:
        float: Compound sentiment score (-1 to 1)
    """
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(text)
    return sentiment['compound']

test_cases = [
    ["I love this product!"],           # -> positive score
    ["This is terrible."],              # -> negative score
    ["The weather is nice today."],     # -> slightly positive
    ["I am so angry right now!"],       # -> negative score
    ["This seems neutral."]             # -> neutral score
]

# Excel usage: =SENTIMENT_ANALYSIS("I love this product!")

run_tests(sentiment_analysis, test_cases)

Case 1: ['I love this product!'] -> 0.6696
Case 2: ['This is terrible.'] -> -0.4767
Case 3: ['The weather is nice today.'] -> 0.4215
Case 4: ['I am so angry right now!'] -> -0.5974
Case 5: ['This seems neutral.'] -> 0.0


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
def classify_intent(text):
    """Classify the intent of the input text using NLTK's NaiveBayesClassifier.
    Args:
        text (str): Text to analyze
    Returns:
        str: Classified intent (capitalized)
    """
    import nltk
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import word_tokenize
    nltk.download('punkt')
    
    def extract_features(text):
        """Extract features from text for classification."""
        words = word_tokenize(text.lower())
        return dict([(word, True) for word in words])
    
    # Training data with labeled intents
    training_data = [
        ("what is the weather like", "question"),
        ("what time is it", "question"),
        ("where are you from", "question"),
        ("who made this", "question"),
        ("please help me", "request"),
        ("could you assist me", "request"),
        ("show me how to", "request"),
        ("i need help with", "request"),
        ("hello there", "greeting"),
        ("hi how are you", "greeting"),
        ("good morning", "greeting"),
        ("nice to meet you", "greeting"),
        ("goodbye for now", "farewell"),
        ("see you later", "farewell"),
        ("i have to go", "farewell"),
        ("thanks for your help", "farewell"),
        ("i like this product", "statement"),
        ("the weather is nice", "statement"),
        ("this works well", "statement"),
        ("interesting idea", "statement")
    ]
    
    # Prepare and train the classifier
    featuresets = [(extract_features(text), intent) for (text, intent) in training_data]
    classifier = NaiveBayesClassifier.train(featuresets)
    
    # Classify the input text
    features = extract_features(text)
    intent = classifier.classify(features)
    return intent.capitalize()

test_cases = [
    ["What's the temperature today?"],
    ["Can you help me find my files?"],
    ["Hi, nice to meet you!"],
    ["Bye, thanks for all your help!"],
    ["The system is working perfectly."],
    ["Where did you put the documents?"],
    ["Please show me the way."]
]

# Excel usage: =CLASSIFY_INTENT("What's the temperature today?")

run_tests(classify_intent, test_cases)

Case 1: ["What's the temperature today?"] -> Question
Case 2: ['Can you help me find my files?'] -> Request
Case 3: ['Hi, nice to meet you!'] -> Greeting
Case 4: ['Bye, thanks for all your help!'] -> Farewell
Case 5: ['The system is working perfectly.'] -> Question
Case 6: ['Where did you put the documents?'] -> Question
Case 7: ['Please show me the way.'] -> Request


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brent\AppData\Roaming\

In [5]:
import nltk
import numpy

# Download all required NLTK data
for package in ['punkt', 'averaged_perceptron_tagger_eng', 'maxent_ne_chunker', 'words']:
    nltk.download(package)

def named_entity_recognition(text):
    """Extract named entities from text.
    Args:
        text (str): Text to analyze
    Returns:
        list: List of named entities found
    """
    # Tokenize and tag the text
    tokens = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    
    # Extract named entities
    named_entities = nltk.ne_chunk(pos_tags)
    entities = []
    
    # Process each chunk
    for chunk in named_entities:
        if hasattr(chunk, 'label'):
            entity = ' '.join(c[0] for c in chunk.leaves())
            entity_type = chunk.label()
            entities.append(f"{entity} ({entity_type})")
    
    return entities if entities else ['No named entities found']

test_cases = [
    ["John works at Microsoft in Seattle."],
    ["The United States and Canada signed a trade agreement."],
    ["Tesla CEO Elon Musk announced new plans."],
    ["Mount Everest is in Nepal."],
    ["Sarah visited Paris last summer."]
]

# Excel usage: =NAMED_ENTITY_RECOGNITION("John works at Microsoft in Seattle.")

run_tests(named_entity_recognition, test_cases)
# Ignore dowload error in VS Code

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


Case 1: ['John works at Microsoft in Seattle.'] -> ['John (PERSON)', 'Microsoft (ORGANIZATION)', 'Seattle (GPE)']
Case 2: ['The United States and Canada signed a trade agreement.'] -> ['United States (GPE)', 'Canada (GPE)']
Case 3: ['Tesla CEO Elon Musk announced new plans.'] -> ['Tesla (PERSON)', 'CEO Elon Musk (ORGANIZATION)']
Case 4: ['Mount Everest is in Nepal.'] -> ['Mount (PERSON)', 'Everest (ORGANIZATION)', 'Nepal (GPE)']
Case 5: ['Sarah visited Paris last summer.'] -> ['Sarah (PERSON)', 'Paris (GPE)']


In [6]:
def split_into_words(text):
    """Tokenize the given text into words.
    Args:
        text (str): Input text to tokenize
    Returns:
        list: List of words
    """
    from nltk.tokenize import word_tokenize
    
    return word_tokenize(text)

test_cases = [
    ["Natural language processing helps computers understand and work with human language."],
    ["The researchers are developing better models. They're making progress daily."]
]

# Excel usage: =SPLIT_INTO_WORDS("Natural language processing helps computers understand and work with human language.")

run_tests(split_into_words, test_cases)

Case 1: ['Natural language processing helps computers understand and work with human language.'] -> ['Natural', 'language', 'processing', 'helps', 'computers', 'understand', 'and', 'work', 'with', 'human', 'language', '.']
Case 2: ["The researchers are developing better models. They're making progress daily."] -> ['The', 'researchers', 'are', 'developing', 'better', 'models', '.', 'They', "'re", 'making', 'progress', 'daily', '.']


In [7]:
def stem_words(text):
    """Perform stemming on text.
    Args:
        text (str): Input text
    Returns:
        list: List of stemmed words
    """
    from nltk.tokenize import word_tokenize
    from nltk.stem import PorterStemmer
    import nltk
    nltk.download('punkt')
    
    words = word_tokenize(text)
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in words]

test_cases = [
    ["The researchers are developing better models. They're making progress daily."]  
    # -> ['the', 'research', 'are', 'develop', 'better', 'model', '.', 'they', "'re", 'make', 'progress', 'daili', '.']
]

# Excel usage: =STEM_WORDS("The researchers are developing better models. They're making progress daily.")

run_tests(stem_words, test_cases)

Case 1: ["The researchers are developing better models. They're making progress daily."] -> ['the', 'research', 'are', 'develop', 'better', 'model', '.', 'they', "'re", 'make', 'progress', 'daili', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
def analyze_pos(text):
    """Tag parts of speech in text.
    Args:
        text (str): Input text
    Returns:
        list: List of strings in 'word:POS' format
    """
    from nltk import pos_tag, word_tokenize
    import nltk
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    return [f"{word}:{pos}" for word, pos in pos_tags]

test_cases = [
    ["The quick brown fox jumps over the lazy dog."],  
    # -> ['The:DT', 'quick:JJ', 'brown:JJ', 'fox:NN', 'jumps:VBZ', 'over:IN', 'the:DT', 'lazy:JJ', 'dog:NN', '.:.',]
    ["I am running!"],
    # -> ['I:PRP', 'am:VBP', 'running:VBG', '!:.']
    ["Python is great."]
    # -> ['Python:NNP', 'is:VBZ', 'great:JJ', '.:.']
]

# Excel usage: =ANALYZE_POS("The quick brown fox jumps over the lazy dog.")

run_tests(analyze_pos, test_cases)

Case 1: ['The quick brown fox jumps over the lazy dog.'] -> ['The:DT', 'quick:JJ', 'brown:NN', 'fox:NN', 'jumps:VBZ', 'over:IN', 'the:DT', 'lazy:JJ', 'dog:NN', '.:.']
Case 2: ['I am running!'] -> ['I:PRP', 'am:VBP', 'running:VBG', '!:.']
Case 3: ['Python is great.'] -> ['Python:NNP', 'is:VBZ', 'great:JJ', '.:.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\brent\AppData\Roaming\nltk_data

In [9]:
import pandas as pd
from nltk.metrics.distance import edit_distance, jaccard_distance, jaro_similarity
from nltk.util import ngrams

def text_similarity(lookup_value, lookup_array_df, algorithm):
    """
    Calculate the similarity between a lookup_value and the best match in a lookup_array.

    Parameters:
    lookup_value (str or pd.DataFrame): The string or DataFrame to search for.
    lookup_array_df (pd.DataFrame): The DataFrame to search within.
    algorithm (str): The algorithm to use for calculating similarity. Options are 'levenshtein', 'jaccard', and 'jaro'. Default is 'jaccard'.

    Returns:
    list: A list of lists where each sublist contains the index (1-based) and the similarity score of the most similar item in the lookup_array.
    """
    # Define a dictionary to map algorithm names to functions
    algo_funcs = {
        'levenshtein': lambda x, y: 1 - edit_distance(x, y) / max(len(x), len(y)),
        'jaccard': lambda x, y: 1 - jaccard_distance(set(ngrams(x, 2)), set(ngrams(y, 2))),
        'jaro': jaro_similarity
    }
    
    # Get the algorithm function from the dictionary
    algo_func = algo_funcs.get(algorithm)
    if algo_func is None:
        raise ValueError(f"Unsupported algorithm: {algorithm}")
    
    # Flatten the DataFrame to a list
    lookup_array = lookup_array_df.values.flatten().tolist()
    
    # Check if lookup_value is a DataFrame
    if isinstance(lookup_value, pd.DataFrame):
        lookup_value_list = lookup_value.values.flatten().tolist()
    else:
        lookup_value_list = [lookup_value]
    
    results = [] 
    for lookup_value_item in lookup_value_list:
        # Calculate similarity scores and round to 2 decimal places
        scores = [(index + 1, round(algo_func(lookup_value_item, item), 2)) for index, item in enumerate(lookup_array)]
        
        # Sort based on scores in descending order
        scores.sort(key=lambda x: x[1], reverse=True)
        # Append the top index and score to results as a list
        results.append(list(scores[0]))

    # results is 2D list, e.g. [[1, 0.75], [2, 0.85]]
    return results

test_cases = [
    ["hello", pd.DataFrame(["hello world", "hi there", "greetings"]), "levenshtein"],  # -> [1, 0.7]
    ["programing", pd.DataFrame(["programming", "coding", "development"]), "levenshtein"],  # -> [1, 0.9]
    ["python", pd.DataFrame(["python3", "javascript", "java"]), "jaccard"],  # -> [1, 0.67]
    ["artificial intelligence", pd.DataFrame(["machine learning", "artificial intel", "AI"]), "jaro"],  # -> [2, 0.85]
    ["data science", pd.DataFrame(["data analysis", "data scientist", "statistics"]), "jaccard"]  # -> [1, 0.6]
]

# Excel usage: =TEXT_SIMILARITY("hello", {"hello world", "hi there", "greetings"}, "levenshtein")

run_tests(text_similarity, test_cases)

Case 1: ['hello',              0
0  hello world
1     hi there
2    greetings, 'levenshtein'] -> [[1, 0.45]]
Case 2: ['programing',              0
0  programming
1       coding
2  development, 'levenshtein'] -> [[1, 0.91]]
Case 3: ['python',             0
0     python3
1  javascript
2        java, 'jaccard'] -> [[1, 0.83]]
Case 4: ['artificial intelligence',                   0
0  machine learning
1  artificial intel
2                AI, 'jaro'] -> [[2, 0.9]]
Case 5: ['data science',                 0
0   data analysis
1  data scientist
2      statistics, 'jaccard'] -> [[2, 0.6]]
