In [1]:
# Imoport all used libraries.
import pandas as pd
import json
import re
import random
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
import time

In [2]:
# Download nltk packages for words in english version.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("averaged_perceptron_tagger")
nltk.download("averaged_perceptron_tagger_eng")

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data] 

True

In [3]:
# Create function to open json intents file. 
def read_json(filepath):
    '''
    Function to open a file using "open" built in function. 
    It loads the file from its filepath with "raw" (r) path.
    '''
    with open(filepath, "r") as f:
        data = json.load(f)

    df_intents = pd.DataFrame(data["intents"])
    df_catalog = pd.DataFrame(data["netflix_catalog"])

    return df_intents, df_catalog

# Apply function.
df_intents, df_catalog = read_json("intents.json")

# Transform the df into a list of dictionaries.
# Orient as records determines the type of the values of the dictionary, where list like [{column -> value}, â€¦ , {column -> value}].
# I could access each dicionary use indexes ([0], etc.).
list_dict_intents = df_intents.to_dict(orient="records")

Now, create two functions, the first one where the dictionary from json is collected and returns a new dictionary with tag and regex patterns.

The second one, the same however the regex.

In [4]:
# DATA PROCESSING TECHNIQUE: transform patterns into dictionary.
def build_regex2int(list_dict_intents):
    '''
    Function to create a dictionary with key value pairs, where the key is the patterns from the JSON with regex (as string), and value the tag.
    '''

    # Create an empty dictionary.
    regex2int = {}

    # For each dictionary in list_dict_intents, get the values from tag and pattern keys.
    for intent in list_dict_intents:
        tag = intent["tag"] # This output is list with a single string value.
        patterns = intent["patterns"] # This output is a list of strings. E.g. ['Hi', 'How are you?', 'Hello', 'Good day', 'Whats up']

        # Create regex as a single pattern.
        # r is the raw string (avoiding problems with "\" nor "/" etc.)
        # \b is the word boundary.
        # " + "|" = space plus space and "|"
        # Patterns is a list of strings, re.scape receives a string and returns a string. It adds "\" before special symbols to be treated as regular text. Sp
        # In Python, "\\" represents a single "\", there fore
        # Map apply one function in each element from a sequence. Regex treats spaces as a literal character. Therefore before space, it will add "\\".
        regex_pattern = r"\b(" + "|".join(map(re.escape, patterns)) + r")\b"

        # From the dicionary created in the beginning of the function, create a key with the regex output and the value the tag.
        regex2int[regex_pattern] = tag

    return regex2int

# DATA PROCESSING TECHNIQUE: transform patterns into dictionary.
def build_int2res(list_dict_intents):
    '''
    Function to create a dictionary with tag as key and responses as value.
    '''

    # Create an empty dictionary.
    int2res = {}

    # For each dictionary in list_dict_intents, get the values from tag and pattern keys.
    for intent in list_dict_intents:

        # Assign a variable with the tag (str).
        tag = intent["tag"]

        # Assign a variable with the responses (str).
        responses = intent["responses"]

        # Add in the dictionary created in the beginning of the function the tag as key and responses as value.
        int2res[tag] = responses

    return int2res

In [5]:
# Create stopwords and removing exceptions.
exceptions = {"after", "before"}
stop_words = set(stopwords.words("english")) - exceptions

In [6]:
lemmatizer = WordNetLemmatizer()
preprocess = WordNetLemmatizer()

# DATA PROCESSING TECHNIQUE: tokenize data using NLTK.
def tokenize_text(text):
    tokens = re.findall(r"\b\w+\b", text.lower())
    return tokens

# DATA PROCESSING TECHNIQUE: stopwords removal using NLTK.
def stopwords_removal(tokens):
    return [t for t in tokens if t not in stop_words]

# DATA PROCESSING TECHNIQUE: lemmatization using NLTK.
def lemmatize_text(tokens):
    cleaned_tokens = []

    for word, tag in pos_tag(tokens):

        # map POS tag
        if tag.startswith("J"):
            wn_tag = wordnet.ADJ
        elif tag.startswith("V"):
            wn_tag = wordnet.VERB
        elif tag.startswith("N"):
            wn_tag = wordnet.NOUN
        elif tag.startswith("R"):
            wn_tag = wordnet.ADV
        else:
            wn_tag = wordnet.NOUN

        # lemmatize with correct POS
        lemma = lemmatizer.lemmatize(word, pos=wn_tag)
        cleaned_tokens.append(lemma)

    return cleaned_tokens

# DATA PROCESSING TECHNIQUE: orchenstration using NLTK.
def preprocess_text(text):
    # Apply tokenization.
    tokens = tokenize_text(text)

    # Apply lemmatization.
    tokens_lemmatized = lemmatize_text(tokens)

    # Apply remove stopwords
    tokens_removed_stopwords = stopwords_removal(tokens_lemmatized)

    # Return
    return " ".join(tokens_removed_stopwords)

In [7]:
# Apply both functions to the dictionary from JSON.
regex2int = build_regex2int(list_dict_intents)
int2res = build_int2res(list_dict_intents)

# Print to checkoutput.
print(f"regex2int (New dictionary with regex applied in patterns) - {regex2int}\n")
print(f"regex2int (New dictionary without regex applied in patterns) - {int2res}")

regex2int (New dictionary with regex applied in patterns) - {'\\b(Hi|How\\ are\\ you\\?|Hello|Good\\ day|Whats\\ up)\\b': 'greeting', '\\b(movie|tv\\ show)\\b': 'inform_content_type', '\\b(hours|seconds|minutes|seasons)\\b': 'duration', '\\b(country)\\b': 'country', '\\b(cya|see\\ you|bye\\ bye|See\\ you\\ later|Goodbye)\\b': 'goodbye'}

regex2int (New dictionary without regex applied in patterns) - {'greeting': ["Hello again! My name is Muvie and I'm here to help you select a movie or TV show. \nFirst, tell me if you want to watch a movie or a TV show."], 'inform_content_type': ['Great. You chose content_type. What is the max duration_or_seasons you expect to watch? \nREMEMBER, if movie, tell me if you are talking about hours or minutes!'], 'duration': ["Great. I'll search for a program up to duration_selected. \nNow, tell me if you want this tittle to be released after or before a specific year. \nPlease type: after <year> or before <year>"], 'country': ["Excellent. I'll search for a

In [8]:
# Function to select movie in final state.
def select_movie(df, user_state):
    '''
    Function to apply proper filters into the dataset according to user output.
    '''

    # Import CSV netflix titles using Pandas.
    df = df.copy()

    # Perform data engineering to properly define data types.
    df["type"] = df["type"].astype(str)
    df["country"] = df["country"].astype(str)
    df["duration"] = df["duration"].astype(str)
    df["release_year"] = pd.to_numeric(df["release_year"], errors="coerce")

    # Drop nulls from release year, since it's not possible to have nulls in the dataset for proper filtering.
    df = df.dropna(subset=["release_year"])

    # If conditions according to user ouptu.
    # Condition movie or tv show.
    if user_state["content_type"] == "movie":
        df = df[df["type"] == "Movie"]
    elif user_state["content_type"] == "tv show":
        df = df[df["type"] == "TV Show"]

    # Condition before or after the selected year.
    if user_state["after_before_year"] == "after":
        df = df[df["release_year"] > user_state["year"]]
    elif user_state["after_before_year"] == "before":
        df = df[df["release_year"] < user_state["year"]]

    # Condition to select the country.
    if user_state["country"] is not None:
        df = df[df["country"].fillna("").str.lower().str.contains(user_state["country"].lower())]

    # Condition to search engine of duration. Collect duration and transform to minutes. Only applicable for movie.
    if user_state["content_type"] == "movie" and user_state["duration"] is not None:

        duration_value = int(user_state["duration"].split()[0])
        duration_minutes = duration_value * 60

        df["duration_minutes"] = (
            df["duration"]
            .str.extract(r"(\d+)")
            .astype(float)
        )

        # Filter engine.
        df = df[df["duration_minutes"] <= duration_minutes]

    # Return dataframe.
    return df

In [None]:

#### Function to get user input and respond.
def chatbot():
    '''
    Main function for the chatbot. 
    '''

    # User welcome and termination instruction.
    print("Chatbot: Welcome!")
    print("Chatbot: My name is Muvie and I'm here to help you select what to watch.")
    print("Chatbot: If you want to end this session, just type 'bye'.")
    print("Chatbot: Let's start by definig what do you want to watch: movie or a TV Show?")

    # Data structure used to store the chronological sequence of user inputs.
    user_input_queue = []

    # State variables. It will be used to define what is the output in each phase.
    content_type = None
    duration_selected = None
    expecting_duration = False
    expecting_year = False
    expecting_country = False

    # User state (used later to filter the dataset).
    dict_user_state = {
        "content_type": None,
        "duration": None,
        "year": None,
        "after_before_year": None,
        "country": None
    }

    # Main loop:
    while True:
        
        # Get user input.
        user_input = input("You: ")

        # Append user input into a list (using base data strucure a queue) to have all data collected.
        user_input_queue.append(user_input)

        # Check if the user wants to exit.
        if user_input in ["exit", "quit", "bye", "goodbye"]:
            print("Chatbot: Thank you for your interaction.")
            break

        # Restart conversation.
        if preprocess_text(user_input) in ["restart", "reset"]:
            print("Chatbot: Ok, lets start all over again.")

            # Reset state variables.
            content_type = None
            duration_selected = None
            expecting_duration = False
            expecting_year = False
            expecting_country = False

            # Reset user state.
            dict_user_state = {
                "content_type": None,
                "duration": None,
                "year": None,
                "after_before_year": None,
                "country": None
            }

            # Clear input history.
            user_input_queue = []

            print("Chatbot: Do you want to watch a movie or a TV show?")
            continue

        # Handle year input (after/before).
        if expecting_year:
            year_match = re.search(r"\b(after|before)\s+(\d{4})\b", user_input.lower())

            # If year_match is True
            if year_match is not None:
                dict_user_state["after_before_year"] = year_match.group(1)
                dict_user_state["year"] = int(year_match.group(2))

                # Print chatbot message.
                print(
                    f"Chatbot: Got it. I'll search for titles {dict_user_state['after_before_year']} {dict_user_state['year']}."
                )

                # Set variables.
                expecting_year = False
                expecting_country = True
                print(
                    "Chatbot: Finally, inform the country you would like to watch.\n"
                    "IMPORTANT: Type ONLY the country name.\n"
                )
                continue
            else:
                print("Chatbot: Please type `after <year>` or `before <year>` (e.g., after 2015).")
                continue

        # Handle country input
        if expecting_country:
            dict_user_state["country"] = user_input.strip()

            # Generate a response associated with the country intent.
            var_output = random.choice(int2res["country"])
            print("Chatbot:", var_output)
            time.sleep(3)
            print("Chatbot: Searching...")
            time.sleep(3)
            print("Chatbot: Almost there!")
            time.sleep(3)

            # Call the recommendation function
            df_result = select_movie(df_catalog, dict_user_state)

            # Handle the case where no matching titles are found.
            if df_result.empty:
                print("Chatbot: Sorry, I couldn't find any titles with these characteristics.")

                print("Chatbot: Let's start over again.")

                # Reset state variables
                content_type = None
                duration_selected = None
                expecting_duration = False
                expecting_year = False
                expecting_country = False

                # Reset user state
                dict_user_state = {
                    "content_type": None,
                    "duration": None,
                    "year": None,
                    "after_before_year": None,
                    "country": None
                }

                # Clear input history
                user_input_queue = []

                print("Chatbot: Do you want to watch a movie or a TV show?")
                continue

            else:
                # show up to n results (max 5).
                recommendations = df_result.sample(min(5, len(df_result)))
                
                print(f"Chatbot: Here are {recommendations.shape[0]} recommendations for you:")

                for _, row in recommendations.iterrows():
                    print(
                        f"- {row['title']}, released in {row['release_year']}, "
                        f"from {row['country']} with a duration of {row['duration']}."
                    )

                print("Chatbot: Type RESTART to start over again.")

            expecting_country = False

            continue

        # Detect content type from user input
        # The "r" means raw string. It avoids inverted bars to break.
        # \b represents a word boudary.
        # Therefore by writting \bmovie\b means that movie must be a word by itself.
        if re.search(r"\bmovie\b", preprocess_text(user_input)):
            content_type = "movie"
            dict_user_state["content_type"] = "movie"
            expecting_duration = True

        elif re.search(r"\btv show\b|\bseries\b", preprocess_text(user_input)):
            content_type = "tv show"
            dict_user_state["content_type"] = "tv show"
            expecting_duration = True

        # Extract numeric duration. It's ouside because user could write "Movie with 2 hours".
        # \b is the word boundary.
        # (\d+) = \d (any digit [0-9]), + (one or more).
        number_match = re.search(r"\b\d+\b", preprocess_text(user_input))

        # Only treat numbers as duration if we are explicitly expecting it
        if (number_match and content_type is not None and expecting_duration == True and duration_selected is None):

            # Collect number from regex.
            value = number_match.group()

            # Movie logic: check explicit unit
            if content_type == "movie":
                if "minute" in preprocess_text(user_input) and "hour" in preprocess_text(user_input):
                    print("Chatbot: Please inform only the quantity hours or minutes.")
                    print("Chatbot: Not both together.")
                    print("Chatbot: Lets start all over again.")

                    # Reset state variables.
                    content_type = None
                    duration_selected = None
                    expecting_duration = False
                    expecting_year = False
                    expecting_country = False

                    # Reset user state.
                    dict_user_state = {
                        "content_type": None,
                        "duration": None,
                        "year": None,
                        "after_before_year": None,
                        "country": None
                    }

                    # Clear input history.
                    user_input_queue = []

                    print("Chatbot: Do you want to watch a movie or a TV show?")
                    continue

                elif "minute" in preprocess_text(user_input):
                    duration_selected = f"{value} minutes"
                    dict_user_state["duration"] = duration_selected
                elif "hour" in preprocess_text(user_input):
                    duration_selected = f"{value} hours"
                    dict_user_state["duration"] = duration_selected
                else:
                    # fallback when no unit is specified.
                    print("Chatbot: Please tell me if the duration is in hours or minutes.")
                    continue

            # TV show logic: always seasons
            elif content_type == "tv show":
                duration_selected = f"{value} seasons"
                dict_user_state["duration"] = duration_selected

            # Respond immediately using duration intent
            var_output = random.choice(int2res["duration"])

            # Handle duration_selected placeholder
            if "duration_selected" in var_output:
                var_output = var_output.replace("duration_selected", duration_selected)

            print("Chatbot:", var_output)

            # Duration successfully captured
            expecting_duration = False
            expecting_year = True
            continue

        # Search for a pattern in the intents
        response_found = False

        # Iterate over regex -> tag mapping
        for regex, tag in regex2int.items():

            # Check if user input matches the regex
            if re.search(regex, preprocess_text(user_input), re.IGNORECASE):

                var_output = random.choice(int2res[tag])

                # Handle content_type placeholder
                if "content_type" in var_output and content_type is not None:
                    var_output = var_output.replace("content_type", content_type)

                # Handle duration_or_seasons placeholder
                if "duration_or_seasons" in var_output and content_type is not None:
                    if content_type == "movie":
                        var_output = var_output.replace("duration_or_seasons", "duration")
                    elif content_type == "tv show":
                        var_output = var_output.replace("duration_or_seasons", "seasons")

                    # chatbot is now waiting for duration input
                    expecting_duration = True

                print("Chatbot:", var_output)

                response_found = True
                break

        if response_found == False:
            print("Chatbot: Sorry, could you rephrase that again?")


# Run the chatbot
chatbot()

Chatbot: Welcome!
Chatbot: My name is Muvie and I'm here to help you select what to watch.
Chatbot: If you want to end this session, just type 'bye'.
Chatbot: Let's start by definig what do you want to watch: movie or a TV Show?
You: I want to watch a movie :)
Chatbot: Great. You chose movie. What is the max duration you expect to watch? 
REMEMBER, if movie, tell me if you are talking about hours or minutes!
You: Around 2 hours
Chatbot: Great. I'll search for a program up to 2 hours. 
Now, tell me if you want this tittle to be released after or before a specific year. 
Please type: after <year> or before <year>
You: After 1990
Chatbot: Got it. I'll search for titles after 1990.
Chatbot: Finally, inform the country you would like to watch.
IMPORTANT: Type ONLY the country name.

You: United Kingdom
Chatbot: Excellent. I'll search for a program with these characteristics, give me a second.
Chatbot: Searching...
Chatbot: Almost there!
Chatbot: Here are 5 recommendations for you:
- The Int

## 