In [1]:
import json
import nltk 
import numpy as np
import random
import string

In [2]:
# Location of the JSON file holding every mood/text/features sample.
path_feature_database = 'data/all_extracted_features.json'

# Read the whole file as UTF-8 text and decode it into Python objects.
with open(path_feature_database, mode="r", encoding="utf-8") as file:
    corpus_data = json.loads(file.read())

In [9]:
# f = open(r'C:\Users\34640\Desktop\VSCode\z_Cursos_Python\chatbots\data\all_extracted_features.txt', 'r', errors= 'ignore')
# raw = f.read()

In [21]:
# print(raw[:750])

[
    {
        "mood_1": "Weightlessness",
        "text_1": "Input (Mood: Anxious, restless):\n\nI can't seem to sit still, my mind racing with \"what ifs\" and worst-case scenarios.  A low, throbbing bassline would capture that feeling of unease perfectly, maybe with high-pitched strings weaving in and out, mirroring the chaotic thoughts.  It needs to be fast, frantic even, but with a driving rhythm to keep the anxiety from overwhelming everything.  I need something to ground me, but something that acknowledges this feeling.\n",
        "features_1": "```json\n{\n  \"Tempo\": \"140 bpm\",\n  \"Intensity/Dynamics\": \"mf - crescendo to ff during the \"what ifs\" section, then diminuendo to mp\",\n  \"Timbre\": \"Dark, with a focus on low 


Ensuring that our raw corpus retains '\n' and similar markers is crucial, allowing our future chatbot to correctly segment sentences and paragraphs.

## Preprocessing the corpus

This stage will involve several NLP techniques, including tokenization, stop word removal, lemmatization, and stemming

In [14]:
# nltk.download('punkt')
# nltk.download('wordnet') 
# make sure these NLTK resources are downloaded before running the cells below

In [29]:
# Collect every raw user text together with its matching features string, so
# that corpus_texts[i] and corpus_features[i] always describe the same sample.
corpus_texts = []
corpus_features = []

for entry in corpus_data:
    for key, text in entry.items():
        if not key.startswith("text_"):
            continue
        # "text_<n>" is paired with "features_<n>" inside the same entry.
        features_key = "features_" + key[len("text_"):]
        if features_key not in entry:
            # An unpaired text would shift every later index, so skip it.
            continue
        corpus_texts.append(text)
        corpus_features.append(entry[features_key])
          

We subdivide the raw corpus to facilitate later vectorization of both the users' texts in the corpus and the new input from the person interacting with the bot. This allows us to identify the most similar text using cosine distance and retrieve its corresponding features

In [30]:
# Sanity check: display the text stored at index 2.
corpus_texts[2]

"Overwhelmed.  A cacophony of noise in my head, a frantic rhythm that won't slow down.  I need music to match – something chaotic but ultimately resolving, maybe a crescendo into a quiet, peaceful ending.  Please, something to calm this storm.\n"

In [32]:
# Sanity check: display the features string stored at the same index (2).
corpus_features[2]

'```json\n{\n  "Tempo": "Allegro molto (very fast) initially, gradually slowing to Adagio (slow)",\n  "Intensity/Dynamics": "Fortissimo (very loud) crescendo to pianissimo (very soft)",\n  "Timbre": "Initially brassy and harsh, gradually softening to strings and woodwinds with a mellow tone",\n  "Rhythm": "Initially irregular and complex polyrhythms, gradually resolving into a simple, regular beat",\n  "Harmonic progression": "Cm - B♭ - A♭ - G♭ - F - Eb - D♭ - C (Descending chromatic scale resolving to tonic)",\n  "Melody": "Initially erratic and disjointed leaps, gradually becoming smoother and descending towards the end",\n  "Tonality/Mode": "C minor, moving towards C major at the end",\n  "Articulation": "Initially staccato and aggressive, gradually becoming legato and flowing",\n  "Silence": "Long pause at the very end"\n}\n```\n'

In [33]:
# Shared WordNet lemmatizer instance used by preprocess_text().
lemmatizer = nltk.stem.WordNetLemmatizer()
# Translation table for str.translate(): every character in string.punctuation
# maps to None, which deletes it from the text.
remove_punct_dict = {ord(symbol): None for symbol in string.punctuation}

In [34]:
def preprocess_text(text):
    """Normalize ``text`` into a single space-separated string of lemmas.

    The text is lowercased, stripped of the characters listed in
    ``remove_punct_dict``, tokenized with ``nltk.word_tokenize`` and every
    token is passed through the shared ``lemmatizer``.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    lemmas = [lemmatizer.lemmatize(word) for word in nltk.word_tokenize(cleaned)]
    return " ".join(lemmas)

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit TF-IDF on corpus texts normalized with the same preprocess_text() that
# chatbot_response() applies to the user input; fitting on the raw texts would
# give the corpus and the query different vocabularies (e.g. un-lemmatized
# words, hyphenated words split differently) and weaken the similarity scores.
vectorizer = TfidfVectorizer()
corpus_texts_clean = [preprocess_text(text) for text in corpus_texts]
tfidf_matrix = vectorizer.fit_transform(corpus_texts_clean)

In [36]:
def chatbot_response(user_input, features=None):
    """Return the features of the corpus text most similar to ``user_input``.

    The input is normalized with ``preprocess_text``, vectorized with the
    fitted TF-IDF ``vectorizer`` and compared against every row of
    ``tfidf_matrix`` using cosine similarity.

    Args:
        user_input: Free text written by the user.
        features: Sequence aligned index-by-index with the rows of
            ``tfidf_matrix``. Defaults to the current ``corpus_features``.

    Returns:
        The element of ``features`` at the index of the highest similarity.
    """
    if features is None:
        # Resolved at call time: a default bound in the signature would keep
        # pointing at the old list after the corpus cell is re-run.
        features = corpus_features

    processed_input = preprocess_text(user_input)
    user_vector = vectorizer.transform([processed_input])

    # One query row against the whole corpus -> take the single result row.
    similarities = cosine_similarity(user_vector, tfidf_matrix)[0]

    # NOTE: if the input shares no vocabulary with the corpus, every score is
    # 0 and argmax falls back to index 0.
    best_match_idx = int(np.argmax(similarities))
    return features[best_match_idx]

---

## Usage example

In [48]:
%pip install langdetect python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



Usage:   
  c:\Users\34640\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] <requirement specifier> [package-index-options] ...
  c:\Users\34640\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] -r <requirements file> [package-index-options] ...
  c:\Users\34640\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] [-e] <vcs project url> ...
  c:\Users\34640\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] [-e] <local project path> ...
  c:\Users\34640\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] <archive url/path> ...

no such option: -m


In [54]:
from langdetect import detect
import threading


In [55]:
timeout = 30  # Time limit in seconds
stop_event = threading.Event()  # Set by the input loop every time the user answers

def countdown_timer():
    """Interrupt the main thread when no input arrives within ``timeout`` seconds.

    The function waits on ``stop_event``. Each time the event is signalled the
    countdown starts over, so the limit applies to every prompt and not only
    to the first one. When a wait expires, the timeout message is printed and
    a KeyboardInterrupt is requested in the main thread.
    """
    # Local import keeps this cell self-contained.
    import _thread

    while True:
        if stop_event.wait(timeout):
            # Input arrived in time: reset the flag and start a new countdown.
            stop_event.clear()
            continue

        print("\n🕑 Timeout reached! No input received. Exiting...")
        # exit() called from a worker thread only ends this thread; asking for
        # an interrupt in the main thread is what stops the input loop.
        _thread.interrupt_main()
        return

# Start the timer in the background
timer_thread = threading.Thread(target=countdown_timer, daemon=True)
timer_thread.start()

In [None]:
# Imported here so this cell also works when run on its own; the handler below
# catches this exception when detect() cannot identify a language.
from langdetect import LangDetectException

# Interactive session: read a message, validate it, and print the features of
# the most similar corpus text. Typing 'exit' ends the loop.
while True:
    user_input = input("\n👤 Share how you're feeling or what's on your mind (or type 'exit' to close this session): ")

    # Tell the timer thread that input arrived, then reset the flag so that
    # its next wait blocks again.
    stop_event.set()
    stop_event.clear()

    print(f"\n👤💬 {user_input} ")

    normalized_input = user_input.strip().lower()

    if normalized_input == "exit":
        print("\n🤖 good bye!")
        break

    # Courtesy reply: nothing to analyse, so go straight to the next prompt.
    if normalized_input.replace(" ", "") == "thankyou":
        print("\n🤖 You'r wellcome")
        continue

    # Only a language-detection failure is reported as such; other errors are
    # no longer hidden behind this message.
    try:
        language = detect(user_input)
    except LangDetectException:
        print("\n🤖❌ Sorry, I couldn't detect the language of your text. Please try again.")
        continue

    if language != "en":
        print("\n🤖❌ Sorry, I can't understand your text. It must be written in English.")
        continue

    print("\n🤖 We are processing your feelings to return the best musical match for you... ⏳")
    try:
        response = chatbot_response(user_input)
    except Exception as error:
        # Keep the session alive, but report the real problem.
        print(f"\n🤖❌ Sorry, something went wrong while processing your text: {error}")
        continue

    print("\n🤖 here it is!. \nThis are the features that have been considered the most accurate for your text:\n")
    if isinstance(response, str):
        # json.dumps() on a string would print it as one quoted line with
        # escaped newlines, so print stored strings as they are.
        print(response)
    else:
        print(json.dumps(response, indent=4, ensure_ascii=False))
    print('hope it helped :)')


👤💬 12345 

🤖❌ Sorry, I couldn't detect the language of your text. Please try again.

👤💬 hola caracola 

🤖❌ Sorry, I can't understand your text. It must be written in English.

👤💬 exit 

🤖 good bye!
