In [1]:
import pymongo
import pandas as pd

def connect_mongodb():
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    db = client["chatbot-db"]
    return db

def load_data(db, collection_name):
    collection = db[collection_name]
    data = list(collection.find({}, {"_id": 0, "question": 1, "answer": 1}))
    return pd.DataFrame(data)

db = connect_mongodb()
collections = [
    "Male_Underweight", "Male_Normalweight", "Male_Overweight", "Male_Obesity",
    "Female_Underweight", "Female_Normalweight", "Female_Overweight", "Female_Obesity"
]

dataframes = {collection: load_data(db, collection) for collection in collections}


In [4]:
dataframes

{'Male_Underweight':                                              question  \
 0   What should I eat to gain weight in a healthy ...   
 1   How many calories should I consume daily to gain?   
 2   What are some high-protein foods to include in...   
 3                 Can I eat junk food to gain weight?   
 4              Should I take weight gain supplements?   
 5   What are the best drinks to help with weight g...   
 6                      How can I improve my appetite?   
 7   How much protein do I need daily for weight gain?   
 8     What role do carbohydrates play in weight gain?   
 9                    Should I avoid cardio exercises?   
 10  What are some quick and healthy snacks for wei...   
 11  How often should I eat to gain weight effectiv...   
 12      What is the best time to eat for weight gain?   
 13        Can I drink milk every day for weight gain?   
 14               How much water should I drink daily?   
 15              Are there any weight gain meal plan

In [6]:
def preprocess_text(df):
    df['question'] = df['question'].str.strip().str.lower()
    df['answer'] = df['answer'].str.strip().str.lower()
    return df

for collection in collections:
    dataframes[collection] = preprocess_text(dataframes[collection])

In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset

model_name = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples['question'], examples['answer'], truncation=True, padding='max_length')

# Combine all dataframes into one
combined_df = pd.concat(dataframes.values())
dataset = Dataset.from_pandas(combined_df)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

trainer.train()


Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ValueError: expected sequence of length 30 at dim 1 (got 39)

In [16]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset
import torch

model_name = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Move model to CPU if GPU memory is an issue
device = torch.device("cpu")
model.to(device)

def tokenize_function(examples):
    model_inputs = tokenizer(examples['question'], truncation=True, padding='max_length', max_length=128)
    labels = tokenizer(examples['answer'], truncation=True, padding='max_length', max_length=128)

    # Ensure labels are correctly formatted
    labels["input_ids"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in labels["input_ids"]
    ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Combine all dataframes into one
combined_df = pd.concat(dataframes.values())
dataset = Dataset.from_pandas(combined_df)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,  # Small batch size
    per_device_eval_batch_size=2,   # Small batch size
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
    fp16=False,  # Disable mixed precision training if using CPU
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

trainer.train()


Map:   0%|          | 0/240 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,2.168801
2,No log,1.925376
3,No log,1.84479




TrainOutput(global_step=90, training_loss=2.780869377983941, metrics={'train_runtime': 52.6063, 'train_samples_per_second': 13.687, 'train_steps_per_second': 1.711, 'total_flos': 54876281241600.0, 'train_loss': 2.780869377983941, 'epoch': 3.0})

In [18]:
dataset

Dataset({
    features: ['question', 'answer', '__index_level_0__'],
    num_rows: 240
})

In [20]:
8 * 30

240

In [22]:
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")


Evaluation results: {'eval_loss': 1.8447895050048828, 'eval_runtime': 3.2319, 'eval_samples_per_second': 74.259, 'eval_steps_per_second': 37.129, 'epoch': 3.0}


In [26]:
from transformers import pipeline

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

def generate_summary(answer):
    summary = summarizer(answer, max_length=50, min_length=250, do_sample=False)
    return summary[0]['summary_text']

# Example usage
sample_answer = combined_df.iloc[0]['answer']
summary = generate_summary(sample_answer)
print(f"Original Answer: {sample_answer}")
print(f"Summary: {summary}")


Device set to use mps:0
Your min_length=250 must be inferior than your max_length=50.
Your max_length is set to 50, but your input_length is only 35. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)


Original Answer: focus on calorie-dense foods like nuts, avocados, whole grains, lean meats, and dairy. eat frequent meals and include protein-rich snacks.
Summary: focus on protein-rich foods like nuts, avocados, whole grains, lean meats, and dairy. eat a balanced diet, including fruits, vegetables, and whole grains. avoid processed foods, and avoid processed meats. focus on


In [28]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

# Load the trained model and tokenizer
model_name = "facebook/bart-base"
model_path = "./results"  # Path where the model is saved
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Move model to CPU if necessary
device = torch.device("cpu")
model.to(device)

# Create a text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.mps.is_available() else -1)

def generate_response(question, answer):
    input_text = f"question: {question} answer: {answer}"
    response = generator(input_text, max_length=128, num_return_sequences=1)
    return response[0]['generated_text']

def main():
    print("Welcome to the AI AHAAY Meal Planner Chatbot!")

    while True:
        # Ask for gender
        gender = input("Are you male or female? (Type 'male' or 'female'): ").strip().lower()
        if gender not in ["male", "female"]:
            print("Invalid input. Please type 'male' or 'female'.")
            continue

        # Ask for weight category
        weight_category = input("Select your weight category (underweight, normalweight, overweight, obesity): ").strip().lower()
        if weight_category not in ["underweight", "normalweight", "overweight", "obesity"]:
            print("Invalid input. Please select a valid weight category.")
            continue

        # Form the collection name
        collection_name = f"{gender.capitalize()}_{weight_category.capitalize()}"

        # Load the relevant data
        data = dataframes[collection_name]

        while True:
            # Ask for the question
            question = input("Enter your question (or type 'exit' to stop): ").strip().lower()
            if question == "exit":
                break

            # Find the matching answer
            matching_row = data[data['question'].str.contains(question, case=False, na=False)]
            if not matching_row.empty:
                answer = matching_row.iloc[0]['answer']
                response = generate_response(question, answer)
                print("Response:", response)
            else:
                print("No matching answer found.")

        # Ask if the user wants to continue
        continue_chat = input("Do you want to ask more questions? (Type 'yes' or 'no'): ").strip().lower()
        if continue_chat != "yes":
            break

    print("Thank you for using the AI AHAAY Meal Planner Chatbot!")

if __name__ == "__main__":
    main()


ValueError: Unrecognized model in ./results. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, aria, aria_text, audio-spectrogram-transformer, autoformer, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, colpali, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, depth_pro, deta, detr, diffllama, dinat, dinov2, dinov2_with_registers, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, emu3, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, git, glm, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hiera, hubert, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_vl, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, siglip, siglip_vision_model, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, textnet, time_series_transformer, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zamba2, zoedepth