<a href="https://colab.research.google.com/github/azzahangely/cultour-app/blob/main/chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
dataset = "https://raw.githubusercontent.com/azzahangely/cultour-app/refs/heads/main/cultural_ins.csv"
data = pd.read_csv(dataset)

In [None]:
print(data.columns)

Index(['country', 'province_state_region', 'cultural_norm_etiquette',
       'language', 'major_ed', 'major_e_name', 'food_culture',
       'important_sites', 'religious_practices', 'taboos'],
      dtype='object')


In [None]:
print(data.isnull().sum())

country                     0
province_state_region       0
cultural_norm_etiquette     0
language                    0
major_ed                    0
major_e_name                0
food_culture                0
important_sites             0
religious_practices         4
taboos                     13
dtype: int64


In [None]:
unique_countries = data['country'].unique()
unique_provinces = data['province_state_region'].unique()

print("Unique countries:", unique_countries)
print("Unique provinces:", unique_provinces)

Unique countries: ['Timor-Leste' 'Laos' 'Myanmar' 'Philippines']
Unique provinces: ['Aileu' 'Ainaro' 'Baucau' 'Bobonaro' 'Covalima' 'Dili' 'Ermera' 'Lautem'
 'Liquiçá' 'Manatuto' 'Manufahi' 'Oecusse' 'Viqueque' 'Atauro' 'Attapeu'
 'Bokeo' 'Bolikhamsai' 'Champasak' 'Houaphanh' 'Khammouane' 'Luang Namtha'
 'Luang Prabang' 'Oudomxay' 'Phongsaly' 'Savannakhet' 'Salavan' 'Sepon'
 'Vientiane' 'Xaisomboun' 'Xayaboury' 'Xieng Khouang' 'Ayeyarwady Region'
 'Bago Region' 'Chin State' 'Kachin State' 'Kayin State' 'Kayah State'
 'Magway Region' 'Mandalay Region' 'Mon State' 'Rakhine State'
 'Sagaing Region' 'Tanintharyi Region' 'Yangon Region' 'Shan State' 'Abra'
 'Agusan del Norte' 'Agusan del Sur' 'Aklan' 'Albay' 'Antique' 'Apayao'
 'Aurora' 'Basilan' 'Bataan' 'Batanes' 'Batangas' 'Benguet' 'Biliran'
 'Bohol' 'Bukidnon' 'Bulacan' 'Cagayan' 'Camarines Norte' 'Camarines Sur'
 'Camiguin' 'Capiz' 'Cavite' 'Cebu' 'Compostela Valley (Davao de Oro)'
 'Cotabato' 'Davao del Norte' 'Davao del Sur' 'Davao 

In [None]:
duplicates = data[data.duplicated()]
print("Duplicate rows:")
print(duplicates)
data = data.drop_duplicates()


Duplicate rows:
Empty DataFrame
Columns: [country, province_state_region, cultural_norm_etiquette, language, major_ed, major_e_name, food_culture, important_sites, religious_practices, taboos]
Index: []


In [None]:
data['language'] = data['language'].str.title()

In [None]:
print(data['major_ed'].head())

0    20/08/2024
1    01/05/2024
2    15/11/2024
3    10/07/2024
4    25/12/2024
Name: major_ed, dtype: object


In [None]:
print(data['major_ed'].unique())

['20/08/2024' '01/05/2024' '15/11/2024' '10/07/2024' '25/12/2024'
 '20/05/2024' '01/10/2024' '18/04/2024' '30/06/2024' '05/03/2024'
 '21/09/2024' '15/08/2024' '10/06/2024' '25/09/2024' '01/12/2024'
 '20/10/2024' '13/04/2024' '02/12/2024' '25/11/2024' '05/12/2024'
 '20/11/2024' '15/12/2024' '10/12/2024' '17/02/2024' '20/02/2024'
 '10/01/2024' '31/12/2024' '01/01/2024' 'Thingyan Water Festival'
 '09/03/2024' '02/08/2024' '17/06/2024' '15/01/2024' '24/06/2024'
 '14/02/2024' '19/02/2024' '07/03/2024' '09/04/2024' '26/06/2024'
 '23/02/2024' '11/05/2024' '01/07/2024' '10/03/2024' '15/09/2024'
 '25/08/2024' '22/04/2024' '18/09/2024' '24/10/2024' '16/01/2024'
 '23/03/2024' '21/01/2024' '08/06/2024' '31/03/2024' '10/09/2024'
 '27/10/2024' '17/07/2024' '15/05/2024' '22/05/2024' '28/04/2024'
 '07/12/2024' '24/01/2024' '02/05/2024' '02/02/2024' '19/09/2024'
 '14/07/2024' '29/06/2024' '09/12/2024' '29/03/2024' '15/06/2024'
 '09/01/2024' '27/03/2024' '01/09/2024' '25/04/2024' '19/04/2024'
 '28/03/20

In [None]:

data['major_ed_normalized'] = pd.to_datetime(data['major_ed'], errors='coerce', dayfirst=True)
data['major_ed_normalized'] = data['major_ed_normalized'].dt.strftime('%d/%m/%Y')
data['major_ed_normalized'] = data['major_ed_normalized'].fillna('Not Available')

print(data)


         country province_state_region  \
0    Timor-Leste                 Aileu   
1    Timor-Leste                Ainaro   
2    Timor-Leste                Baucau   
3    Timor-Leste              Bobonaro   
4    Timor-Leste              Covalima   
..           ...                   ...   
117  Philippines             Tawi-Tawi   
118  Philippines              Zambales   
119  Philippines   Zamboanga del Norte   
120  Philippines     Zamboanga del Sur   
121  Philippines     Zamboanga Sibugay   

                               cultural_norm_etiquette            language  \
0         Deep respect for elders and local traditions       Tetum, Mambai   
1                  Respect for nature and sacred sites       Tetum, Mambai   
2                             Emphasis on family bonds      Tetum, Makasae   
3                             Hospitality to strangers        Tetum, Kemak   
4           Respect for elders and traditional leaders        Tetum, Bunak   
..                         

In [None]:
data.to_csv('sea_cultural_insight.csv', index=False)

print("Cleaned dataset saved as sea_cultural_insight.csv'")

Cleaned dataset saved as sea_cultural_insight.csv'


In [None]:
data.fillna("Not Available", inplace=True)

In [None]:
print(data.describe(include='all'))

            country province_state_region  \
count           122                   122   
unique            4                   122   
top     Philippines                 Aileu   
freq             77                     1   

                              cultural_norm_etiquette language    major_ed  \
count                                             122      122         122   
unique                                            113       84          89   
top     Hospitality and respect for cultural heritage  Tagalog  13/04/2024   
freq                                                3        7          10   

                   major_e_name  \
count                       122   
unique                      112   
top     Thingyan Water Festival   
freq                          6   

                                           food_culture      important_sites  \
count                                               122                  122   
unique                                         

In [None]:
dataset_fix = "https://raw.githubusercontent.com/azzahangely/cultour-app/refs/heads/main/sea_cultural_insight.csv"
data = pd.read_csv(dataset_fix)

In [None]:
data['tags'] = data.apply(lambda row: ', '.join(filter(None, [
    'etiquette' if pd.notnull(row['cultural_norm_etiquette']) else None,
    'language' if pd.notnull(row['language']) else None,
    'cuisine' if pd.notnull(row['food_culture']) else None,
    'religion' if pd.notnull(row['religious_practices']) else None,
    'tourist sites' if pd.notnull(row['important_sites']) else None
])), axis=1)

In [None]:
data['relevance_score'] = data[['cultural_norm_etiquette', 'language', 'food_culture']].notnull().sum(axis=1) / 3.0
data['region_type'] = data['province_state_region'].apply(lambda x: 'province' if pd.notnull(x) else 'country')
data['use_case'] = 'tourism'

In [None]:
metadata_file = 'metadata.csv'
data.to_csv(metadata_file, index=False)

In [None]:
metadata = pd.read_csv(metadata_file)

missing_values = metadata.isnull().sum()
print("Missing values in each column:")
print(missing_values)

duplicates = metadata[metadata.duplicated()]
print("Duplicate rows:")
print(duplicates)

try:
    pd.to_datetime(metadata['major_ed_normalized'], format='%d/%m/%Y')
    print("All dates are in the correct format.")
except Exception as e:
    print(f"Date format issues: {e}")

print("Unique tags:", metadata['tags'].unique())
print("Unique use cases:", metadata['use_case'].unique())


Missing values in each column:
country                     0
province_state_region       0
cultural_norm_etiquette     0
language                    0
major_ed                    0
major_e_name                0
food_culture                0
important_sites             0
religious_practices         4
taboos                     13
major_ed_normalized         0
tags                        0
relevance_score             0
region_type                 0
use_case                    0
dtype: int64
Duplicate rows:
Empty DataFrame
Columns: [country, province_state_region, cultural_norm_etiquette, language, major_ed, major_e_name, food_culture, important_sites, religious_practices, taboos, major_ed_normalized, tags, relevance_score, region_type, use_case]
Index: []
Date format issues: time data "Not Available" doesn't match format "%d/%m/%Y", at position 37. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are 

In [None]:
# query testing


def simulate_query(country=None, tag=None):
    results = metadata
    if country:
        results = results[results['country'].str.contains(country, case=False, na=False)]
    if tag:
        results = results[results['tags'].str.contains(tag, case=False, na=False)]
    return results[['country', 'province_state_region', 'tags', 'cultural_norm_etiquette']]

print("Query: 'Thailand + etiquette'")
print(simulate_query(country='Thailand', tag='etiquette'))

print("\nQuery: 'Timor-Leste + cuisine'")
print(simulate_query(country='Timor-Leste', tag='cuisine'))


Query: 'Thailand + etiquette'
Empty DataFrame
Columns: [country, province_state_region, tags, cultural_norm_etiquette]
Index: []

Query: 'Timor-Leste + cuisine'
        country province_state_region  \
0   Timor-Leste                 Aileu   
1   Timor-Leste                Ainaro   
2   Timor-Leste                Baucau   
3   Timor-Leste              Bobonaro   
4   Timor-Leste              Covalima   
5   Timor-Leste                  Dili   
6   Timor-Leste                Ermera   
7   Timor-Leste                Lautem   
8   Timor-Leste               Liquiçá   
9   Timor-Leste              Manatuto   
10  Timor-Leste              Manufahi   
11  Timor-Leste               Oecusse   
12  Timor-Leste              Viqueque   
13  Timor-Leste                Atauro   

                                                 tags  \
0   etiquette, language, cuisine, religion, touris...   
1   etiquette, language, cuisine, religion, touris...   
2   etiquette, language, cuisine, religion, touris..

In [None]:
low_relevance_rows = metadata[metadata['relevance_score'] < 0.9]
print("Rows with low relevance scores:")
print(low_relevance_rows)


Rows with low relevance scores:
Empty DataFrame
Columns: [country, province_state_region, cultural_norm_etiquette, language, major_ed, major_e_name, food_culture, important_sites, religious_practices, taboos, major_ed_normalized, tags, relevance_score, region_type, use_case]
Index: []


# batas suci

In [None]:
!pip install transformers tensorflow



In [None]:
from transformers import TFAutoModel, AutoTokenizer

# Use a TensorFlow-compatible model
model_name = "sentence-transformers/distiluse-base-multilingual-cased-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tf_model = TFAutoModel.from_pretrained(model_name)

print("Model and tokenizer successfully loaded!")


tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFDistilBertModel.

All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model and tokenizer successfully loaded!


In [None]:
metadata['embedding_text'] = metadata.apply(
    lambda row: f"Country: {row['country']}. Province: {row['province_state_region']}. "
                f"Etiquette: {row['cultural_norm_etiquette']}. Food: {row['food_culture']}. "
                f"Sites: {row['important_sites']}.", axis=1
)


In [None]:
metadata['embedding_text'] = metadata.apply(
    lambda row: f"Country: {row['country']}. Province: {row['province_state_region']}. "
                f"Etiquette: {row['cultural_norm_etiquette']}. Food: {row['food_culture']}. "
                f"Sites: {row['important_sites']}.", axis=1
)

print(metadata['embedding_text'].head())


0    Country: Timor-Leste. Province: Aileu. Etiquet...
1    Country: Timor-Leste. Province: Ainaro. Etique...
2    Country: Timor-Leste. Province: Baucau. Etique...
3    Country: Timor-Leste. Province: Bobonaro. Etiq...
4    Country: Timor-Leste. Province: Covalima. Etiq...
Name: embedding_text, dtype: object


In [None]:
def generate_embeddings(sentences):
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="tf")
    outputs = tf_model(**inputs)
    cls_embeddings = outputs.last_hidden_state[:, 0, :]
    return cls_embeddings.numpy()

sentences = metadata['embedding_text'].tolist()
batch_size = 32
embeddings = []

for i in range(0, len(sentences), batch_size):
    batch_sentences = sentences[i:i+batch_size]
    batch_embeddings = generate_embeddings(batch_sentences)
    embeddings.extend(batch_embeddings)

embeddings = np.array(embeddings)
print("Embeddings shape:", embeddings.shape)

metadata['embeddings'] = embeddings.tolist()


Embeddings shape: (122, 768)


In [None]:
metadata.to_csv("data_metadata_tf.csv", index=False)
print("Dataset with embeddings saved as 'data_metadata_tf.csv'.")


Dataset with embeddings saved as 'data_metadata_tf.csv'.


In [None]:
!pip install pinecone-client


Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.4/85.4 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5.0.

In [None]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(
    api_key="pcsk_5dUkk6_C9U9wDrh339aAZwzekpGVKHHGU2Zsq4FGSPCvo5FWhShFJ67oQg5yxPF4WgvmfH"
)

print("Pinecone initialized successfully!")


Pinecone initialized successfully!


In [None]:
# creating index from code (not console)

index_name = "cultural-insights"
embedding_dim = 768

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=embedding_dim,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print(f"Index '{index_name}' created successfully!")

# Connect to the index
index = pc.Index(index_name)
print(f"Connected to Pinecone index: {index_name}")

Connected to Pinecone index: cultural-insights


In [None]:
data_insert = [
    (
        str(i),
        embeddings[i].tolist(),
        {
            "country": metadata.iloc[i]["country"],
            "tags": metadata.iloc[i]["tags"]
        }
    )
    for i in range(len(metadata))
]

index.upsert(data_insert)
print("Embeddings and metadata uploaded to Pinecone!")

Embeddings and metadata uploaded to Pinecone!


In [None]:
# Generate embedding for the query
user_query = "What is the etiquette in Timor-Leste?"
query_embedding = generate_embeddings([user_query])[0]  # Generate embedding for query

# Perform the query
results = index.query(
    vector=query_embedding.tolist(),
    top_k=3,  # Number of top results to return
    include_metadata=True  # Include metadata in results
)

# Display the results
for match in results["matches"]:
    print(f"Score: {match['score']}, Metadata: {match['metadata']}")


Score: 0.348427534, Metadata: {'country': 'Timor-Leste', 'tags': 'etiquette, language, cuisine, religion, tourist sites'}
Score: 0.328592062, Metadata: {'country': 'Timor-Leste', 'tags': 'etiquette, language, cuisine, religion, tourist sites'}
Score: 0.319233775, Metadata: {'country': 'Timor-Leste', 'tags': 'etiquette, language, cuisine, religion, tourist sites'}


In [None]:
stats = index.describe_index_stats()
print(stats)


{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 122}},
 'total_vector_count': 122}


## integrate index

In [None]:
def query_pinecone(user_query, model, tokenizer, index, top_k=3):
    """
    Query Pinecone index to retrieve the most relevant results for a user query.

    Args:
    - user_query: The user's input text.
    - model: The embedding model.
    - tokenizer: The tokenizer for the embedding model.
    - index: Pinecone index object.
    - top_k: Number of top results to retrieve.

    Returns:
    - A list of dictionaries containing scores and metadata for the results.
    """
    inputs = tokenizer(user_query, padding=True, truncation=True, return_tensors="tf")
    outputs = model(**inputs)
    query_embedding = outputs.last_hidden_state[:, 0, :].numpy()[0]  # Use the CLS token's embedding

    results = index.query(
        vector=query_embedding.tolist(),
        top_k=top_k,
        include_metadata=True
    )

    contexts = [
        {
            "score": match["score"],
            "metadata": match["metadata"]
        }
        for match in results["matches"]
    ]
    return contexts

user_query = "What is the etiquette in Timor-Leste?"
contexts = query_pinecone(user_query, tf_model, tokenizer, index)
print("Retrieved Contexts:", contexts)


Retrieved Contexts: [{'score': 0.348427534, 'metadata': {'country': 'Timor-Leste', 'tags': 'etiquette, language, cuisine, religion, tourist sites'}}, {'score': 0.328592062, 'metadata': {'country': 'Timor-Leste', 'tags': 'etiquette, language, cuisine, religion, tourist sites'}}, {'score': 0.319233775, 'metadata': {'country': 'Timor-Leste', 'tags': 'etiquette, language, cuisine, religion, tourist sites'}}]


In [None]:
def process_results(contexts):
    """
    Process Pinecone results for chatbot integration.

    Args:
    - contexts: List of retrieved contexts from Pinecone.

    Returns:
    - A formatted string containing context information.
    """
    context_texts = []
    for ctx in contexts:
        metadata = ctx['metadata']
        context_texts.append(f"Country: {metadata['country']}, Tags: {metadata['tags']}")

    return "\n".join(context_texts)

context_text = process_results(contexts)
print("Formatted Context:\n", context_text)


Formatted Context:
 Country: Timor-Leste, Tags: etiquette, language, cuisine, religion, tourist sites
Country: Timor-Leste, Tags: etiquette, language, cuisine, religion, tourist sites
Country: Timor-Leste, Tags: etiquette, language, cuisine, religion, tourist sites


### BELOM JADI



In [None]:


def generate_response(user_query, contexts):
    context_text = "\n".join([
        f"Metadata: {ctx['metadata']}" for ctx in contexts
    ])
    prompt = f"""
    You are a cultural assistant chatbot. Answer the user's question based on the context below:

    Context:
    {context_text}

    Question:
    {user_query}
    """

    response = llm(prompt)

response = generate_response(user_query, contexts)
print("Response:", response)
