In [1]:
import json
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "company": {"type": "text"},
            "name": {"type": "text"},
            "specifications": {"type": "keyword"} ,
            "name_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
            "specification_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"}
        }
    }
}

In [3]:
es_client = Elasticsearch('http://localhost:9200')

In [4]:
index_name = "mobile-specifications"
es_client.indices.delete(index=index_name)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'mobile-specifications'})

In [5]:
import pickle

In [6]:
with open('../search_build_and_eval/mobile_specs_vector_data.pkl', 'rb') as fp:
    mobile_specs_vector_data = pickle.load(fp)

In [8]:
len(mobile_specs_vector_data)

2096

In [7]:
with open('../data-extraction/mobile_specifications_data.json', 'r') as fp:
    mobile_specs_data = json.load(fp)

In [10]:
len(mobile_specs_data)

179

In [27]:
for each in mobile_specs_data['10-or-phones']:
    print(each)

10.or-g2
10.or-d2
10.or-d
10.or-g
10.or-e


In [8]:
ram_combination_phones = {
    'phone_with_one_gb': [],
    'phone_with_two_gb': [],
    'phone_with_four_gb': [],
    'phone_with_six_gb': [],
    'phone_with_eight_gb': [],
    'phone_with_twelve_gb': [],
    'phone_with_sixteen_gb': [],
    'phone_with_three_gb': [],
    'phone_with_eighteen_gb': [],
    'phone_with_thirty_two_gb': [],
}

In [9]:
phones_with_large_ram = []
phones_with_no_ram = []

In [10]:
for each_mobile_category in mobile_specs_data:
    for each_mobile in mobile_specs_data[each_mobile_category]:
        mobile = mobile_specs_data[each_mobile_category][each_mobile]
        if 'Hardware' not in mobile:
            temp_dict = {
                'mobile_company': each_mobile_category,
                'mobile': each_mobile
            }
            phones_with_no_ram.append(temp_dict)
        elif 'RAM' not in mobile['Hardware']:
            temp_dict = {
                'mobile_company': each_mobile_category,
                'mobile': each_mobile
            }
            phones_with_no_ram.append(temp_dict)
        elif '1' in mobile['Hardware']['RAM']:
            ram_combination_phones['phone_with_one_gb'].append(each_mobile)
        elif '2' in mobile['Hardware']['RAM']:
            ram_combination_phones['phone_with_two_gb'].append(each_mobile)
        elif '3' in mobile['Hardware']['RAM']:
            ram_combination_phones['phone_with_three_gb'].append(each_mobile)
        elif '4' in mobile['Hardware']['RAM']:
            ram_combination_phones['phone_with_four_gb'].append(each_mobile)
        elif '6' in mobile['Hardware']['RAM']:
            ram_combination_phones['phone_with_six_gb'].append(each_mobile)
        elif '8' in mobile['Hardware']['RAM']:
            ram_combination_phones['phone_with_eight_gb'].append(each_mobile)
        elif '12' in mobile['Hardware']['RAM']:
            ram_combination_phones['phone_with_twelve_gb'].append(each_mobile)
        elif '16' in mobile['Hardware']['RAM']:
            ram_combination_phones['phone_with_sixteen_gb'].append(each_mobile)
        elif '18' in mobile['Hardware']['RAM']:
            ram_combination_phones['phone_with_eighteen_gb'].append(each_mobile)
        elif '32' in mobile['Hardware']['RAM']:
            ram_combination_phones['phone_with_thirty_two_gb'].append(each_mobile)
        else:
            temp_dict = {
                'mobile_company': each_mobile_category,
                'mobile': each_mobile
            }
            phones_with_large_ram.append(temp_dict)

In [13]:
len(phones_with_no_ram)

104

In [11]:
len(phones_with_large_ram)

0

In [53]:
mobile_specs_vector_data[0].keys()

dict_keys(['company', 'name', 'specifications', 'name_vector', 'specification_vector'])

In [15]:
import pandas as pd

In [16]:
mobile_df = pd.DataFrame(mobile_specs_vector_data)

In [17]:
json.loads(mobile_df.iloc[0].specifications)

{'General': {'Brand': '10.or',
  'Model': 'G2',
  'Price in India': '₹7,999',
  'Release date': '28th June 2019',
  'Launched in India': 'Yes',
  'Form factor': 'Touchscreen',
  'Battery capacity (mAh)': '5000',
  'Fast charging': 'Proprietary',
  'Colours': 'Charcoal Black, Twilight Blue'},
 'Display': {'Screen size (inches)': '6.18',
  'Touchscreen': 'Yes',
  'Resolution': '1080x2246 pixels',
  'Protection type': 'Gorilla Glass',
  'Aspect ratio': '19:9'},
 'Hardware': {'Processor': 'octa-core',
  'Processor make': 'Qualcomm Snapdragon 636',
  'RAM': '4GB',
  'Internal storage': '64GB',
  'Expandable storage': 'Yes',
  'Expandable storage type': 'microSD',
  'Expandable storage up to (GB)': '256',
  'Dedicated microSD slot': 'Yes'},
 'Camera': {'Rear camera': '16-megapixel + 5-megapixel',
  'Rear autofocus': 'Yes',
  'Rear flash': 'Dual LED',
  'Front camera': '12-megapixel'},
 'Software': {'Operating system': 'Android'},
 'Connectivity': {'Wi-Fi': 'Yes',
  'Wi-Fi standards supported

In [106]:
mobile_df

Unnamed: 0,company,name,specifications,name_vector,specification_vector
0,10-or-phones,10.or-g2,"{""General"": {""Brand"": ""10.or"", ""Model"": ""G2"", ...","[-0.012813553214073181, -0.0776914730668068, 0...","[0.04288917034864426, -0.021033886820077896, -..."
1,10-or-phones,10.or-d2,"{""General"": {""Brand"": ""10.or"", ""Model"": ""D2"", ...","[-0.031122317537665367, -0.07828520238399506, ...","[0.02611490711569786, -0.027373576536774635, -..."
2,10-or-phones,10.or-d,"{""General"": {""Brand"": ""10.or"", ""Model"": ""D"", ""...","[-0.009422365576028824, -0.02212154120206833, ...","[0.017275657504796982, -0.049797121435403824, ..."
3,10-or-phones,10.or-g,"{""General"": {""Brand"": ""10.or"", ""Model"": ""G"", ""...","[-0.006688821595162153, -0.07486896216869354, ...","[0.029656710103154182, -0.052758242934942245, ..."
4,10-or-phones,10.or-e,"{""General"": {""Brand"": ""10.or"", ""Model"": ""E"", ""...","[-0.01733524352312088, -0.06605043262243271, -...","[0.022073449566960335, -0.05600305646657944, -..."
...,...,...,...,...,...
2091,zte-phones,zte-blade-v9-vita,"{""General"": {""Brand"": ""ZTE"", ""Model"": ""Blade V...","[0.008986365981400013, -0.07202902436256409, 0...","[0.027861520648002625, -0.0832376480102539, -0..."
2092,zte-phones,zte-tempo-go,"{""General"": {""Brand"": ""ZTE"", ""Model"": ""Tempo G...","[-0.005515120457857847, -0.06609780341386795, ...","[0.017104269936680794, -0.039781711995601654, ..."
2093,zte-phones,zte-blade-v9,"{""General"": {""Brand"": ""ZTE"", ""Model"": ""Blade V...","[-0.013905730098485947, -0.09210636466741562, ...","[0.021680796518921852, -0.08901219069957733, -..."
2094,zte-phones,zte-blade-a3,"{""General"": {""Brand"": ""ZTE"", ""Model"": ""Blade A...","[0.01376366801559925, -0.13015377521514893, 0....","[0.03441234678030014, -0.10361833870410919, -0..."


In [18]:
phones_with_16mp_camera_values = []
phones_with_48mp_camera_values = []

In [19]:
def get_all_camera_specs(row):
    row_dict = json.loads(row['specifications'])
    if 'Camera' in row_dict and 'Rear camera' in row_dict['Camera']:
        if '16' in row_dict['Camera']['Rear camera']:
            phones_with_16mp_camera_values.append(row['name'])
        elif '48' in row_dict['Camera']['Rear camera']:
            phones_with_48mp_camera_values.append(row['name'])

In [20]:
mobile_df.apply(lambda row: get_all_camera_specs(row), axis=1)

0       None
1       None
2       None
3       None
4       None
        ... 
2091    None
2092    None
2093    None
2094    None
2095    None
Length: 2096, dtype: object

In [21]:
phones_with_16mp_camera_values = list(set(phones_with_16mp_camera_values))
len(phones_with_16mp_camera_values)

111

In [117]:
phones_with_48mp_camera_values = list(set(phones_with_48mp_camera_values))
len(phones_with_48mp_camera_values)

114

In [118]:
phones_with_16mp_camera_values[:5]

['nubia-z18',
 'gionee-s8',
 'panasonic-eluga-ray-810',
 'panasonic-eluga-ray-max',
 'coolpad-cool-play-8']

In [119]:
phones_with_48mp_camera_values[:5]

['google-pixel-9-pro',
 'google-pixel-6-pro',
 'poco-x3-pro',
 'meizu-16s',
 'zte-axon-11-se']

In [86]:
len(camera_values)

3255

In [87]:
unique_camera_values = set(camera_values)

In [88]:
len(unique_camera_values)

516

In [90]:
list(unique_camera_values)[:10]

['8-megapixel + VGA + VGA',
 '8-megapixel (f/2.2, 1.5-micron)',
 '12-megapixel (f/2.0) + 12-megapixel (f/2.4) + 12-megapixel (f/2.2)',
 '12.2-megapixel (f/1.8, 1.4-micron)',
 '50-megapixel + 48-megapixel',
 '12-megapixel (f/1.6, 1.4-micron) + 12-megapixel (f/2.4, 1-micron) + 12-megapixel (f/2.4, 1-micron)',
 '50-megapixel + 12-megapixel',
 '50-megapixel + 12-megapixel + 12-megapixel',
 '13-megapixel (f/1.8) + 2-megapixel',
 '13-megapixel (f/2.2, 1.12-micron) + 5-megapixel (f/2.2, 1.12-micron) + 2-megapixel (f/2.4, 1.75-micron)']

In [91]:
os_values = []

In [94]:
def get_all_os_specs(row):
    row_dict = json.loads(row)
    if 'Software' in row_dict and 'Operating system' in row_dict['Software']:
        os_values.append(row_dict['Software']['Operating system'])

#### Above code gives me all os values and not phone names

In [22]:
phones_with_android_values = []
phones_with_ios_values = []

In [23]:
def get_all_os_specs(row):
    row_dict = json.loads(row['specifications'])
    if 'Software' in row_dict and 'Operating system' in row_dict['Software']:
        if 'Android' in row_dict['Software']['Operating system']:
            phones_with_android_values.append(row['name'])
        elif 'iOS' in row_dict['Software']['Operating system']:
            phones_with_ios_values.append(row['name'])

In [25]:
mobile_df.apply(lambda row: get_all_os_specs(row), axis=1)

0       None
1       None
2       None
3       None
4       None
        ... 
2091    None
2092    None
2093    None
2094    None
2095    None
Length: 2096, dtype: object

In [96]:
os_values = list(set(os_values))

In [97]:
len(os_values)

118

In [98]:
os_values[:10]

['HarmonyOS HarmonyOS 4.0',
 'Android 4.0',
 'Android Go Edition',
 'HarmonyOS 4.2',
 'Firefox OS 1.4',
 'Android 4.2.2',
 'Android 9 pIE',
 'Android 4.1',
 'Android 5.1.1',
 'Android 13 (Go Edition)']

In [26]:
phones_with_android_values = list(set(phones_with_android_values))
len(phones_with_android_values)

1861

In [27]:
phones_with_android_values[:5]

['asus-zenfone-max-m1-(zb556kl)',
 'oneplus-8-pro',
 'xolo-opus-hd',
 'panasonic-p101',
 'lg-k41s']

In [28]:
phones_with_ios_values = list(set(phones_with_ios_values))
len(phones_with_ios_values)

47

In [31]:
phones_with_ios_values[:5]

['iphone-14',
 'iphone-14-pro',
 'iphone-14-pro-max',
 'iphone-14-plus',
 'apple-iphone-x']

In [32]:
processors = []

In [100]:
def get_all_processors_specs(row):
    row_dict = json.loads(row)
    if 'Hardware' in row_dict and 'Processor make' in row_dict['Hardware']:
        processors.append(row_dict['Hardware']['Processor make'])

In [101]:
mobile_df['specifications'].apply(lambda row: get_all_processors_specs(row))

0       None
1       None
2       None
3       None
4       None
        ... 
2091    None
2092    None
2093    None
2094    None
2095    None
Name: specifications, Length: 2096, dtype: object

In [33]:
phones_with_mediatek_values = []
phones_with_snapdragon_values = []

In [34]:
def get_all_processor_specs(row):
    row_dict = json.loads(row['specifications'])
    if 'Hardware' in row_dict and 'Processor make' in row_dict['Hardware']:
        if 'MediaTek' in row_dict['Hardware']['Processor make']:
            phones_with_mediatek_values.append(row['name'])
        elif 'Snapdragon' in row_dict['Hardware']['Processor make']:
            phones_with_snapdragon_values.append(row['name'])

In [35]:
mobile_df.apply(lambda row: get_all_processor_specs(row), axis=1)

0       None
1       None
2       None
3       None
4       None
        ... 
2091    None
2092    None
2093    None
2094    None
2095    None
Length: 2096, dtype: object

In [36]:
phones_with_mediatek_values = list(set(phones_with_mediatek_values))
len(phones_with_mediatek_values)

675

In [37]:
phones_with_mediatek_values[:5]

['poco-c51',
 'panasonic-p90',
 'xolo-era-2x',
 'micromax-bharat-go',
 'intex-aqua-note-5.5']

In [143]:
phones_with_snapdragon_values = list(set(phones_with_snapdragon_values))
len(phones_with_snapdragon_values)

691

In [38]:
phones_with_snapdragon_values[:5]

['10.or-g2', '10.or-d2', '10.or-d', '10.or-g', '10.or-e']

In [102]:
processors = list(set(processors))

In [103]:
len(processors)

284

In [104]:
processors[:10]

['',
 'MediaTek Helio P10 (MT6755)',
 'MediaTek Helio P10 M (MT6755M)',
 'MediaTek MT6737',
 'MediaTek Helio P70',
 'Qualcomm Snapdragon 6s Gen 3',
 'Qualcomm Snapdragon 778G Plus',
 'Samsung Exynos 850',
 'Qualcomm',
 'Qualcomm Snapdragon 400 (MSM8926)']

In [39]:
from huggingface_hub import login

In [40]:
import os

In [41]:
login(token=os.environ['HF_API_TOKEN'])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Users/amoghkulkarni/.cache/huggingface/token
Login successful


In [52]:
from openai import OpenAI

In [71]:
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434/v1/")

In [72]:
ollama_client = OpenAI(base_url=OLLAMA_URL, api_key="ollama")

In [68]:
model = SentenceTransformer("all-mpnet-base-v2")



In [54]:
def elastic_search_knn(vector, vector_field="name_vector"):
    """Return elastic search results for given vector."""
    knn_query = {
        "field": vector_field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000
    }

    search_query = {
        'knn': knn_query,
        '_source': ["name", "company", "specifications"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [55]:
def get_mobile_name_results(mobile_name_statement):
    """Return the ES results of the mobile phone asked by user."""
    input_vector = model.encode(mobile_name_statement)
    mobile_context = elastic_search_knn(input_vector, 'specification_vector')

    return mobile_context

In [56]:
def build_prompt(phone_specifications, mobile_context):
    """Build the context using search results and return the prompt."""
    prompt_template = """
You are mobile phone expert. Answer all the specifications of PHONE using the CONTEXT provided from SPECFICATIONS database.
The specifications in the CONTEXT is a json string. Convert the json string to readable format before printing out the answer.
Use only the specifications given in the context to print the specs of the device.
Do not number the specifications passed.
Print only the first result in a markdown format where subsections are indented in readable format.


PHONE: {phone}

CONTEXT:
{context}
""".strip()

    context = ""

    for each in mobile_context:
        context = context + f"\n\nname: {each['name']}\ncompany: {each['company']}\nspecifications: {each['specifications']}"

    prompt = prompt_template.format(phone=phone_specifications, context=context)
    return prompt

In [62]:
def llm(model, client, prompt):
    """Return LLM generation based on context."""
    if model.startswith('ollama'):
        response = ollama_client.chat.completions.create(
            model='phi3',
            messages=[{"role": "user", "content": prompt}]
        )
        answer = response.choices[0].message.content
        tokens = {
            'prompt_tokens': response.usage.prompt_tokens,
            'completion_tokens': response.usage.completion_tokens,
            'total_tokens': response.usage.total_tokens
        }
    else:
        messages = [{"role": "user", "content": prompt}]
        response = client.chat_completion(messages, max_tokens=1000)
        
    return response.choices[0].message.content

In [63]:
def rag(model, client, mobile_phone_statement):
    """Run the RAG flow for the given mobile phone statement."""
    mobile_name_context = get_mobile_name_results(mobile_phone_statement)
    prompt = build_prompt(mobile_phone_statement, mobile_name_context)
    answer = llm(model, client, prompt)

    return answer

In [66]:
mobile_phone_statement = 'Print the list of all iphones present.'

In [73]:
rag('ollama', ollama_client, mobile_phone_statement)

'```json\n\n{\n\n    "iphone": [\n\n        {\n\n            "name": "iPhone SE (1st generation)",\n\n            "release_year": 2016,\n\n            "features": ["3.7-inch LCD display", "A9 chip"]\n\n        },\n\n        {\n\n            "name": "iPhone 8/SE (2nd generation)",\n\n            "release_year": 2curentlyreleasc,\n\n            "features": ["4.7-inch LCD screen", "A11 bionic chip"]\n\n        }\n\n    ]\n\n}\n\n```'

In [74]:
print(_)

```json

{

    "iphone": [

        {

            "name": "iPhone SE (1st generation)",

            "release_year": 2016,

            "features": ["3.7-inch LCD display", "A9 chip"]

        },

        {

            "name": "iPhone 8/SE (2nd generation)",

            "release_year": 2curentlyreleasc,

            "features": ["4.7-inch LCD screen", "A11 bionic chip"]

        }

    ]

}

```


In [75]:
mobile_phone_statement = 'Detailed specifications of iphone 16.'

In [76]:
rag('ollama', ollama_client, mobile_phone_statement)

'\n```json\n\n{\n\n    "Brand": ["Apple"],\n\n    "Model Name": ["iPhone", "XS Max", "2024 Edition"]\n\n}\n\n```\n\n---\n\n**Brand: Apple**  \n\n**Model Name: iPhone XS Max 2024 Edition**'

In [77]:
print(_)


```json

{

    "Brand": ["Apple"],

    "Model Name": ["iPhone", "XS Max", "2024 Edition"]

}

```

---

**Brand: Apple**  

**Model Name: iPhone XS Max 2024 Edition**


In [78]:
budget_df = pd.read_excel('/Users/amoghkulkarni/Downloads/1727893830166VievO8p3MWcxyp7C.xls')

ValueError: Excel file format cannot be determined, you must specify an engine manually.

In [79]:
from pathlib import Path

In [81]:
with open('/Users/amoghkulkarni/Downloads/1727893830166VievO8p3MWcxyp7C.xls', 'r') as file:
    df_budget = pd.read_excel(file)

ValueError: Excel file format cannot be determined, you must specify an engine manually.

In [82]:
!pip install openpyxl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0mm eta [36m0:00:01[0mm
[?25hDownloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.5


In [91]:
data = pd.read_excel("/Users/amoghkulkarni/Downloads/1727893830166VievO8p3MWcxyp7C.xls", engine='xlrd')

XLRDError: Unsupported format, or corrupt file: Expected BOF record; found b'Account '

In [88]:
!pip install html5lib

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.2/112.2 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m[31m1.6 MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: html5lib
Successfully installed html5lib-1.1


In [90]:
!pip install xlrd

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting xlrd
  Downloading xlrd-2.0.1-py2.py3-none-any.whl.metadata (3.4 kB)
Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.5/96.5 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hInstalling collected packages: xlrd
Successfully installed xlrd-2.0.1


In [94]:
data = pd.read_excel("/Users/amoghkulkarni/Downloads/1727893830166VievO8p3MWcxyp7C.xlsx", engine='openpyxl')

BadZipFile: File is not a zip file