In [2]:
import http.client
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import concurrent.futures
import threading
import pandas as pd
import gzip
import io

# Thread-local data to store per-thread HTTPSConnection
thread_local = threading.local()

def get_connection():
    if not hasattr(thread_local, 'conn'):
        thread_local.conn = http.client.HTTPSConnection("www.flipkart.com", timeout=10)
    return thread_local.conn

def get_fsn(search, page):
    params = {
        'q': search,
        'page': page
    }
    query_string = urlencode(params)
    path = f"/search?{query_string}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/85.0.4183.102 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
    }

    try:
        conn = get_connection()
        conn.request("GET", path, headers=headers)
        res = conn.getresponse()
        if res.status != 200:
            print(f"Error fetching page {page} for search '{search}': {res.status} {res.reason}")
            res.close()
            return []
        data = res.read()
        res.close()

        # Handle gzip encoding if present
        encoding = res.getheader('Content-Encoding')
        if encoding == 'gzip':
            buf = io.BytesIO(data)
            f = gzip.GzipFile(fileobj=buf)
            data = f.read()

        soup = BeautifulSoup(data, 'html.parser')
        elements_with_data_id = soup.find_all(attrs={'data-id': True})
        data_list = []
        for position, element in enumerate(elements_with_data_id, start=1):
            data_id = element['data-id']
            data_list.append({
                'search_query': search,
                'position': position,
                'page_no': page,
                'data_id': data_id
            })

        return data_list

    except Exception as e:
        print(f"Exception in get_fsn for search '{search}' page {page}: {e}")
        return []

def collect_all_data_ids(search_queries, start_page, end_page):
    all_data = []
    tasks = [(search, page) for search in search_queries for page in range(start_page, end_page + 1)]

    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        future_to_task = {executor.submit(get_fsn, search, page): (search, page) for (search, page) in tasks}

        for future in concurrent.futures.as_completed(future_to_task):
            search, page = future_to_task[future]
            try:
                data_list = future.result()
                print(f"Search '{search}' Page {page}: Found {len(data_list)} data-ids")
                all_data.extend(data_list)
            except Exception as exc:
                print(f"Search '{search}' Page {page} generated an exception: {exc}")

    return all_data

if __name__ == "__main__":
    search_queries = [
        'bluetooth earphone'
    ]
    start_page = 1
    end_page = 25

    all_data = collect_all_data_ids(search_queries, start_page, end_page)
    print(f"Total data-id attributes found: {len(all_data)}")

    # Create DataFrame
    df = pd.DataFrame(all_data, columns=['search_query', 'position', 'page_no', 'data_id'])

    # Now you can print the DataFrame, save it, or process it further
    print(df.head())


Search 'bluetooth earphone' Page 2: Found 40 data-ids
Search 'bluetooth earphone' Page 1: Found 40 data-ids
Search 'bluetooth earphone' Page 3: Found 40 data-ids
Search 'bluetooth earphone' Page 4: Found 40 data-ids
Search 'bluetooth earphone' Page 5: Found 40 data-ids
Search 'bluetooth earphone' Page 6: Found 40 data-ids
Search 'bluetooth earphone' Page 7: Found 40 data-ids
Search 'bluetooth earphone' Page 8: Found 40 data-ids
Search 'bluetooth earphone' Page 9: Found 40 data-ids
Search 'bluetooth earphone' Page 10: Found 40 data-ids
Search 'bluetooth earphone' Page 11: Found 40 data-ids
Search 'bluetooth earphone' Page 12: Found 40 data-ids
Search 'bluetooth earphone' Page 13: Found 40 data-ids
Search 'bluetooth earphone' Page 14: Found 40 data-ids
Search 'bluetooth earphone' Page 15: Found 40 data-ids
Search 'bluetooth earphone' Page 16: Found 40 data-ids
Search 'bluetooth earphone' Page 17: Found 40 data-ids
Search 'bluetooth earphone' Page 18: Found 40 data-ids
Search 'bluetooth e

In [8]:
all_fsns = df['data_id'].to_list()
final_fsn_list = list(set(all_fsns))

In [9]:
from scrapper import flipkart_json_scrapper_with_all_specifications as fk_scrapper

In [11]:
competitor_data=fk_scrapper.scrape_all_fsns(final_fsn_list)

In [12]:
competitor_data['brand'] = competitor_data['title'].str.split(" ").str[0]

In [14]:
brand_level_data=competitor_data

In [19]:
brand_level_data['all_specs'][2]

{'Model ID': 'NB128 Happy 24 Hours Playing Time | Deep Bass| Made In India Neckband Earphone',
 'Color': 'Black, Yellow',
 'Headphone Type': 'In the Ear',
 'Inline Remote': 'Yes',
 'Sales Package': '1 Bluetooth Headset',
 'Connectivity': 'Bluetooth',
 'Headphone Design': 'Behind the Neck',
 'Compatible Devices': 'Mobile, Laptop, Tablet',
 'Net Quantity': '1',
 'Sweat Proof': 'Yes',
 'Foldable/Collapsible': 'Yes',
 'Deep Bass': 'Yes',
 'Water Resistant': 'No',
 'Monaural': 'No',
 'Designed For': 'Android, iOS, Windows',
 'Driver Type': 'Dynamic',
 'Other Features': 'Smart Voice Assistant, Dual Device Pairing, Type-C Quick Charge',
 'Headphone Driver Units': '12 mm',
 'With Microphone': 'Yes',
 'Minimum Frequency Response': '20 Hz',
 'Maximum Frequency Response': '20000 Hz',
 'Wireless Type': 'Bluetooth',
 'Wireless Range': '10 m',
 'Bluetooth Version': 'v5.0',
 'Bluetooth Range': '10 m',
 'Headphone Power Source': 'Battery',
 'Power Supply': 'Battery Powered',
 'Battery Life': '24 hr',


In [15]:
for index, row in brand_level_data.iterrows():
    specs = row['all_specs']
    for key, value in specs.items():
        if key not in brand_level_data.columns:
            brand_level_data[key] = None
        brand_level_data.at[index, key] = value

AttributeError: 'float' object has no attribute 'items'