In [2]:
import http.client
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import concurrent.futures
import threading
import pandas as pd
import gzip
import io
from fake_useragent import UserAgent
ua = UserAgent()

# Thread-local data to store per-thread HTTPSConnection
thread_local = threading.local()

def get_connection():
    if not hasattr(thread_local, 'conn'):
        thread_local.conn = http.client.HTTPSConnection("www.flipkart.com", timeout=10)
    return thread_local.conn

def get_fsn(search, page):
    params = {
        'q': search,
        'page': page
    }
    query_string = urlencode(params)
    path = f"/search?{query_string}"
    headers = {
        'User-Agent': ua.random,
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
    }

    try:
        conn = get_connection()
        conn.request("GET", path, headers=headers)
        res = conn.getresponse()
        if res.status != 200:
            print(f"Error fetching page {page} for search '{search}': {res.status} {res.reason}")
            res.close()
            return []
        data = res.read()
        res.close()

        # Handle gzip encoding if present
        encoding = res.getheader('Content-Encoding')
        if encoding == 'gzip':
            buf = io.BytesIO(data)
            f = gzip.GzipFile(fileobj=buf)
            data = f.read()

        soup = BeautifulSoup(data, 'html.parser')
        elements_with_data_id = soup.find_all(attrs={'data-id': True})
        data_list = []
        for position, element in enumerate(elements_with_data_id, start=1):
            data_id = element['data-id']
            data_list.append({
                'search_query': search,
                'position': position,
                'page_no': page,
                'data_id': data_id
            })

        return data_list

    except Exception as e:
        print(f"Exception in get_fsn for search '{search}' page {page}: {e}")
        return []

# 

In [30]:
import http.client
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import concurrent.futures
import threading
import pandas as pd
import gzip
import io
from fake_useragent import UserAgent
import time

ua = UserAgent()
thread_local = threading.local()

MAX_RETRIES = 3  # Maximum retries for each request

def get_connection():
    if not hasattr(thread_local, 'conn'):
        thread_local.conn = http.client.HTTPSConnection("www.flipkart.com", timeout=10)
    return thread_local.conn

def get_fsn(search, page, retries=MAX_RETRIES):
    try:
        params = {
            'q': search,
            'page': page
        }
        query_string = urlencode(params)
        path = f"/search?{query_string}"
        headers = {
            'User-Agent': ua.random,
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive',
        }


        conn = get_connection()
        conn.request("GET", path, headers=headers)
        res = conn.getresponse()
        if res.status != 200:
            print(f"Error fetching page {page} for search '{search}': {res.status} {res.reason}")
            res.close()


        data = res.read()
        res.close()

        # Handle gzip encoding if present
        encoding = res.getheader('Content-Encoding')
        if encoding == 'gzip':
            buf = io.BytesIO(data)
            f = gzip.GzipFile(fileobj=buf)
            data = f.read()

        soup = BeautifulSoup(data, 'html.parser')
        elements_with_data_id = soup.find_all(attrs={'data-id': True})
        data_list = []
        for position, element in enumerate(elements_with_data_id, start=1):
            data_id = element['data-id']
            data_list.append({
                'search_query': search,
                'position': position,
                'page_no': page,
                'data_id': data_id
            })

        return data_list  # Return successful result list
    except:
        pass


# Usage example with concurrent futures, storing successful and failed outputs separately
def fetch_all_fsn(barcode_list):
    output = []
    failed = []
    max_workers = 50

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(get_fsn, barcode, 1): barcode for barcode in barcode_list}
        
        for future in concurrent.futures.as_completed(futures):
            barcode = futures[future]
            result = future.result()
            
            if result is not None:
                output.extend(result)  # Add successful results to output
            else:
                failed.append(barcode)  # Log failed barcodes in the failed list

    return output, failed




In [17]:
from piTask import general

In [18]:
input_sheet='1_20dFg5dw64P7A6_EjErSwNrc8cNMF0JAP1Rd7TfCgM'

In [33]:
df=general.read_sheet(input_sheet,'Sheet1',1)

d:\personal_git\data science\portfolio projects\competition_analysis\key1.json


In [34]:
import pandas as pd

In [35]:
all_data=[]

In [46]:

barcode_list=df['Barcode'].tolist()
for i in range(10):
    print(len(barcode_list))
    output, failed = fetch_all_fsn(barcode_list)
    all_data.extend(output)
    df_ssipl=pd.DataFrame(output)
    vol1=df_ssipl[df_ssipl['position']==1]
    # all_data.append(vol1)
    scraped=list(set(vol1['search_query'].tolist()))
    df=df[~df['Barcode'].isin(scraped)]
    barcode_list=df['Barcode'].tolist()

4863


KeyError: 'position'

In [47]:
df

Unnamed: 0,Item Nature.,HSN CODE.,Brand.,Season Desc.,Season sub Group,Sub Category2 Desc,Sub Category2 Code,GENDER.,Sub Category Desc,Sub Category Code,...,Barcode,Color,Item Name,Item ID,MRP,Size,Total,Sizes option,Remarks,vlookup
0,2024,61034200,PUMA,NON-WINTER,SS24,CREW,CREW,MEN,TOP,TOP,...,4067979134309,ROZE QUART,ESS Stylized Colorbl,68194547,2499,XS,1,5,Retail Articles Apparels,#REF!
1,2024,61034200,PUMA,NON-WINTER,SS24,LOWER,LOWER,MEN,BOTTOM,BOTTOM,...,4065453841743,DARK GRAY,Mens Formstrip Logo,67530207,2999,M,1,3,Retail Articles Apparels,#REF!
2,2024,61034200,PUMA,NON-WINTER,SS24,LOWER,LOWER,MEN,BOTTOM,BOTTOM,...,4065453841767,DARK GRAY,Mens Formstrip Logo,67530207,2999,XL,1,3,Retail Articles Apparels,#REF!
3,2024,61034200,PUMA,NON-WINTER,SS24,LOWER,LOWER,MEN,BOTTOM,BOTTOM,...,4067979128360,PUMA BLACK,Slub Pants,67293398,2999,XS,1,6,Retail Articles Apparels,#REF!
4,2024,61034200,PUMA,NON-WINTER,SS24,LOWER,LOWER,MEN,BOTTOM,BOTTOM,...,4067979128377,PUMA BLACK,Slub Pants,67293398,2999,S,7,6,Retail Articles Apparels,#REF!
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6376,SU23,64041190,NIKE,FOOTWEAR,SU23,LACEUP,LACEUP,WOMEN,SHOE,SHOE,...,196604450485,100,WMNS NIKE QUEST 5 PRM,FB6944-100,7095,6,1,2,Retail Articles Apparels,#REF!
6377,SU23,64041190,NIKE,FOOTWEAR,SU23,SHOE,SHOE,MEN,SHOE,SHOE,...,195237002658,BLACK/BLAC,NIKE AIR MAX AP,CU4826-001,8295,7,3,4,Retail Articles Apparels,#REF!
6378,SU23,64041190,NIKE,FOOTWEAR,SU23,SHOE,SHOE,MEN,SHOE,SHOE,...,195237002672,1,NIKE AIR MAX AP,CU4826-001,8295,8,8,4,Retail Articles Apparels,#REF!
6379,SU23,64041190,NIKE,FOOTWEAR,SU23,SHOE,SHOE,MEN,SHOE,SHOE,...,195237002696,1,NIKE AIR MAX AP,CU4826-001,8295,9,7,4,Retail Articles Apparels,#REF!


In [43]:
len(all_data)

4478

In [51]:
final_df=pd.DataFrame(all_data)

In [54]:
final_df=final_df[final_df['position']==1]

In [55]:
final_df.to_csv('ssipl.csv')

In [89]:
df_ssipl=df_ssipl.drop_duplicates(subset='search_query')

In [90]:
df_ssipl.to_csv('ssipl.csv')

In [48]:
barcode_data=df_ssipl[df_ssipl['position'] ==1]['search_query'].tolist()


In [50]:
barcode_unique_data=list(set(barcode_data))

In [55]:
data_through_barcode_missing=df[~df['Barcode'].isin(barcode_unique_data)]

In [68]:
data_through_barcode_missing['key']=data_through_barcode_missing['Brand.']+" "+data_through_barcode_missing['Item Name'].tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_through_barcode_missing['key']=data_through_barcode_missing['Brand.']+" "+data_through_barcode_missing['Item Name'].tolist()


In [74]:
st_list=data_through_barcode_missing['key'].to_list()

In [79]:
st_list=list(set(st_list))

In [80]:
len(st_list)

154

In [81]:
output=[]
for barcode in st_list:
    print(barcode)
    fsn_list=get_fsn(barcode, 1)
    output.extend(fsn_list)

ASICS AHQ DYNABLAST 4 MEN STANDARD
Exception in get_fsn for search 'ASICS AHQ DYNABLAST 4 MEN STANDARD' page 1: Remote end closed connection without response
NIKE NIKE FUNDAMENTAL SPEED ROPE
NIKE NIKE BIG MOUTH BOTTLE 2.0 22 OZ GRAPHIC
NIKE NIKE AIR MAX SOLO
NIKE Quest 5
NIKE NIKE COURT LEGACY NN
NIKE WMNS NIKE QUEST 5 PRM
NIKE FB5369-512 U NK CLUB CAP U CB SWSH L
NIKE Waffle Nav
NIKE NIKE ADVANTAGE KNITTED ELBOW SLEEVE
ASICS SOLUTION SWIFT FF
NIKE Revolution 7
NIKE Journey Run
NIKE Downshifter 13
NIKE Interact Run
NIKE NIKE VERSA TACK 8P
NIKE NIKE TR RECHARGE TWIST BOTTLE 24 OZ
ASICS SOLUTION SPEED FF 3
NIKE NIKE M EXTREME FG
NIKE NIKE EVERYDAY PLAYGROUND 8P DEFLATED
NIKE NIKE REFUEL BOTTLE LOCKING LID 24 OZ
NIKE AIR ZOOM PEGASUS 40
NIKE Full Force Lo
NIKE NIKE HEADBAND NBA
NIKE NIKE FREE RN FK NEXT NATURE
NIKE JORDAN JUMPMAN TERRY HEADBAND
NIKE FB5369-010 U NK CLUB CAP U CB SWSH L
NIKE ELBOW SLEEVE
ASICS AHQ GEL-DEDICATE 8 MEN STANDARD
DOUBLEU V SHAPE CLASSIC MEN PAINTED
NIKE AS M NK

In [91]:
ssipl_model_level=pd.DataFrame(output)

In [92]:
ssipl_model_level.to_csv("ssipl model level.csv")

In [78]:
ssipl_model_level

Unnamed: 0,search_query,position,page_no,data_id
0,DOUBLEU SAKURA STRAP BUCKLE (W2),1,1,SBWGNFY7PJZAHSSZ
1,DOUBLEU SAKURA STRAP BUCKLE (W2),2,1,SBWGM3NVEUV6FJFF
2,DOUBLEU SAKURA STRAP BUCKLE (W2),3,1,SBWGM3KTBMBUSGGA
3,DOUBLEU SAKURA STRAP BUCKLE (W2),4,1,SBWGM3ZF5QJY2SWD
4,DOUBLEU SAKURA STRAP BUCKLE (W2),5,1,SBWGMWH8PAGW3MUG
...,...,...,...,...
12107,NIKE NIKE MOVE YOGA MAT 4 MM,36,1,SMTHYZYQHQ5UJJAX
12108,NIKE NIKE MOVE YOGA MAT 4 MM,37,1,SMTHYZYQXZZH5ZY5
12109,NIKE NIKE MOVE YOGA MAT 4 MM,38,1,SMTGXRKGGU5WR7SS
12110,NIKE NIKE MOVE YOGA MAT 4 MM,39,1,SMTHYZYPGSNFE6Z7
