### Web scraping for [Maroof website]

In [1]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests

### Get store IDs

In [5]:
# import requests
# import pandas as pd
# from concurrent.futures import ThreadPoolExecutor, as_completed

# Paging configuration for the store-search endpoint.
# NOTE(review): LAST_SKIP_COUNT hard-codes how far the search is paged; it
# presumably matches the catalogue size when the notebook was written —
# raise it if the site lists more stores.
PAGE_SIZE = 500
LAST_SKIP_COUNT = 66000

SEARCH_URL_TEMPLATE = (
    "https://api.thiqah.sa/maroof/public/api/app/business/search"
    "?keyword=&businessTypeId=&businessTypeSubCategoryId=&regionId=&cityId="
    "&certificationType=&sortBy=2&sortDirection=2&sorting="
    "&skipCount={skip}&maxResultCount={page_size}"
)

# API URLs: one per result page, skipCount = 0, 500, ..., 66000
api_urls = [
    SEARCH_URL_TEMPLATE.format(skip=skip, page_size=PAGE_SIZE)
    for skip in range(0, LAST_SKIP_COUNT + 1, PAGE_SIZE)
]

# API key and headers
# The key can be supplied through the MAROOF_API_KEY environment variable so it
# does not have to live in the notebook; the literal is only a fallback.
# NOTE(review): drop the fallback once the key is provisioned via the environment.
api_key = os.environ.get("MAROOF_API_KEY", "c1qesecmag8GSbxTHGRjfnMFBzAH7UAN")
headers = {'apikey': api_key}

# Fetch store information from API
def fetch_store_info(api_url, min_reviews=10, timeout=30):
    """Fetch one page of store search results and keep the well-reviewed stores.

    Args:
        api_url: Fully formed search URL for a single result page.
        min_reviews: Minimum ``totalReviews`` a store needs in order to be kept.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without it a stalled connection would
            block its worker thread indefinitely.

    Returns:
        A list of dicts, one per kept store. The list is empty when the
        response status is not 200, the body is not valid JSON, or no store
        on the page reaches ``min_reviews``.

    Network errors raised by ``requests.get`` (e.g. timeouts) are not caught
    here; they propagate to the caller.
    """
    response = requests.get(api_url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to retrieve data from {api_url}")
        return []
    print("Done")

    try:
        data = response.json()
    except ValueError as e:
        # A 200 response can still carry a non-JSON body; skip the page
        print(f"Error parsing JSON from {api_url}: {e}")
        return []

    store_info = []

    # Keep only stores with at least `min_reviews` reviews.
    # `or` fallbacks cover fields that are present but null in the JSON.
    for item in data.get('items') or []:
        if (item.get('totalReviews') or 0) < min_reviews:
            continue
        business_type = item.get('businessType') or {}
        store_info.append({
            'Name': item.get('name'),
            'NameAr': item.get('nameAr'),
            'LocalizedName': item.get('localizedName'),
            'BusinessType': business_type.get('name'),
            'OtherTypeName': item.get('otherTypeName'),
            'IsPopularBusiness': item.get('isPopularBusiness'),
            'TotalReviews': item.get('totalReviews'),
            'Rating': item.get('rating'),
            'Id': item.get('id'),
            'ActiveStatus': item.get('activeStatus'),
            'OwnerAllowStatus': item.get('ownerAllowStatus'),
            'CertificationStatus': item.get('certificationStatus')
        })

    return store_info

# Fetch store info in parallel
def fetch_store_info_parallel(api_urls, max_workers=20):
    """Fetch several search pages concurrently and merge their store records.

    Args:
        api_urls: Iterable of page URLs; each is passed to ``fetch_store_info``.
        max_workers: Size of the thread pool used for the downloads.

    Returns:
        A flat list of store dicts from every page that succeeded. Pages are
        merged as they finish, so the order follows completion, not
        ``api_urls``. A page whose fetch raises is reported and skipped.
    """
    all_store_info = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(fetch_store_info, url): url for url in api_urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                result = future.result()
                if result:
                    all_store_info.extend(result)
            except Exception as exc:
                # Name the failing page so it can be re-fetched later
                print(f"{url} generated an exception: {exc}")

    return all_store_info

# Fetch store information and put into dataframe
store_info = fetch_store_info_parallel(api_urls)

# Convert to DataFrame
stores_df = pd.DataFrame(store_info)

# Display the dataframe
stores_df


Done
Done
Done
Done
Done
Done


In [19]:
# Sanity check: (rows, columns) of the scraped store table
stores_df.shape

(66400, 12)

In [20]:
# Persist the scraped store table; index=False keeps the row index out of the file
stores_df.to_csv('all_stores.csv', index=False)


In [5]:
# Keep only the stores that have at least 10 reviews
df10 = stores_df.loc[stores_df['TotalReviews'].ge(10)]

In [None]:
# Persist the filtered stores. index=False matches how all_stores.csv is
# written and avoids a spurious unnamed index column when the file is read back.
df10.to_csv('10_Review_Stores.csv', index=False)

### Get store comments and reviews

In [6]:
# How many stores share each review count, most common count first
df10['TotalReviews'].value_counts(ascending=False)

TotalReviews
11     107
10      76
12      62
13      58
15      47
      ... 
255      1
967      1
98       1
171      1
656      1
Name: count, Length: 167, dtype: int64

In [4]:
# Sanity check: (rows, columns) of the filtered store table
df10.shape

(66400, 12)

In [2]:
# Reload the store table from the CSV file.
# NOTE(review): `df` is not referenced by any other cell visible here (the
# review download uses `df10`) — confirm whether this reload is still needed.
df = pd.read_csv('all_stores.csv')

In [3]:

# URL template for one store's reviews; {store_id} is filled in per request
api_url_template = "https://api.thiqah.sa/maroof/public/api/app/business/{store_id}/reviews?skipCount=0&maxResultCount=10000&sortColumn=&sortDirection=2&onlyReviewsWithComments=false"

# API key
# The key can be supplied through the MAROOF_API_KEY environment variable so it
# does not have to live in the notebook; the literal is only a fallback.
# NOTE(review): drop the fallback once the key is provisioned via the environment.
api_key = os.environ.get("MAROOF_API_KEY", "c1qesecmag8GSbxTHGRjfnMFBzAH7UAN")
headers = {
    'apikey': api_key
}

def get_store_reviews(store_id, timeout=30):
    """Download the reviews of a single store.

    Args:
        store_id: Store identifier substituted into ``api_url_template``.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without it a stalled connection would
            block its worker thread indefinitely.

    Returns:
        A list with one dict per review (keys ``BusinessId``, ``review_id``,
        ``rating``, ``comment``, ``creation_date``), or ``None`` when the
        response status is not 200 or the body is not valid JSON.

    Network errors raised by ``requests.get`` (e.g. timeouts) are not caught
    here; they propagate to the caller.
    """
    api_url = api_url_template.format(store_id=store_id)

    response = requests.get(api_url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print("Failed to retrieve data.", store_id)
        return None

    try:
        data = response.json()
    except ValueError as e:
        # Name the store so the failed download can be identified and retried
        print(f"Error parsing JSON for store {store_id}: {e}")
        return None

    # Guard against an `items` field that is missing or null
    return [
        {
            'BusinessId': store_id,
            'review_id': item.get('id'),
            'rating': item.get('rating'),
            'comment': item.get('comment'),
            'creation_date': item.get('creationDate')
        }
        for item in data.get('items') or []
    ]

def fetch_reviews_parallel(store_ids):
    """Download reviews for many stores using a pool of 20 worker threads.

    Each id in ``store_ids`` is handed to ``get_store_reviews``. Non-empty
    results are appended to one flat list in the order the downloads finish.
    A download that raises is reported together with its store id and skipped.
    """
    collected = []
    with ThreadPoolExecutor(max_workers=20) as pool:
        # Remember which store each pending download belongs to
        pending = {}
        for sid in store_ids:
            pending[pool.submit(get_store_reviews, sid)] = sid

        for finished in as_completed(pending):
            store_id = pending[finished]
            try:
                batch = finished.result()
                if batch:
                    collected += batch
            except Exception as exc:
                print(f"{store_id} generated an exception: {exc}")
    return collected


# Fetch reviews for each store in parallel
# Download the reviews of every store id in df10 concurrently
all_reviews = fetch_reviews_parallel(list(df10['Id'].values))

# Build a DataFrame with one row per review
df_reviews = pd.DataFrame(all_reviews)

# Print a label, then leave the frame as the last expression so it is rendered
print("DataFrame:")
df_reviews


115639 generated an exception: HTTPSConnectionPool(host='api.thiqah.sa', port=443): Max retries exceeded with url: /maroof/public/api/app/business/115639/reviews?skipCount=0&maxResultCount=10000&sortColumn=&sortDirection=2&onlyReviewsWithComments=false (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000024786BA9D50>, 'Connection to api.thiqah.sa timed out. (connect timeout=None)'))
Error parsing JSON: Expecting value: line 1 column 1 (char 0)
Error parsing JSON: Expecting value: line 1 column 1 (char 0)
Error parsing JSON: Expecting value: line 1 column 1 (char 0)
Error parsing JSON: Expecting value: line 1 column 1 (char 0)
Error parsing JSON: Expecting value: line 1 column 1 (char 0)
Error parsing JSON: Expecting value: line 1 column 1 (char 0)
Error parsing JSON: Expecting value: line 1 column 1 (char 0)
Error parsing JSON: Expecting value: line 1 column 1 (char 0)
Error parsing JSON: Expecting value: line 1 column 1 (char 0)
Error parsing JSON: Expec

In [8]:
# Number of downloaded reviews per store, largest first
df_reviews['BusinessId'].value_counts(ascending=False)

BusinessId
39976     4596
77092     3333
48246     1813
24519     1553
257509    1204
          ... 
95661       10
91678       10
86949       10
95180       10
60202       10
Name: count, Length: 1020, dtype: int64

In [9]:
# Persist the downloaded reviews. index=False matches how all_stores.csv is
# written and avoids a spurious unnamed index column when the file is read back.
df_reviews.to_csv('10_Reviews.csv', index=False)