In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Record keys that are removed from every product before it is flattened.
cols_to_delete = [
    "sub", "imageUrl", "totalReviews", "certificationStatus",
    "activeStatus", "adminActiveStatus", "ownerAllowStatus",
    "freeLanceDocument", "encryptedId", "qrCodeUrl", "id",
]

# Lookup table used by generate_data(): position == a console's "providerType".
# It has 18 slots (0-17); every slot without a known name stays "Unknown".
social_media = ["Unknown"] * 18
social_media[0:2] = ["Facebook", "Instagram"]
social_media[3] = "Twitter"
social_media[5:10] = ["AppStore", "PlayStore", "Whatsapp", "Telegram", "Website"]
social_media[11] = "TikTok"

In [3]:
def generate_data(path="new_data.jsonl", drop_cols=None, provider_names=None):
    """Lazily yield one cleaned product dict per line of a JSON Lines file.

    For every record the keys listed in ``drop_cols`` are removed and the
    ``consoles`` list is collapsed into a ``socialMedia`` dict that maps a
    provider name to that console's ``url``. Records whose ``consoles`` value
    is missing or empty get no ``socialMedia`` key at all.

    Args:
        path: JSON Lines file to read (one JSON object per line).
        drop_cols: Keys to discard from each record. Defaults to the
            module-level ``cols_to_delete``.
        provider_names: Sequence indexed by a console's ``providerType``.
            Defaults to the module-level ``social_media``.

    Yields:
        dict: The cleaned record.

    Raises:
        OSError: If ``path`` cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Resolve the defaults lazily so the notebook-level tables are looked up
    # at call time rather than at definition time.
    if drop_cols is None:
        drop_cols = cols_to_delete
    if provider_names is None:
        provider_names = social_media

    with open(path, "r", encoding="utf-8") as file:
        # Iterate the handle directly: readlines() would load the whole file
        # into memory, which defeats the purpose of a generator.
        for line in file:
            # strip() instead of line[:-1]: the last line of a file may have
            # no trailing newline, in which case [:-1] chops the closing "}".
            line = line.strip()
            if not line:
                continue  # tolerate blank lines between records
            product = json.loads(line)

            # Keys may be absent on some records; dropping is best-effort.
            for col in drop_cols:
                product.pop(col, None)

            consoles = product.pop("consoles", None)
            if consoles:
                socials = {}
                for console in consoles:
                    code = console["providerType"]
                    # Integer codes outside the table (including negatives,
                    # which would otherwise wrap around) map to "Unknown".
                    if isinstance(code, int) and not 0 <= code < len(provider_names):
                        name = "Unknown"
                    else:
                        name = provider_names[code]
                    # NOTE: consoles that resolve to the same name (e.g. several
                    # "Unknown" codes) overwrite each other; the last one wins.
                    socials[name] = console["url"]
                product["socialMedia"] = socials
            yield product

In [4]:
# Materialise the records once. A bare generator is exhausted by its first
# consumer, so re-running a later cell on its own would silently see no data.
data = list(generate_data())

In [5]:
# Flatten each record into one row; nested dicts become dotted column names
# such as "businessType.name" or "socialMedia.Twitter".
df = pd.json_normalize(data)

In [6]:
# Remove every run of two or more consecutive dots from string values, in place.
df.replace(to_replace=r"\.{2,}", value="", regex=True, inplace=True)

In [7]:
import re
from typing import List

In [8]:
def camel_to_title(string: str) -> str:
    """Turn a dotted camelCase name into space-separated title-cased words.

    The input is split on ".", and within each part every run of one ASCII
    letter followed by one or more lowercase letters is taken as a word,
    e.g. ``"businessType.name"`` -> ``"Business Type Name"``.

    Anything that does not fit that shape is dropped: digits, underscores,
    single letters standing alone, and all but the last letter of an
    uppercase run.

    Args:
        string: Column or key name, optionally dotted.

    Returns:
        The extracted words, each title-cased, joined by single spaces
        (an empty string if no word is found).
    """
    # [A-Za-z] rather than [A-z]: the ASCII range Z..a also contains
    # "[", "\", "]", "^", "_" and "`", which must not start a word.
    pattern = re.compile(r"([A-Za-z][a-z]+)")
    words = [
        word.title()
        for part in string.split(".")
        for word in pattern.findall(part)
    ]
    return " ".join(words)

In [9]:
# Drop rows that are identical across all columns, keeping the first
# occurrence; the frame is modified in place and its index is not reset.
df.drop_duplicates(subset=None, keep="first", inplace=True)

In [10]:
# Prune sparse columns in place: a column survives only if it holds at
# least 2000 non-null values.
df.dropna(axis="columns", thresh=2000, inplace=True)

In [11]:
# Show the rows whose business type name is exactly "Real estate".
df.loc[df["businessType.name"].eq("Real estate")]

Unnamed: 0,userId,nameAr,nameEn,description,rating,otherTypeName,businessType.name,businessType.key,businessType.id,businessSubType.name,...,socialMedia.Whatsapp,socialMedia.Twitter,socialMedia.Facebook,socialMedia.Telegram,socialMedia.Instagram,socialMedia.Website,socialMedia.TikTok,socialMedia.Unknown,cr.number,cr.name
98,bccaa119-c73a-4571-b887-1460fcbfff2a,مؤسسة كراء المساكن للخدمات العقارية,,نقوم بتوثيق عقود الإيجارات السكنية والتجارية و...,5.0,,Real estate,RealEstate,48,Other,...,966564455202,https://twitter.com/kiraa_ksa,https://www.facebook.com/%D9%85%D8%A4%D8%B3%D8...,,,https://www.kiraa-sa.com,,,4030381211,مؤسسة كراء المساكن للخدمات العقارية
113,68edee9d-2ce1-4583-8254-69229f282d06,مؤسسة الحربي الدولية للمقاولات,Alharbi international company,بناء المساجد و المدارس في داخل المملكه وخارج...,5.0,,Real estate,RealEstate,48,Other,...,966506224674,https://twitter.com/hassan102est,,,,https://forms.gle/7yqBy3Cp47yB6PSW7,,,5864525134,مؤسسة الحربي الدولية للمقاولات
353,5cbffccd-0318-4261-aae4-30ff81430a9e,المسكان العقارية لتقديم الخدمات العقارية,AlMouskan to provide real estate services,تقديم الخدمات العقارية باحترافية بيع وشراء وتأ...,5.0,,Real estate,RealEstate,48,Other,...,966546457740,https://twitter.com/almouskan,https://www.facebook.com/profile.php?id=100000...,,,https://www.facebook.com/-431203090385/,,,4030161502,مؤسسة المسكان العقارية
525,605bcdb2-7778-4a9a-8a8d-7b2d8e5b0cd6,ثريا للوساطة العقارية,THURAYA,مؤسسة مختصة بالوساطة العقارية وتقديم الخدمات ا...,5.0,,Real estate,RealEstate,48,Real estate broker,...,966583440437,https://twitter.com/GhareebStore,,,,,https://www.tiktok.com/@thuraya.in?lang=ar,,,
605,34b269c4-c8c0-4c43-9fc2-dbdbf3c4c318,مجموعة هتيلز الفندقية,Hotelz Group,مجموعة فندقية في مكة المكرمة بجوار الحرم المكي...,5.0,,Real estate,RealEstate,48,Other,...,,https://twitter.com/HotelzGroup,,,https://instagram.com/hotelzgroup,www.HotelzGroup.com,,,4031086792,شركة زي الفندقية
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58773,f13d0fc4-93dc-4e9f-b73b-aa9c6f2206a6,سهالة,Ease,نقدم جميع الخدمات العقارية\r\nالوساطة في البيع...,0.0,,Real estate,RealEstate,48,Other,...,966554421041,https://twitter.com/SA_Ease,,,,https://ease.sa,,,4030381304,مؤسسة سهالة للخدمات العقارية
58776,61485c52-7183-4725-be0f-aa5bcd96fdc7,مكتب سعد مطر البلادي للعقار,sadsaf,وسيط عقاري معتمد,0.0,,Real estate,RealEstate,48,Other,...,966555546264,https://twitter.com/sad11811,,,,,,,4031062287,مكتب سعد مطر البلادي للعقار
58825,ae5c6637-0688-4cc0-a198-4b5e8a17a65f,جنى المستقبل,,تأجير شقق مفروشة شهري سنوي,0.0,,Real estate,RealEstate,48,Other,...,966502008500,,,,,https://jana-jed.com,,,4030314515,مؤسسة جنى المستقبل للعقارات
58826,a57f4051-67b4-44fe-9e63-01ddcbc339c5,رواد المقابض,Pioneers of the handles,مواد البناء,0.0,,Real estate,RealEstate,48,Other,...,966554710006,,,,,https://zid.store/pioner,,,2251101132,مؤسسة رواد المقابض التجارية


In [12]:
# Sanity check: (rows, columns) left after de-duplication and column pruning.
df.shape

(58851, 41)

In [14]:
# Export a 50-row spot-check sample without the index column. The fixed
# random_state makes the exported rows identical on every Restart & Run All.
df.sample(50, random_state=42).to_excel('sample.xlsx', index=False)