In [1]:
import pandas as pd
import numpy as np
import json
import re

In [2]:
with open('results.jsonl', 'r', encoding='utf-8') as file:
    data = (json.loads(line[:-1]) for line in file.readlines())

In [3]:
df = pd.DataFrame(data)

In [4]:
df = df[[
    'اسم الشركة',
    'website',
    'وصف الشركة',
    'عنوان الشركة',
    'الهاتف',
    'email',
    'المدينة',
    'القسم',
    'تاريخ الإضافة:',
    'إتصل',
    'واتس أب',
    'عبر الهاتف',
    'رسالة عبر الأيميل',
    # 'link',
]]

In [5]:
df.drop_duplicates(inplace=True)

In [6]:
df.columns = [
    'title',
    'website',
    'description',
    'address',
    'phone',
    'email',
    'city',
    'category',
    'add_date',
    'phone_2',
    'whatsapp',
    'phone_3',
    'email_2',
    # 'link',
]

In [7]:
def cf_decode_email(encodedString):
    r = int(encodedString[:2], 16)
    email = ''.join([chr(int(encodedString[i:i+2], 16) ^ r)
                    for i in range(2, len(encodedString), 2)])
    return email

In [8]:
def parse_email(email):
    if type(email) == str:
        enc_email = email.split('#')[-1]
        return cf_decode_email(enc_email)
    else:
        return email

In [9]:
df['email_2'] = df['email_2'].apply(parse_email)

In [10]:
df['whatsapp'] = df['whatsapp'].str.extract(r'(\d+)')

In [11]:
df['phone_2'] = df['phone_2'].str.extract(r'(\d+)')

In [12]:
df['phone_3'] = df['phone_3'].str.extract(r'(\d+)')

In [13]:
phones_df = df['phone'].apply(lambda x: re.compile(
    r'\d{4}\s?\d{4}\b').findall(str(x))).apply(pd.Series)

In [14]:
phones_df.dropna(thresh=len(phones_df)*0.1, axis='columns', inplace=True)

In [15]:
phones_df.columns = ['phone_e_1', 'phone_e_2', 'phone_e_3']

In [16]:
phones_df['phone_e_1'] = phones_df['phone_e_1'].str.replace(' ',
                                                            '',
                                                            regex=True)
phones_df['phone_e_2'] = phones_df['phone_e_2'].str.replace(' ',
                                                            '',
                                                            regex=True)
phones_df['phone_e_3'] = phones_df['phone_e_2'].str.replace(' ',
                                                            '',
                                                            regex=True)

In [17]:
df = pd.concat([df, phones_df], axis='columns')

In [18]:
df['email'].replace(np.nan, '', inplace=True)

In [19]:
df['email_final'] = df.apply(lambda x: x['email_2'] if x['email'] == '' else x['email'],
                             axis='columns')

In [20]:
df.drop(columns=['email', 'email_2'], inplace=True)
df.rename({'email_final': 'email'}, inplace=True, axis='columns')

In [21]:
df = df[[
    'title',
    'category',
    'description',
    'city',
    'address',
    'email',
    'website',
    'whatsapp',
    'phone_2',
    'phone_3',
    'phone_e_1',
    'phone_e_2',
    'phone_e_3',
    'phone',
    'add_date',
    # 'link',
]]

In [22]:
df.rename({
    'phone_2': 'phone_1',
    'phone_3': 'phone_2',
    'phone_e_1': 'phone_3',
    'phone_e_2': 'phone_4',
    'phone_e_3': 'phone_5',
    'phone': 'other_contact_details',
}, axis='columns', inplace=True)

In [23]:
df = df.reset_index(drop=True)

In [24]:
df

Unnamed: 0,title,category,description,city,address,email,website,whatsapp,phone_1,phone_2,phone_3,phone_4,phone_5,other_contact_details,add_date
0,جست جيم Just-Gym,شركات الاجهزة الرياضية,جست جيم Just-Gym,الدوحة,"جست,جيم,,Al,Sad,Sports,Club,,Al,Wa'ab,st.",reception@mtmgroup.com.qa,https://www.facebook.com/Just-Gym-119294094816336,,,,44594211,,,97444594211,1/1/2019
1,Ali Bin Ali Travel,شركات السياحة والسفر,"Ali Bin Ali Travel, Tourism &amp; Cargo divisi...",الدوحة,"Al Sadd, Al Sadd Commercial Center - Al Sadd s...",travel@alibinali.com,https://www.alibinalitravel.com/,97444441161,97444441161,97444441161,,,,,24/5/2023
2,Rosabella_Beauty Center,مراكز صالونات التجميل,DIVE INTO THE WORLD OF BEAUTY\r\nYOUR BEAUTY O...,الدوحة,"Zkreet 595 , Al Khuraytiyat, Qatar",Rosabellaqatar@gmail.com,https://rosabella-beauty-center.business.site/...,97450910153,97450910153,97450910153,,,,,20/6/2023
3,Qatar Factory,شركات الأمن وكاميرات المراقبة,Fire Protection is a way of life for QATAR FAC...,الدوحة,"Building No 195, Zone 81, Street 23 K, New, Do...",info@qatarfactory.qa,http://www.qatarfactory.qa/,97444114630,97444114630,97444025888,,,,,7/6/2023
4,Contour Boutique Saloon,مراكز صالونات التجميل,Contour Boutique offers world-class hair dress...,الدوحة,"Doha, Qatar",contourboutiquedoha@gmail.com,https://contourbotiquedoha.wixsite.com/main,97450960456,97450960456,97450960456,,,,,20/6/2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78588,OSOUL TRADING & CONTRACTING WLL,شركات المقاولات,construction,الدوحة,doha,osoul2016@gmail.com,http://www.dirqtr.com,,,,66553859,,,Phone 66553859,9/10/2018
78589,TECHNICAL DEVELOPMENT OF ELECTRO MECHANICAL CO...,شركات المقاولات,construction,الدوحة,doha,mohammad.a@tdec.qa,http://www.tdec.qa/,,,,44171663,33171778,33171778,Phone 44171663 33171778,9/10/2018
78590,AL AREEKAH CO WLL,شركات المقاولات,construction,الدوحة,doha,info@alareekah.com,http://www.alareekah.com/,,,,44602219,55835230,55835230,44602219 55835230,13/10/2018
78591,WHITE STROK TRADING CONTRACTING & CLEANING WLL,شركات المقاولات,construction,الدوحة,doha,wstorkq@gmail.com,http://www.dirqtr.com,,,,33031270,,,Phone 33031270,9/10/2018
