# Cleaning scraped data from the Wuzzuf website

In [2]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import plotly.express as px

## Loading the DataFrame from a CSV file

In [3]:
# Read the scraped job postings from the CSV export (path is relative to the
# notebook's working directory).
data = pd.read_csv(filepath_or_buffer='jobs Dec-11-2022.csv')

In [4]:
# Preview the first five rows to see which columns were loaded.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,country,city,min_salary,max_salary,currency_,vacancies,date_posted,date_expire,...,hot_score,description,requirements,roles,types,gender_pref,education_pref,bachelor_required,min_experience,company_id
0,b0b8062b-1938-4b28-9df1-a72e8cacc2d4,Sales Pharmacists,Egypt,Dakahlia,,,,10,12/10/2022 15:29:18,02/08/2023 15:29:18,...,0,<ul><li>Responding to Customers inquiries eith...,<ul><li>Pharmacy Graduates.</li><li>Fresh or e...,"['Sales/Retail', 'Pharmaceutical']",['Full Time'],,Bachelor's Degree,False,,81314.0
1,8157a7d6-8ec3-40f1-9a41-dcf9b0035b9b,Architect,Egypt,Cairo,,,,1,12/10/2022 15:27:29,02/08/2023 15:27:29,...,2,<ul><li>Control project from start to finish t...,<p>&nbsp;</p><ul><li>Bachelor’s Degree in Arch...,['Engineering - Construction/Civil/Architectur...,['Full Time'],males_only,Bachelor's Degree,False,1.0,25945.0
2,e109f748-3857-4b5d-b932-1574e55b9e09,Customer Service,Egypt,Cairo,5000.0,7000.0,"{'id': 43, 'name': 'Egyptian Pound', 'code': '...",1,12/10/2022 15:14:58,02/08/2023 15:14:58,...,12,<ul><li>Resolves problems by clarifying the cu...,<ul><li></li></ul>,['Accounting/Finance'],['Full Time'],,Not Specified,False,1.0,21702.0
3,a7d3c53b-2fc5-4557-967c-fa05ead249ee,Digital Community Facilitator - (For Mothers),Egypt,Cairo,4000.0,5000.0,"{'id': 43, 'name': 'Egyptian Pound', 'code': '...",4,12/10/2022 15:01:39,02/08/2023 15:01:39,...,2,<p>For stay-at-home bookworm mothers! We have ...,<ul><li>Dedication of 7 hours/day for work. 18...,"['Customer Service/Support', 'Writing/Editoria...",['Work From Home'],females_only,Bachelor's Degree,False,1.0,25970.0
4,0d0730f1-3c16-4bd3-8fef-e2f95829a1b0,Digital Marketing & Sales Associate (For Mothers),Egypt,Cairo,4000.0,5000.0,"{'id': 43, 'name': 'Egyptian Pound', 'code': '...",4,12/10/2022 14:59:36,02/08/2023 14:59:36,...,2,<p>Only for stay-at-home bookworm mothers; we ...,<ul><li>Fluent command of English both written...,"['Marketing/PR/Advertising', 'Sales/Retail', '...",['Work From Home'],females_only,Bachelor's Degree,False,1.0,25970.0


#### Renaming columns

In [5]:
# Give the unnamed first column (UUID-like posting identifiers) and the
# trailing-underscore currency column clean names. The result is rebound
# instead of using inplace=True so the cell stays chainable and re-runnable.
data = data.rename(columns={'Unnamed: 0': 'id', 'currency_': 'currency'})

In [6]:
# Keep only the first seven columns: id, title, country, city, min_salary,
# max_salary and currency. .copy() makes the subset an independent frame, so
# the column assignments in the following cells do not write into a slice of
# the original frame (which raises SettingWithCopyWarning).
data = data.iloc[:, :7].copy()

In [7]:
# The raw currency column holds a stringified dict such as
# "{'id': 43, 'name': 'Egyptian Pound', 'code': '...'}". Capture the three
# characters that follow 'code'; findall returns a list of matches per row.
data['currency'] = data.currency.str.findall(pat=r"'code': '(...)'")

In [8]:
def unpack_list(x):
    """Return the first element of ``x``, or ``x`` itself if it has none.

    Intended for collapsing the per-row lists produced by ``str.findall``
    into a single value.

    Parameters
    ----------
    x : object
        Typically a list of regex matches, or a missing value (NaN).

    Returns
    -------
    object
        ``x[0]`` when ``x`` is subscriptable and has an element at 0.
        ``x`` unchanged when it is a string, is not subscriptable (e.g. a
        NaN float), is an empty sequence, or is a mapping without key 0.
    """
    # A string is already an unpacked value. Indexing it would cut it down to
    # its first character ('EGP' -> 'E') if the calling cell is run twice.
    if isinstance(x, str):
        return x
    try:
        return x[0]
    except (TypeError, IndexError, KeyError):
        # Only the "nothing to unpack" errors are handled; anything else
        # (including KeyboardInterrupt) propagates instead of being swallowed.
        return x

In [9]:
# Collapse each list of matches to its first element so the column holds the
# currency code itself; missing values pass through unchanged.
data['currency'] = data.currency.apply(unpack_list)

### Analysis of the minimum and maximum salaries

In [10]:
# Mean minimum and maximum salary per Egyptian city, highest maximum first.
(
    data.loc[data['country'].eq('Egypt')]
    .groupby('city')[['min_salary', 'max_salary']]
    .mean()
    .sort_values(by='max_salary', ascending=False)
)

Unnamed: 0_level_0,min_salary,max_salary
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Monufya,17666.666667,20666.666667
Red Sea,11666.666667,14000.0
Qalubia,9875.0,12250.0
Beni Suef,6500.0,11625.0
Cairo,7826.460548,11174.341385
Matruh,6500.0,11000.0
Fayoum,8000.0,10000.0
Giza,6894.08,9791.2
Gharbia,6500.0,9750.0
Alexandria,6783.333333,9416.666667


In [11]:
# Number of postings per Egyptian city. The title lets the figure stand on
# its own when the notebook is skimmed.
px.bar(
    data[data['country'] == 'Egypt'].city.value_counts(),
    title='Number of job postings per city (Egypt)',
)

In [12]:
# Mean minimum and maximum salary per Egyptian city, highest maximum first.
# barmode='group' puts the two averages side by side; the default stacks them,
# which makes the bar height a meaningless min + max sum.
px.bar(
    data[data['country'] == 'Egypt']
    .groupby('city')[['min_salary', 'max_salary']]
    .mean()
    .sort_values('max_salary', ascending=False),
    barmode='group',
    title='Average minimum and maximum salary per city (Egypt)',
)

In [13]:
# Twenty Alexandria postings with the highest maximum salary; the first
# column (id) is dropped from the display.
(
    data.loc[data['city'].eq('Alexandria')]
    .sort_values(by='max_salary', ascending=False)
    .head(20)
    .iloc[:, 1:]
)

Unnamed: 0,title,country,city,min_salary,max_salary,currency
5496,Full Stack Developer (ASP.Net Core & Angular 9...,Egypt,Alexandria,15000.0,30000.0,EGP
993,Senior Brand Designer,Egypt,Alexandria,8000.0,20000.0,EGP
1285,Chairman Office Manager - Alexandria,Egypt,Alexandria,12000.0,18000.0,EGP
4088,Flutter Developer,Egypt,Alexandria,12000.0,15000.0,EGP
6828,Customer Service Advisor - German - Alexandria,Egypt,Alexandria,13000.0,15000.0,EGP
4457,Senior Front End Developer,Egypt,Alexandria,10000.0,14000.0,EGP
2760,Videographer,Egypt,Alexandria,10000.0,14000.0,EGP
6830,Customer Service Advisor - Portuguese - Alexan...,Egypt,Alexandria,12000.0,14000.0,EGP
3399,Italian Customer Service Representative,Egypt,Alexandria,11500.0,12500.0,EGP
3280,Customer Service Representative - (Italian Spe...,Egypt,Alexandria,10000.0,12500.0,EGP


In [14]:
# Twenty Cairo or Giza postings with the highest maximum salary; the first
# column (id) is dropped from the display.
(
    data.loc[data['city'].isin(['Cairo', 'Giza'])]
    .sort_values(by='max_salary', ascending=False)
    .head(20)
    .iloc[:, 1:]
)

Unnamed: 0,title,country,city,min_salary,max_salary,currency
1928,Chief Technology Officer,Egypt,Cairo,70000.0,100000.0,EGP
2795,( Electrical Substation Construction Project M...,Egypt,Cairo,50000.0,70000.0,EGP
2632,Electrical Substation Construction Project Man...,Egypt,Cairo,50000.0,70000.0,EGP
5129,IT Manager EXHIBTA &Sales Buzz Administrator,Egypt,Cairo,40000.0,60000.0,EGP
7430,Internal Audit Executive,Egypt,Giza,12000.0,50000.0,EGP
1390,Head OF Operations,Egypt,Cairo,30000.0,50000.0,EGP
6479,Senior Programmer,Egypt,Cairo,30000.0,50000.0,EGP
5730,Senior Project Manager (Banking),Egypt,Cairo,25000.0,50000.0,EGP
5258,School Principal (American & IB),Egypt,Cairo,40000.0,50000.0,EGP
2155,Scrum Master Engineer,Egypt,Cairo,22000.0,45000.0,EGP


In [15]:
# The 20 most frequent job titles. The title lets the figure stand on its own.
px.bar(
    data['title'].value_counts().head(20),
    title='Top 20 most frequent job titles',
)

In [16]:
# The 20 most frequent job titles containing "data" (case-insensitive).
# na=False treats a missing title as "no match"; without it the mask contains
# NaN and boolean indexing raises.
px.bar(
    data[data['title'].str.contains('data', case=False, na=False)]
    .title.value_counts()
    .head(20),
    title='Top 20 job titles containing "data"',
)

In [17]:
# All postings whose title is exactly 'Data Analyst'.
data.loc[data['title'].eq('Data Analyst')]

Unnamed: 0,id,title,country,city,min_salary,max_salary,currency
490,18a859b4-3f37-403c-9eab-39545db32d91,Data Analyst,Egypt,Cairo,,,
1030,812d7827-3ba9-4df6-94de-13d7cd3bd23c,Data Analyst,Egypt,Cairo,,,
1215,7025a3c1-94df-4999-a290-a19f9d4c9f36,Data Analyst,Egypt,Cairo,,,
5177,5bec023f-1018-4f8a-b2e6-0ddeb89ab604,Data Analyst,Egypt,Cairo,,,
5291,fd4b0245-fdda-4ea2-8caa-80098bd5aac8,Data Analyst,Egypt,Cairo,,,
5796,70495f45-ce86-454c-8efe-e9d9028d6941,Data Analyst,Egypt,Cairo,,,
5986,f25623c5-aed8-4e17-b46a-24c0fa94512f,Data Analyst,Egypt,Cairo,,,
6167,1e0d2560-b8fd-4cb1-b42b-358c8d7bc858,Data Analyst,Egypt,Cairo,,,


In [35]:
# Distribution of the maximum salary for Egyptian postings paid in EGP, so
# that all values share one currency. The title lets the figure stand alone.
px.histogram(
    data[(data['country'] == 'Egypt') & (data['currency'] == 'EGP')].max_salary,
    title='Distribution of maximum salary (Egypt, EGP)',
)