In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, Binarizer, OneHotEncoder, OrdinalEncoder
from scipy.stats import pearsonr, spearmanr, chi2_contingency, pointbiserialr, f_oneway

from sklearn.impute import SimpleImputer

# Show every column when a DataFrame is rendered (None removes pandas' column cap).
pd.options.display.max_columns = None

In [2]:
# Load the Upwork dataset CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Dataset/Final_Upwork_Dataset.csv')

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the raw frame;
# with default arguments only the numeric columns are summarised.
df.describe()

Unnamed: 0,Freelancers_Num,Spent($),Connects_Num,New_Connects_Num,Rating,Feedback_Num
count,63950.0,62304.0,63949.0,63949.0,30593.0,63949.0
mean,1.137654,29997.89,4.506701,9.013401,4.823875,20.997029
std,2.391479,392651.4,2.097518,4.195035,0.418398,180.450288
min,0.0,0.0,1.0,2.0,1.0,0.0
25%,1.0,0.0,2.0,4.0,4.842153,0.0
50%,1.0,80.0,4.0,8.0,4.975717,0.0
75%,1.0,4000.0,6.0,12.0,5.0,10.0
max,99.0,70000000.0,8.0,16.0,5.0,12541.0


In [4]:
# Names of the nine per-category search-URL columns (Category1 .. Category9).
# The list is only defined here; this cell itself does not use it.
drop_urls = [
    'Category1_URL_search',
    'Category2_URL_search',
    'Category3_URL_search',
    'Category4_URL_search',
    'Category5_URL_search',
    'Category6_URL_search',
    'Category7_URL_search',
    'Category8_URL_search',
    'Category9_URL_search',
]

# Per-column count of missing values in the raw frame.
missing = df.isna().sum()
print(missing)

Job Title                   2
Job_URL                     1
EX_level_demand          7015
Time_Limitation         40916
Search_Keyword              1
Posted_from                37
Description                 2
Category1_URL_search      445
Category_1                445
highlight               13819
Category2_URL_search     4061
Category_2               4061
Category3_URL_search     8953
Category_3               8953
Category4_URL_search    16407
Category_4              16407
Category5_URL_search    24372
Category_5              24372
Category6_URL_search    32643
Category_6              32643
Category7_URL_search    38758
Category_7              38758
Category8_URL_search    43514
Category_8              43514
Category9_URL_search    47469
Category_9              47469
Applicants_Num              1
Payment_Situation           1
Enterprise_Client       63897
Freelancers_Num             0
Spent($)                 1646
Client_Country            125
Connects_Num                1
New_Connec

In [5]:
# Remove the nine CategoryN_URL_search columns. The names come from the `drop_urls`
# list defined in the missing-values cell above, so the column list lives in one
# place instead of being typed out a second time here.
# `df` itself is left untouched; the result is a new frame.
df_cleaned = df.drop(columns=drop_urls)

In [6]:
# Recount missing values per column on the frame without the URL columns.
# Note: this rebinds `missing`, replacing the counts computed on the raw frame.
missing = df_cleaned.isna().sum()
print(missing)

Job Title                2
Job_URL                  1
EX_level_demand       7015
Time_Limitation      40916
Search_Keyword           1
Posted_from             37
Description              2
Category_1             445
highlight            13819
Category_2            4061
Category_3            8953
Category_4           16407
Category_5           24372
Category_6           32643
Category_7           38758
Category_8           43514
Category_9           47469
Applicants_Num           1
Payment_Situation        1
Enterprise_Client    63897
Freelancers_Num          0
Spent($)              1646
Client_Country         125
Connects_Num             1
New_Connects_Num         1
Rating               33357
Feedback_Num             1
Payment_type             1
Job_Cost             43837
Hourly_Rate          37380
Start_rate               1
End_rate             37987
dtype: int64


In [8]:
# Inspect the `highlight` column (pandas truncates the printed Series).
print(df_cleaned.loc[:, 'highlight'])

0                          data
1            Google Data Studio
2                           NaN
3                           NaN
4                          Data
                  ...          
63945                 Marketing
63946                 Marketing
63947    Social Media Marketing
63948    Social Media Marketing
63949                       NaN
Name: highlight, Length: 63950, dtype: object


In [8]:
# Build the working frame by removing columns that are not carried forward.
# NOTE(review): the selection criterion is not stated in the notebook — several of
# these columns are mostly missing, others are nearly complete; confirm the rationale.
df_cleaned_new = df_cleaned.drop(
    columns=[
        'Connects_Num',
        'Posted_from',
        'highlight',
        'Job_URL',
        'Category_4',
        'Category_5',
        'Category_6',
        'Category_7',
        'Category_8',
        'Category_9',
        'Enterprise_Client',
        'Rating',
        'Time_Limitation',
    ],
)

In [9]:
# Preview the first ten rows of the reduced frame.
df_cleaned_new.head(10)

Unnamed: 0,Job Title,EX_level_demand,Search_Keyword,Description,Category_1,Category_2,Category_3,Applicants_Num,Payment_Situation,Freelancers_Num,Spent($),Client_Country,New_Connects_Num,Feedback_Num,Payment_type,Job_Cost,Hourly_Rate,Start_rate,End_rate
0,Power bi specialist freelance,Expert,Data_science,Already data pooled and designed. Need to refi...,Data Analysis,Data Visualization,Microsoft Power BI,Less than 5,Payment unverified,1,0.0,United States,12.0,0.0,Fixed-price,"$2,500",,0,
1,Case Study (on-demand delivery startup),Intermediate,Data_science,"Hi,\n\nWould you be able to help me do a case-...",Google Data Studio,SQL,Tableau,Less than 5,Payment verified,1,100.0,Canada,8.0,1.0,Fixed-price,$200,,0,
2,"File Maker Pro Reports, Charts, Query and Ongo...",Intermediate,Data_science,NITIAL PROJECT\n\nSet up Monthly Report mimick...,Report Writing,Custom Graphics,,Less than 5,Payment verified,1,200.0,United States,12.0,1.0,Hourly,,$40.00,40,
3,Implementation of EleutherAI/gpt-neox-20b,Expert,Data_science,"As a first step, you will implement the instal...",Machine Learning Model,Machine Learning,Python,Less than 5,Payment verified,1,200000.0,Canada,12.0,26.0,Hourly,,$35.00-$100.00,$35.00,$100.00
4,BI and Data Engineer for Upwork Finance System...,Expert,Data_science,The Upwork Finance Systems team is looking for...,Data Analysis,Looker,SQL,Less than 5,Payment verified,1,,United States,12.0,12512.0,Hourly,,,0,
5,Computer vision / machine learning: synthetic ...,Expert,Data_science,Our company is working on a government proposa...,Generative Adversarial Network,Machine Learning,Computer Vision,Less than 5,Payment verified,1,100000.0,United States,12.0,11.0,Hourly,,$75.00-$175.00,$75.00,$175.00
6,Expert in Data Analytics,Expert,Data_science,Mission : Ensure smooth transition of Analytic...,Data Visualization,Data Analysis,Microsoft Power BI,Less than 5,Payment verified,1,70000000.0,United States,12.0,10906.0,Hourly,,,0,
7,Data Analytics Expert for Educational Exercise,Intermediate,Data_science,Looking for someone to help me solve some prob...,Data Analytics,Big Data,Data Visualization,Less than 5,Payment verified,1,600.0,Australia,4.0,6.0,Fixed-price,$10,,0,
8,Azure Data Factory specialist,Intermediate,Data_science,1. Good work experience in Azure Data Factory ...,SQL,Microsoft Azure,ETL,Less than 5,Payment verified,1,0.0,Poland,12.0,0.0,Hourly,,$25.00-$35.00,$25.00,$35.00
9,Implementation of algorithm for multilingual data,Expert,Data_science,Looking for freelancer to implement following ...,Data Science,Algorithm Development,Python,Less than 5,Payment verified,1,20000.0,Pakistan,4.0,68.0,Hourly,,,0,


In [10]:
# Missing-value count for each column that survived the drop.
print(df_cleaned_new.isna().sum())

Job Title                2
EX_level_demand       7015
Search_Keyword           1
Description              2
Category_1             445
Category_2            4061
Category_3            8953
Applicants_Num           1
Payment_Situation        1
Freelancers_Num          0
Spent($)              1646
Client_Country         125
New_Connects_Num         1
Feedback_Num             1
Payment_type             1
Job_Cost             43837
Hourly_Rate          37380
Start_rate               1
End_rate             37987
dtype: int64


In [19]:


# Boolean masks: True where the row has no value in the respective pricing column.
miss_hour = df_cleaned_new['Hourly_Rate'].isna()
miss_fixed = df_cleaned_new['Job_Cost'].isna()

# Number of rows whose Job_Cost is missing (cell output).
miss_fixed.sum()
   

np.int64(43837)