In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
jobs = pd.read_csv('jobs_dataset.csv')

In [3]:
nan_counts = jobs.isnull().sum()
cols_to_drop = nan_counts[nan_counts > 300].index

In [4]:
def parse_salary(s):
    if pd.isnull(s):
        return pd.Series([None , None ,None , None])
    nums = re.findall(r'\d+' , s)
    if len(nums) >= 2:
        max_sal = int(nums[0])
        min_sal = int(nums[1])
        avg_sal = (max_sal + min_sal) / 2
    else:
        return pd.Series([None , None ,None , None])
    time_unit = None
    if 'a year' in s:
        time_unit = 'year'
    elif 'a month' in s:
        time_unit = 'month'
    elif 'a week' in s:
        time_unit = 'week'
    elif 'an hour' in s:
        time_unit = 'hour'
    return pd.Series([min_sal , max_sal , avg_sal , time_unit])
jobs[['min_salary' , 'max_salary' , 'avg_salary' , 'salary_unit']] = jobs['salary'].apply(parse_salary)

In [5]:
if len(cols_to_drop) > 0:
    jobs = jobs.drop(columns = cols_to_drop)
else:
    print('Nothing!!')
    

In [6]:
jobs = jobs.drop(['jobType/0' , 'externalApplyLink' , 'searchInput/country' , 'url' , 'description' , 'location'] , axis = 1)

In [7]:
target_col = 'max_salary'

for i in range(1 , len(jobs['max_salary']) - 1):
    if pd.isnull(jobs.loc[i , target_col]):
        prev_val = jobs.loc[i - 1 , target_col]
        next_val = jobs.loc[i + 1 , target_col]
        if not pd.isnull(prev_val) and not pd.isnull(next_val):
            jobs.loc[i , target_col] = (prev_val + next_val) / 2

In [8]:
jobs = jobs.dropna()

In [9]:
jobs = jobs.drop('salary' , axis = 1)

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
le = LabelEncoder()
cols_to_encode = ['company' , 'positionName' , 'salary_unit' , 'searchInput/position']
le_dict = {}
for col in cols_to_encode:
    le = LabelEncoder()
    jobs[col + '_encoded'] = le.fit_transform(jobs[col].astype(str))
    le_dict[col] = le

In [12]:
jobs = jobs.drop(['company' , 'positionName' , 'salary_unit' , 'searchInput/position'] , axis = 1)

In [13]:
jobs = jobs.astype('float64')

In [14]:
X = jobs.drop('avg_salary' , axis = 1)
y = jobs['avg_salary']

In [15]:
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error ,r2_score , mean_squared_error

In [16]:
X_train , X_test , y_train , y_test = tts(X ,y , test_size = 0.2 ,random_state = 42)

In [17]:
model = RandomForestRegressor(n_estimators = 100 , random_state = 42)

In [18]:
model.fit(X_train , y_train)

In [19]:
y_pred = model.predict(X_test)

In [20]:
mae = mean_absolute_error(y_test , y_pred)
mse = mean_squared_error(y_test , y_pred)
r2 = r2_score(y_test , y_pred)

In [21]:
mse

37.69474823232322

In [22]:
mae

3.0844949494949496

In [23]:
r2

0.9982062119264047

In [24]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [25]:
r2_train = r2_score(y_train , y_train_pred)
r2_test = r2_score(y_test , y_test_pred)
print('R2 Train:' , r2_train)
print('R2 Test:' , r2_test)

R2 Train: 0.9995008946859332
R2 Test: 0.9982062119264047
