In [15]:
import pandas as pd

from catboost import Pool, CatBoostRegressor
from sklearn.model_selection import train_test_split

In [41]:
# Load the Upwork job postings CSV (path is relative to the notebook's working directory).
df = pd.read_csv("upwork-jobs.csv")
# Show the first rows as a quick sanity check of the parsed columns.
df.head()

Unnamed: 0,title,link,description,published_date,is_hourly,hourly_low,hourly_high,budget,country
0,Experienced Media Buyer For Solar Pannel and R...,https://www.upwork.com/jobs/Experienced-Media-...,We’re looking for a talented and hardworking a...,2024-02-17 09:09:54+00:00,False,,,500.0,
1,Full Stack Developer,https://www.upwork.com/jobs/Full-Stack-Develop...,Job Title: Full Stack DeveloperWe are seeking ...,2024-02-17 09:09:17+00:00,False,,,1100.0,United States
2,SMMA Bubble App,https://www.upwork.com/jobs/SMMA-Bubble-App_%7...,I need someone to redesign my bubble.io site t...,2024-02-17 09:08:46+00:00,True,10.0,30.0,,United States
3,Talent Hunter Specialized in Marketing,https://www.upwork.com/jobs/Talent-Hunter-Spec...,Join Our Growing Team!We are an innovative com...,2024-02-17 09:08:08+00:00,,,,,United States
4,Data Engineer,https://www.upwork.com/jobs/Data-Engineer_%7E0...,We are looking for a resource who can work par...,2024-02-17 09:07:42+00:00,False,,,650.0,India


In [42]:
# Print column dtypes and non-null counts to see which fields have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53058 entries, 0 to 53057
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           53058 non-null  object 
 1   link            53058 non-null  object 
 2   description     53058 non-null  object 
 3   published_date  53058 non-null  object 
 4   is_hourly       44829 non-null  object 
 5   hourly_low      22956 non-null  float64
 6   hourly_high     22161 non-null  float64
 7   budget          21873 non-null  float64
 8   country         51917 non-null  object 
dtypes: float64(3), object(6)
memory usage: 3.6+ MB


In [43]:
# Titles at or above this many characters are dropped before modelling.
MAX_TITLE_LEN = 120

# Keep only short-titled postings and the three modelling columns, then discard
# rows missing any of them (the regression target `budget` included).
# `.str.len()` is the vectorised equivalent of `.apply(len)` and does not raise
# on a missing title; such a row simply fails the comparison and is filtered out.
df = (
    df.loc[df["title"].str.len() < MAX_TITLE_LEN, ["title", "country", "budget"]]
    .dropna()
)

In [44]:
# Re-inspect the frame: row count, remaining columns and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21336 entries, 1 to 53057
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   title    21336 non-null  object 
 1   country  21336 non-null  object 
 2   budget   21336 non-null  float64
dtypes: float64(1), object(2)
memory usage: 666.8+ KB


In [100]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One seed for both the split and the booster so a re-run reproduces the same model.
SEED = 42

# TF-IDF encoding of the job titles with the vocabulary limited to 100 terms.
vectorizer = TfidfVectorizer(max_features=100)
job_title_vectors = vectorizer.fit_transform(df["title"]).toarray()

# The TF-IDF frame gets a fresh 0..n-1 index, so the country column and the target
# are re-indexed the same way; features and labels then agree by index as well as
# by position.
X = pd.concat([pd.DataFrame(job_title_vectors), df[["country"]].reset_index(drop=True)], axis=1)
y = df["budget"].reset_index(drop=True)

# CatBoost needs the positional indices of the categorical (object-dtype) columns.
cat_cols = X.select_dtypes(include=['object']).columns
cat_cols_idx = [list(X.columns).index(c) for c in cat_cols]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

train_pool = Pool(X_train, y_train, cat_features=cat_cols_idx)
test_pool = Pool(X_test, y_test, cat_features=cat_cols_idx)

# NOTE(review): learning_rate=0.001 is very small for a 1000-iteration budget;
# consider tuning it if the predictions turn out to vary little between inputs.
model = CatBoostRegressor(iterations=1000,
                          depth=6,
                          verbose=0,
                          early_stopping_rounds=100,
                          learning_rate=0.001,
                          loss_function='RMSE',
                          random_seed=SEED)

# test_pool is used as the early-stopping eval set here, so it acts as a validation
# set rather than an untouched hold-out; do not report final metrics on it.
model.fit(train_pool, eval_set=test_pool)

<catboost.core.CatBoostRegressor at 0x30972b810>

In [101]:
# Single-row query: the posting we want a budget suggestion for.
data = pd.DataFrame(
    {"title": "Middle Python Developer", "country": "United States"},
    index=[0],
)

# Encode the title with the already-fitted vectorizer (transform only, no refit).
job_title_vectors = vectorizer.transform(data["title"]).toarray()

# Same column layout as the training matrix: TF-IDF columns first, country last.
title_part = pd.DataFrame(job_title_vectors)
country_part = data[["country"]].reset_index(drop=True)
predict_data = pd.concat([title_part, country_part], axis=1)

model.predict(predict_data)

array([731.53811227])

In [102]:
# Single-row query: the posting we want a budget suggestion for.
data = pd.DataFrame(
    {"title": "Designer", "country": "United States"},
    index=[0],
)

# Encode the title with the already-fitted vectorizer (transform only, no refit).
job_title_vectors = vectorizer.transform(data["title"]).toarray()

# Same column layout as the training matrix: TF-IDF columns first, country last.
title_part = pd.DataFrame(job_title_vectors)
country_part = data[["country"]].reset_index(drop=True)
predict_data = pd.concat([title_part, country_part], axis=1)

model.predict(predict_data)

array([724.28928965])

In [103]:
# Single-row query: the posting we want a budget suggestion for.
data = pd.DataFrame(
    {"title": "Designer", "country": "India"},
    index=[0],
)

# Encode the title with the already-fitted vectorizer (transform only, no refit).
job_title_vectors = vectorizer.transform(data["title"]).toarray()

# Same column layout as the training matrix: TF-IDF columns first, country last.
title_part = pd.DataFrame(job_title_vectors)
country_part = data[["country"]].reset_index(drop=True)
predict_data = pd.concat([title_part, country_part], axis=1)

model.predict(predict_data)

array([695.04298323])

In [110]:
# Write the trained CatBoost model to the file "price_suggestor"; no format argument
# is passed, so the library's default serialization format is used.
model.save_model("price_suggestor")

In [104]:
import pickle

# Persist the fitted TF-IDF vectorizer next to the model so the same vocabulary
# can be re-applied to titles at prediction time.
with open('tfidf_vectorizer.pkl', mode='wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [108]:
# Round-trip check: reload the pickled vectorizer and encode a sample title.
# pickle.load can execute arbitrary code, so only use it on files you produced yourself.
with open('tfidf_vectorizer.pkl', mode='rb') as vectorizer_file:
    loaded_vectorizer_pickle: TfidfVectorizer = pickle.load(vectorizer_file)

# Dense TF-IDF row for the sample title, displayed as the cell output.
loaded_vectorizer_pickle.transform(["Designer"]).toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]])