# Prepare data

In [10]:
import pandas as pd

In [11]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')

train_data = X_train.merge(y_train, on='id', how='outer')

In [12]:
for column_name in X_train.columns:
    print(f'{column_name}: ', len(pd.unique(X_train[column_name])))
    
print()

for column_name in X_test.columns:
    print(f'{column_name}: ', len(pd.unique(X_test[column_name])))

id:  27934
name:  16395
has_test:  2
response_letter_required:  2
salary_from:  479
salary_currency:  1
salary_gross:  3
published_at:  27173
created_at:  27173
employer_name:  13318
description:  23682
area_id:  156
area_name:  156

id:  9312
name:  6275
has_test:  2
response_letter_required:  2
salary_from:  285
salary_currency:  1
salary_gross:  3
published_at:  9195
created_at:  9195
employer_name:  6025
description:  8394
area_id:  118
area_name:  118


In [13]:
has_nan = train_data.isna().sum()
print("Nans in train\n", has_nan)

has_nan = X_test.isna().sum()
print("\nNans in test\n", has_nan)

Nans in train
 id                             0
name                           0
has_test                       0
response_letter_required       0
salary_from                 4032
salary_currency                0
salary_gross                 148
published_at                   0
created_at                     0
employer_name                  0
description                    1
area_id                        0
area_name                      0
salary_to                      0
dtype: int64

Nans in test
 id                             0
name                           0
has_test                       0
response_letter_required       0
salary_from                 1388
salary_currency                0
salary_gross                  49
published_at                   0
created_at                     0
employer_name                  0
description                    0
area_id                        0
area_name                      0
dtype: int64


## Remove extra columns

In [14]:
train_data = train_data.drop(columns=['salary_currency', 'published_at', 'created_at', 'area_name'])
X_test = X_test.drop(columns=['salary_currency', 'published_at', 'created_at', 'area_name'])

## Remove rows with empty description

In [15]:
idxs = train_data[train_data['description'].isna()].index.tolist()
print(idxs)

train_data = train_data.drop(idxs)

[12194]


# Process text fields with NLP

Extracting lemmas from text columns

In [9]:
from bs4 import BeautifulSoup
import spacy

num = 0

nlp_ru = spacy.load("ru_core_news_lg")
nlp_en = spacy.load('en_core_web_lg')  

def preprocess_name(text):
    global num
    string = text.replace("-", " ").replace("/", " ").replace(".", "")
    doc = nlp_ru(string)       
    ret = ' '.join([token.lemma_.lower() for token in doc if not token.is_punct and not token.is_space])
    if num % 3000 == 0:
        print("preprocess_name:", num)

    num += 1
    return ret

def preprocess_description(text):
    global num
    soup = BeautifulSoup(text, "html.parser")
    strings = [string.text.strip() for string in soup.strings]
    strings = [string for string in strings if string != '']
    tokens = []
    for doc in nlp_ru.pipe(strings):
       proj_tok = ' '.join([token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha])
       tokens.append(proj_tok)
    ret = ' '.join(tokens)

    if ret.strip() == '':
        for doc in nlp_en.pipe(strings):
           proj_tok = ' '.join([token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha])
           tokens.append(proj_tok)
        ret = ' '.join(tokens)

    if num % 3000 == 0:
        print("preprocess_description:", num)

    num += 1
    return ret

def preprocess_employer_name(string):
    global num

    doc = nlp_ru(string)       
    ret = ' '.join([token.lemma_.lower() for token in doc if not token.is_punct])

    if num % 3000 == 0:
        print("preprocess_employer_name:", num)

    num += 1
    
    return ret
    
# X_test = X_test[:105]
X_test['name'] = X_test['name'].apply(preprocess_name)
num = 0
X_test['description'] = X_test['description'].apply(preprocess_description)
num = 0
X_test['employer_name'] = X_test['employer_name'].apply(preprocess_employer_name)
num = 0

# train_data = train_data[:105]
train_data['name'] = train_data['name'].apply(preprocess_name)
num = 0
train_data['description'] = train_data['description'].apply(preprocess_description)
num = 0
train_data['employer_name'] = train_data['employer_name'].apply(preprocess_employer_name)
num = 0

preprocess_name: 0
preprocess_name: 3000
preprocess_name: 6000
preprocess_name: 9000
preprocess_description: 0
preprocess_description: 3000
preprocess_description: 6000
preprocess_description: 9000
preprocess_employer_name: 0
preprocess_employer_name: 3000
preprocess_employer_name: 6000
preprocess_employer_name: 9000
preprocess_name: 0
preprocess_name: 3000
preprocess_name: 6000
preprocess_name: 9000
preprocess_name: 12000
preprocess_name: 15000
preprocess_name: 18000
preprocess_name: 21000
preprocess_name: 24000
preprocess_name: 27000
preprocess_description: 0
preprocess_description: 3000
preprocess_description: 6000
preprocess_description: 9000
preprocess_description: 12000
preprocess_description: 15000
preprocess_description: 18000
preprocess_description: 21000
preprocess_description: 24000
preprocess_description: 27000
preprocess_employer_name: 0
preprocess_employer_name: 3000
preprocess_employer_name: 6000
preprocess_employer_name: 9000
preprocess_employer_name: 12000
preprocess_e

# Process binary and number columns

In [36]:
def convert_to_binary(column):
    col = column.fillna(column.mode().iloc[0])
    return col.apply(lambda x: 1 if x == True else 0)

binary_columns = ['has_test', 'response_letter_required', 'salary_gross']
for column in binary_columns:
    train_data[column] = convert_to_binary(train_data[column])
    X_test[column] = convert_to_binary(X_test[column])

median_salary_from = train_data['salary_from'].median()
train_data['salary_from'].fillna(median_salary_from, inplace=True)

median_salary_from = X_test['salary_from'].median()
X_test['salary_from'].fillna(median_salary_from, inplace=True)

In [37]:
has_nan = train_data.isna().sum()
print("Nans in train\n", has_nan)

has_nan = X_test.isna().sum()
print("\nNans in train\n", has_nan)

Nans in train
 name                        0
has_test                    0
response_letter_required    0
salary_from                 0
salary_gross                0
employer_name               0
description                 0
area_id                     0
salary_to                   0
dtype: int64

Nans in train
 name                        0
has_test                    0
response_letter_required    0
salary_from                 0
salary_gross                0
employer_name               0
description                 0
area_id                     0
dtype: int64


# Turn text columns into features

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=2500, stop_words='english')

text_columns = ['name', 'employer_name', 'description']

for column in text_columns:
    tfidf_vectorizer.fit(train_data[column])
    
# Replace the text column with its TF-IDF representation

for column in text_columns:
    train_data_tfidf = tfidf_vectorizer.transform(train_data[column])
    train_data_tfidf_df = pd.DataFrame(train_data_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
    train_data = pd.concat([train_data.drop(column, axis=1), train_data_tfidf_df], axis=1)

for column in text_columns:
    X_test_tfidf = tfidf_vectorizer.transform(X_test[column])

    X_test_tfidf_df = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
    X_test = pd.concat([X_test.drop(column, axis=1), X_test_tfidf_df], axis=1)

# de-duplicate columns
train_data = train_data.groupby(level=0, axis=1).max()
X_test = X_test.groupby(level=0, axis=1).max()

train_data.to_csv('processed_train_data.csv', index=False)
X_test.to_csv('processed_X_test.csv', index=False)

  train_data = train_data.groupby(level=0, axis=1).max()
  X_test = X_test.groupby(level=0, axis=1).max()


# Split dataset

In [39]:
from sklearn.model_selection import train_test_split
import pandas as pd

train_data = pd.read_csv('processed_train_data.csv')

X = train_data.drop(columns=['id', 'salary_to'], axis=1)
y = train_data['salary_to']

Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model

In [2]:
from xgboost import XGBRegressor

model = XGBRegressor(
    booster = 'gbtree',
    colsample_bylevel = 0.5,
    colsample_bytree = 0.7,
    gamma = 0.0,
    learning_rate = 0.05,
    max_depth = 18,
    n_estimators = 6000,
    n_jobs = -1,
    seed = 42,
    random_state=42,
    eval_metric = 'rmse')

model.fit(
          X, y, 
          # Xtrain, ytrain, 
          verbose = 100,
          eval_set = [(Xtrain, ytrain), (Xval, yval)])

[0]	validation_0-rmse:101762.70131	validation_1-rmse:87409.63984
[100]	validation_0-rmse:15619.19567	validation_1-rmse:15260.74576
[200]	validation_0-rmse:10529.45478	validation_1-rmse:10609.54099
[300]	validation_0-rmse:8599.86000	validation_1-rmse:8785.30411
[400]	validation_0-rmse:7352.06860	validation_1-rmse:7456.62524
[500]	validation_0-rmse:6390.51848	validation_1-rmse:6477.59401
[600]	validation_0-rmse:5629.89710	validation_1-rmse:5675.12337
[700]	validation_0-rmse:5037.49919	validation_1-rmse:5005.15738
[800]	validation_0-rmse:4578.63575	validation_1-rmse:4510.11208
[900]	validation_0-rmse:4168.31762	validation_1-rmse:4116.13595
[1000]	validation_0-rmse:3829.65038	validation_1-rmse:3720.25650
[1100]	validation_0-rmse:3530.22942	validation_1-rmse:3412.46360
[1200]	validation_0-rmse:3271.16352	validation_1-rmse:3168.75982
[1300]	validation_0-rmse:3052.27853	validation_1-rmse:2930.24174
[1400]	validation_0-rmse:2850.82053	validation_1-rmse:2723.20850
[1500]	validation_0-rmse:2703.

# Evaluate prediction

In [None]:
y_pred = model.predict(Xval)

def smape(actual, predicted):
    denominator = (abs(actual) + abs(predicted)) / 2.0
    diff = abs(actual - predicted) / denominator
    diff[denominator == 0] = 0.0
    return 100 * diff.sum() / len(actual)

smape_score = smape(yval, y_pred)
print(f'SMAPE: {smape_score:.2f}%')

# Get results

In [4]:
import pandas as pd

X_test = pd.read_csv('processed_X_test.csv')
test_predictions = model.predict(X_test.drop(columns=['id']))

submission_df = pd.DataFrame({'id': X_test['id'].astype(int), 'salary_to': test_predictions.round(2)})
submission_df.to_csv('submission.csv', index=False)