# Setup


In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

# Show floats in rendered frames with thousands separators and two decimals.
pd.set_option("display.float_format", "{:20,.2f}".format)

In [2]:
# All locations are resolved relative to the parent of the current working directory.
ROOT_DIR = Path("..")
DATA_DIR = ROOT_DIR.joinpath("data")  # input CSV files
RESULTS_DIR = ROOT_DIR.joinpath("results")  # prediction files written by this notebook

In [3]:
# Training features and targets come from separate files and are joined on `id`;
# the key is dropped afterwards and both timestamp columns are parsed to datetimes.
x_train_df = pd.read_csv(DATA_DIR / "X_train.csv")
y_train_df = pd.read_csv(DATA_DIR / "Y_train.csv")
df = (
    x_train_df.merge(y_train_df, on="id")
    .drop(columns=["id"])
    .assign(
        created_at=lambda frame: pd.to_datetime(frame["created_at"]),
        published_at=lambda frame: pd.to_datetime(frame["published_at"]),
    )
)
# Test features are kept raw here; they are prepared in the prediction section.
x_test_df = pd.read_csv(DATA_DIR / "X_test.csv")

In [4]:
def describe(df, count=False, missing_info=False):
    """Summarise every column of ``df`` with one row per column.

    Wraps ``DataFrame.describe(include="all")`` (transposed) and prints the
    total number of rows as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    count : bool, default False
        Keep the ``count`` statistic when True; it is dropped otherwise.
    missing_info : bool, default False
        When True, prepend ``missing`` (number of nulls) and ``missing %``
        (share of nulls in percent) as the first two columns.

    Returns
    -------
    pandas.DataFrame
        The summary table.
    """
    summary = df.describe(include="all").transpose()
    if not count:
        summary = summary.drop(columns="count")
    if missing_info:
        summary["missing"] = df.isna().sum()
        summary["missing %"] = summary["missing"] / len(df) * 100
        # Move the two freshly appended columns to the front of the table.
        leading = ["missing", "missing %"]
        trailing = [name for name in summary.columns if name not in leading]
        summary = summary[leading + trailing]
    print(f"Total rows: {len(df)}")
    return summary

# Prepare features


In [5]:
def convert_gross_to_net(df, net_factor=0.87):
    """Convert gross salaries to net and drop the ``salary_gross`` flag.

    Rows whose ``salary_gross`` is truthy have ``salary_from`` (and
    ``salary_to`` when that column exists) multiplied by ``net_factor``.
    Missing flags are treated as "not gross", so those rows are left as-is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``salary_gross`` and the salary columns.
    net_factor : float, default 0.87
        Multiplier applied to gross amounts.

    Returns
    -------
    pandas.DataFrame
        A converted copy without the ``salary_gross`` column; ``df`` itself
        is not modified.
    """
    df_convert = df.copy()
    # A mask containing NaN cannot be used for boolean selection, so make it
    # strictly boolean first.
    is_gross = df_convert["salary_gross"].fillna(False).astype(bool)
    for column in ("salary_from", "salary_to"):
        if column in df_convert.columns:
            # Build the column out of place so integer columns can be upcast
            # to float instead of failing on an in-place float assignment.
            df_convert[column] = df_convert[column].where(
                ~is_gross, df_convert[column] * net_factor
            )
    return df_convert.drop(columns="salary_gross")

In [6]:
def prepare_df(df, salary_from_fill=None):
    """Return a cleaned copy of a raw vacancies frame ready for encoding.

    Steps:
    * drop ``published_at`` (noted as always equal to ``created_at``),
      ``area_id`` (a code for ``area_name``) and ``salary_currency``
      (noted as constant);
    * drop ``created_at`` and the free-text columns ``name``,
      ``employer_name`` and ``description`` (no features are derived from
      them here);
    * fill missing ``salary_from`` and ``salary_gross``;
    * mark ``area_name`` as categorical and reset the index.

    The gross-to-net conversion (``convert_gross_to_net``) is not applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; it is not modified.
    salary_from_fill : float, optional
        Value used for missing ``salary_from``. Defaults to the median of
        ``df["salary_from"]``; pass the training median here to impute test
        data consistently with the training data.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with a fresh ``RangeIndex``. Columns not mentioned
        above are passed through unchanged.
    """
    if salary_from_fill is None:
        salary_from_fill = df["salary_from"].median()

    # `drop` returns a new frame, so the caller's data stays untouched.
    df_cleared = df.drop(
        columns=[
            "published_at",
            "area_id",
            "salary_currency",
            "created_at",
            "name",
            "employer_name",
            "description",
        ]
    )
    # Assign filled columns back explicitly: `df[col].fillna(..., inplace=True)`
    # works on a temporary object and does not update the frame under pandas
    # Copy-on-Write.
    df_cleared = df_cleared.assign(
        salary_from=df_cleared["salary_from"].fillna(salary_from_fill),
        # Force a real bool dtype so downstream bool -> int conversion sees it.
        salary_gross=df_cleared["salary_gross"].fillna(False).astype(bool),
        area_name=df_cleared["area_name"].astype("category"),
    )
    return df_cleared.reset_index(drop=True)

In [7]:
# remove outliers by quantiles of 1% and 99% on salary_to
# Keep only rows whose salary_to lies within its 1%-99% quantile range
# (bounds inclusive), then clean the result and summarise it.
salary_to_low, salary_to_high = df["salary_to"].quantile([0.01, 0.99])
within_bounds = df["salary_to"].between(salary_to_low, salary_to_high)
df_without_outliers = df[within_bounds]
df_prepared = prepare_df(df_without_outliers)
describe(df_prepared, missing_info=True)

Total rows: 27523


Unnamed: 0,missing,missing %,unique,top,freq,mean,std,min,25%,50%,75%,max
has_test,0,0.0,2.0,False,26824.0,,,,,,,
response_letter_required,0,0.0,2.0,False,26456.0,,,,,,,
salary_from,0,0.0,,,,61204.88,43486.91,2.0,35000.0,50000.0,70000.0,395000.0
salary_gross,0,0.0,2.0,False,18563.0,,,,,,,
area_name,0,0.0,156.0,Москва,20176.0,,,,,,,
salary_to,0,0.0,,,,99939.92,75455.45,15000.0,45000.0,70000.0,130000.0,400000.0


# Train


In [32]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer

In [9]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Computes ``200 * mean(|y_true - y_pred| / (|y_true| + |y_pred|))``.
    Terms whose denominator is zero (both values are zero) contribute 0.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values. Any array-like is accepted (lists,
        NumPy arrays, pandas Series); values are compared by position, so
        pandas index labels are ignored.

    Returns
    -------
    float
        The SMAPE score; lower is better.
    """
    # Plain float arrays: supports list inputs and prevents pandas from
    # aligning two Series on their index labels.
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    denominator = np.abs(actual) + np.abs(forecast)
    # Divide only where it is defined; the remaining entries keep the 0.0
    # from `out`, so no divide-by-zero warning is raised.
    ratio = np.divide(
        np.abs(actual - forecast),
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return 200 * np.mean(ratio)


# SMAPE is an error (lower is better), so the scorer negates it; callers
# flip the sign back when reporting.
smape_scorer = make_scorer(score_func=smape, greater_is_better=False)

In [10]:
# Dense one-hot encoder for area names. `sparse` was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so try the new
# keyword first and fall back for older releases.
try:
    area_name_encoder = OneHotEncoder(sparse_output=False, handle_unknown="error")
except TypeError:
    area_name_encoder = OneHotEncoder(sparse=False, handle_unknown="error")
# Areas with at least 100 rows in the training frame keep their own category.
area_name_count = df["area_name"].value_counts().sort_values(ascending=True)
known_area_names = area_name_count[area_name_count >= 100].index


def convert_area_name(df):
    """Return a copy of ``df`` with rare area names collapsed into "other".

    Every ``area_name`` value that is not contained in the module-level
    ``known_area_names`` is replaced by the string ``"other"``; the input
    frame is left unchanged.
    """

    def bucket(area):
        # Keep known areas as they are, collapse everything else.
        return area if area in known_area_names else "other"

    return df.assign(area_name=df["area_name"].map(bucket))


# Fit on the bucketed training area names so the encoder learns the known
# areas together with the "other" bucket.
area_name_fit_frame = convert_area_name(df[["area_name"]])
area_name_encoder.fit(area_name_fit_frame)



In [11]:
def one_hot_encode(df, column, encoder=None):
    """Replace ``column`` in ``df`` with its one-hot encoded columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``; it is not modified.
    column : str
        Name of the column to encode.
    encoder : fitted encoder, optional
        Object providing ``transform`` (returning a dense 2-D array) and
        ``get_feature_names_out``. Defaults to the module-level
        ``area_name_encoder``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` followed by the encoded columns,
        on the same index as ``df``.
    """
    if encoder is None:
        encoder = area_name_encoder

    encoded_features = encoder.transform(df[[column]])
    columns = encoder.get_feature_names_out([column])
    # Reuse the caller's index: with a default RangeIndex here, `concat`
    # would align on labels and produce NaN rows for any frame whose index
    # is not 0..n-1.
    encoded_df = pd.DataFrame(encoded_features, columns=columns, index=df.index)

    return pd.concat([df.drop(columns=column), encoded_df], axis=1)

In [12]:
def encode_X(df):
    """Turn a prepared feature frame into a fully numeric model input.

    Rare area names are collapsed via ``convert_area_name``, ``area_name``
    is one-hot encoded via ``one_hot_encode``, and every boolean column is
    cast to integer. The input frame is not modified.
    """
    bucketed = convert_area_name(df.copy())
    encoded = one_hot_encode(bucketed, "area_name")
    # Cast all bool columns to int in one pass.
    bool_columns = encoded.columns[encoded.dtypes == "bool"]
    return encoded.astype({name: int for name in bool_columns})

In [13]:
def encode_X(df):
    """Encode a prepared feature frame for the models.

    Collapses rare area names, one-hot encodes ``area_name`` and converts
    boolean columns to integers, returning a new frame.

    NOTE(review): this cell re-defines ``encode_X`` from the previous cell
    with the same behavior and shadows it; one of the two definitions can
    be removed.
    """
    features = convert_area_name(df.copy())
    features = one_hot_encode(features, "area_name")
    is_bool = features.dtypes == "bool"
    for name in features.columns[is_bool]:
        features[name] = features[name].astype(int)
    return features

In [14]:
# Separate the target from the predictors, encode the predictors, and hold
# out 30% of the rows for validation with a fixed seed.
y = df_prepared["salary_to"]
X = encode_X(df_prepared.drop(columns="salary_to"))
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [44]:
# models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameter search space for the random forest. It is only used when
# the plain estimator below is swapped for the GridSearchCV variant.
rf_param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 50, 100],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2"],
)

# Candidate regressors keyed by the name used when reporting scores.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    # Tuned alternative (disabled):
    #   GridSearchCV(
    #       estimator=RandomForestRegressor(random_state=42),
    #       param_grid=rf_param_grid,
    #       scoring="neg_mean_squared_error",
    #       cv=5,
    #       n_jobs=-1,
    #   )
    "Random Forest": RandomForestRegressor(random_state=42),
}

In [45]:
# 5-fold cross-validation on the training split. The scorer returns negated
# SMAPE, so the sign is flipped back before printing.
for model_name, model in models.items():
    fold_scores = cross_val_score(
        model, X_train, y_train, scoring=smape_scorer, cv=5
    )
    mean_smape = -fold_scores.mean()
    print(f"{model_name} Mean SMAPE: {mean_smape}")

Linear Regression Mean SMAPE: 33.871448220923526
Decision Tree Mean SMAPE: 29.86564759522572
Random Forest Mean SMAPE: 29.8396898378242


In [46]:
def print_metrics(y_true, y_pred):
    """Print the SMAPE between ``y_true`` and ``y_pred`` with two decimals."""
    print(f"SMAPE: {smape(y_true, y_pred):.2f}")

In [47]:
# Fit the random forest on the training split (`fit` returns the estimator
# itself) and report SMAPE on the held-out validation rows.
model = models["Random Forest"].fit(X_train, y_train)
y_pred_valid = model.predict(X_valid)
print_metrics(y_valid, y_pred_valid)

SMAPE: 30.46


In [49]:
def check_dt():
    """Fit the decision tree on the training split and print validation SMAPE.

    Reads ``models``, ``X_train``, ``y_train``, ``X_valid`` and ``y_valid``
    from the notebook namespace; the fitted estimator is kept in a local
    variable only.
    """
    tree = models["Decision Tree"].fit(X_train, y_train)
    print_metrics(y_valid, tree.predict(X_valid))

In [50]:
# Run the decision-tree check defined above; it prints the validation SMAPE.
check_dt()

SMAPE: 30.48


# Predict test data


In [20]:
# Clean the raw test features, keep the ids aside for the submission file,
# and encode the remaining columns.
X_test_prepared = prepare_df(x_test_df)
X_test_prepared_ids = X_test_prepared["id"]
X_test = encode_X(X_test_prepared.drop(columns="id"))

In [21]:
# Summary statistics of the encoded test features, as a sanity check before predicting.
X_test.describe()

Unnamed: 0,has_test,response_letter_required,salary_from,salary_gross,area_name_other,area_name_Екатеринбург,area_name_Казань,area_name_Краснодар,area_name_Москва,area_name_Нижний Новгород,area_name_Новосибирск,area_name_Санкт-Петербург
count,9312.0,9312.0,9312.0,9312.0,9312.0,9312.0,9312.0,9312.0,9312.0,9312.0,9312.0,9312.0
mean,0.03,0.04,60930.05,0.33,0.04,0.01,0.01,0.01,0.73,0.0,0.01,0.2
std,0.16,0.19,47506.97,0.47,0.21,0.07,0.08,0.07,0.45,0.06,0.09,0.4
min,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,35000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,48000.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,0.0,0.0,70000.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1000000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# Predict salary_to for the test rows with the fitted `model` and pair each
# prediction with its id.
y_test = model.predict(X_test)
predictions = X_test_prepared_ids.to_frame(name="id").assign(salary_to=y_test)
predictions

Unnamed: 0,id,salary_to
0,20978,52241.40
1,3102,46787.39
2,14731,282663.09
3,16113,237874.23
4,8410,132433.99
...,...,...
9307,29490,68305.55
9308,34298,81844.43
9309,33921,49468.50
9310,8886,68632.48


In [23]:
# `to_csv` does not create missing folders, so make sure the output
# directory exists before writing the submission file.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
predictions.to_csv(RESULTS_DIR / "random_forest.csv", index=False)