In [1]:
import pandas as pd

## Подготовка данных

In [2]:
# Load the salary dataset from CSV; the cells below use its year, month, region and mean columns.
df = pd.read_csv('data/salaries.csv')

In [3]:
# Keep only the mean salary (median/modal are not referenced by the cells shown here)
# and discard rows with missing values.
# `df` is rebound instead of mutated with inplace=True, and errors='ignore' tolerates
# already-dropped columns, so re-running this cell no longer raises KeyError.
df = (
    df
    .drop(columns=['median', 'modal'], errors='ignore')
    .dropna()
)

In [4]:
# Preview the cleaned frame; pandas elides the middle rows of a long frame in the rendered output.
df

Unnamed: 0,parsed_from,year,month,region,mean
0,https://gorodrabot.ru/salary?y=2024&mnt=%D0%BE...,2024,октябрь,омск,61046.0
1,https://gorodrabot.ru/salary?y=2024&mnt=%D0%B4...,2024,декабрь,воронеж,58788.0
2,https://gorodrabot.ru/salary?y=2024&mnt=%D0%BE...,2024,октябрь,воронеж,58788.0
3,https://gorodrabot.ru/salary?y=2023&mnt=%D1%84...,2023,февраль,омск,51550.0
4,https://gorodrabot.ru/salary?y=2023&mnt=%D0%B0...,2023,апрель,воронеж,54240.0
...,...,...,...,...,...
2683,https://gorodrabot.ru/salary?y=2019&mnt=%D0%B0...,2019,август,донецк,33594.0
2684,https://gorodrabot.ru/salary?y=2017&mnt=%D0%B0...,2017,апрель,донецк,23127.0
2685,https://gorodrabot.ru/salary?y=2019&mnt=%D0%B8...,2019,июнь,ростов-на-дону,35343.0
2686,https://gorodrabot.ru/salary?y=2019&mnt=%D0%BE...,2019,октябрь,донецк,31141.0


In [5]:
# Russian month name (as spelled in the `month` column) -> calendar month number, 1..12.
# Built from an ordered tuple so the dict keeps calendar order.
month_mapping = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            "январь",
            "февраль",
            "март",
            "апрель",
            "май",
            "июнь",
            "июль",
            "август",
            "сентябрь",
            "октябрь",
            "ноябрь",
            "декабрь",
        ),
        start=1,
    )
}
# Earliest year present in the data for each region (columns: region, year).
min_year_by_region = df.groupby('region')['year'].min().reset_index()


def get_min_year(region: str) -> int:
    """Return the earliest year recorded for ``region``.

    Raises:
        IndexError: if ``region`` does not occur in ``min_year_by_region``.
    """
    is_region = min_year_by_region['region'] == region
    matching_years = min_year_by_region.loc[is_region, 'year']
    return matching_years.values[0]

def get_month_number(month: str) -> int:
    """Translate a month name into its calendar number via ``month_mapping``.

    Raises:
        KeyError: if ``month`` is not a key of ``month_mapping``.
    """
    month_number = month_mapping[month]
    return month_number

def create_period(row) -> int:
    """Return the 1-based month index of ``row``, counted from January of the region's first year.

    ``row`` is anything indexable by "region", "year" and "month"
    (a DataFrame row or a plain dict).
    """
    first_year = get_min_year(row["region"])
    month_of_year = get_month_number(row["month"])
    years_elapsed = row["year"] - first_year

    return years_elapsed * 12 + month_of_year


# Attach the period index to every row.
df['period'] = df.apply(create_period, axis='columns')

## Задание 1
#### Коэффициенты линейной регрессии для каждого региона
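- $t$ — номер периода (порядковый номер месяца, считая с января первого года наблюдений в регионе)
- $y$ — значение (средняя зарплата за период)
- $n$ — количество периодов (наблюдений) в регионе

Модель тренда: $y = a_0 + a_1 t$. Коэффициенты находятся методом наименьших квадратов:

$$a_1 = \frac{n\sum t y - \sum t \sum y}{n\sum t^2 - \left(\sum t\right)^2}, \qquad a_0 = \frac{\sum y - a_1 \sum t}{n}$$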

In [6]:
regions = df["region"].unique()


def _trend_coefficients(region_df):
    """Fit y = a_0 + a_1 * t by ordinary least squares and return ``(a_1, a_0)``.

    t is the ``period`` column, y is the ``mean`` column and n is the number of rows.
    """
    t = region_df['period']
    y = region_df['mean']
    n = len(region_df)

    sum_t = t.sum()
    sum_y = y.sum()
    sum_tt = (t ** 2).sum()
    sum_ty = (t * y).sum()

    a_1 = (n * sum_ty - sum_t * sum_y) / (n * sum_tt - sum_t ** 2)
    a_0 = (sum_y - a_1 * sum_t) / n
    return a_1, a_0


# One [region, slope, intercept] record per region, in order of first appearance.
data = []
for region in regions:
    a_1, a_0 = _trend_coefficients(df[df["region"] == region])
    data.append([region, a_1, a_0])


In [7]:
# Collect the per-region coefficients into a lookup table: a_1 is the slope, a_0 the intercept.
linear_regression_df = pd.DataFrame(data, columns=['region', 'a_1', 'a_0'])
linear_regression_df

Unnamed: 0,region,a_1,a_0
0,омск,411.294845,21321.158333
1,воронеж,358.294784,25159.400877
2,пермь,388.044716,23207.216667
3,волгоград,395.122762,21744.400219
4,донецк,655.225807,14120.433772
5,саратов,405.669093,21927.642763
6,тюмень,433.657983,25151.889912
7,тольятти,425.417241,17324.659649
8,санкт-петербург,387.382298,31645.937719
9,барнаул,390.866902,19276.580263


#### Предсказание зарплаты для каждого региона на несколько периодов вперёд (`future_periods`)

In [8]:
# Flag column: False for observed rows; forecast rows appended later are marked True.
# NOTE(review): re-running this cell after the forecast cell resets the flag on forecast rows too.
df['predicted'] = False

def get_linear_regression_prediction(region: str, period: int) -> float:
    """Evaluate the fitted trend line of ``region`` at ``period``.

    Looks up the region's coefficients in ``linear_regression_df`` and returns
    ``a_1 * period + a_0``.

    Raises:
        IndexError: if ``region`` has no row in ``linear_regression_df``.
    """
    is_region = linear_regression_df['region'] == region
    coefficients = linear_regression_df.loc[is_region].iloc[0]
    slope, intercept = coefficients['a_1'], coefficients['a_0']
    return slope * period + intercept


In [9]:
future_periods = 3

# Month names in calendar order; position 0 is January.
month_names = list(month_mapping.keys())

# Forecast from observed rows only, so re-running this cell replaces earlier
# forecasts instead of extrapolating from them.
observed_df = df[~df['predicted'].astype(bool)]

forecast_rows = []
for region in regions:
    region_df = observed_df[observed_df["region"] == region]
    last_period = region_df['period'].max()
    last_period_row = region_df[region_df['period'] == last_period].iloc[0]

    last_year = last_period_row["year"]
    # Zero-based month index. The previous code mixed the 1-based month number with
    # 0-based list indexing (and with `// 12` for the year rollover), which made every
    # forecast land one month too late (e.g. October -> December, December -> February).
    last_month_index = get_month_number(last_period_row["month"]) - 1

    for i in range(1, future_periods + 1):
        year_offset, new_month_index = divmod(last_month_index + i, 12)
        new_year = last_year + year_offset
        new_month = month_names[new_month_index]
        new_period = create_period(dict(region=region, year=new_year, month=new_month))

        forecast_rows.append(dict(
            region=region,
            year=new_year,
            month=new_month,
            period=new_period,
            mean=get_linear_regression_prediction(region, new_period),
            predicted=True,
        ))

# Append all forecasts in one concat instead of growing the frame row by row.
if forecast_rows:
    df = pd.concat([observed_df, pd.DataFrame(forecast_rows)], ignore_index=True)
            

In [10]:
from matplotlib import pyplot as plt
import seaborn as sns

from datetime import date

In [None]:
# Draw and save one chart per region: observed points, forecast points and the fitted trend.
# All regions are plotted; the former `regions[5:100]` slice silently skipped the first five.
for region in regions:
    region_df = df[df["region"] == region].sort_values(by='period')

    # Derive the plotting columns on a new frame rather than assigning onto a filtered slice.
    region_df = region_df.assign(
        # First day of the month, used as the x coordinate.
        date=region_df.apply(
            lambda row: date(row['year'], get_month_number(row['month']), 1),
            axis=1,
        ),
        # Value of the fitted trend line at each period (observed and forecast alike).
        predicted_value=region_df.apply(
            lambda row: get_linear_regression_prediction(region, row['period']),
            axis=1,
        ),
    )

    is_forecast = region_df['predicted'].astype(bool)
    real_data = region_df[~is_forecast]
    predicted_data = region_df[is_forecast]

    fig, ax = plt.subplots(figsize=(16, 8))

    sns.lineplot(
        data=real_data,
        x='date',
        y='mean',
        marker='o',
        markersize=10,
        linestyle='',
        color='blue',
        label='Реальные данные',
        ax=ax,
    )
    sns.lineplot(
        data=predicted_data,
        x='date',
        y='mean',
        marker='D',
        markersize=10,
        linestyle='',
        color='red',
        label='Прогноз',
        ax=ax,
    )
    sns.lineplot(
        data=region_df,
        x='date',
        y='predicted_value',
        color='lime',
        linewidth=0.8,
        label='Тренд',
        ax=ax,
    )

    ax.set_title(f"Зарплаты в регионе {region.capitalize()}", fontsize=18, weight='bold')
    ax.set_xlabel("Дата", fontsize=14)
    ax.set_ylabel("Зарплата", fontsize=14)

    ax.tick_params(axis='x', labelrotation=45, labelsize=12)
    ax.tick_params(axis='y', labelsize=12)

    ax.grid(True, color='gray', linestyle='--', linewidth=0.5)
    ax.legend(fontsize=12)

    fig.savefig(f"results/linear_regression/{region}.png")

    # Render the figure in the notebook, then release it so figures do not
    # accumulate in memory across iterations.
    plt.show()
    plt.close(fig)