In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [3]:
train = pd.read_parquet("train.parquet") 
test = pd.read_parquet("test.parquet.parquet")
meta_data = pd.read_parquet("campaigns_meta.parquet.parquet")
categories = pd.read_parquet("categories.parquet.csv.parquet")

In [11]:
train['user_id'].nunique()

3263615

In [4]:
categories.head()

Unnamed: 0,microcat_id,level_id,parent_microcat_id,logcat_id,vertical_id,category_id
0,33482,7.0,40172.0,54.0,3.0,3.0
1,27254,5.0,48637.0,55.0,5.0,4.0
2,37005,6.0,15332.0,54.0,3.0,3.0
3,31376,8.0,28137.0,58.0,8.0,4.0
4,20493,4.0,18343.0,24.0,8.0,9.0


In [5]:
user_ads_clicks = train.groupby(["user_id", "adv_campaign_id"], as_index=False)["target"].max()

## Экспозиция

In [4]:
meta_data['start_date'] = pd.to_datetime(meta_data['start_date'])
meta_data['end_date'] = pd.to_datetime(meta_data['end_date'])


In [5]:
print(meta_data['start_date'].min())
print(meta_data['end_date'].max())

2024-05-13 00:00:00
2024-10-21 00:00:00


In [6]:
months = pd.date_range('2024-05-01', '2024-10-01', freq='MS').strftime('%Y-%m').tolist()

def calculate_exposure(row, month):
    start_date = row['start_date']
    end_date = row['end_date']
    
    # Вычисляем начало и конец месяца
    month_start = pd.to_datetime(month + "-01")
    next_month_start = month_start + pd.DateOffset(months=1)
    month_end = next_month_start - pd.Timedelta(days=1)

    # Определяем пересечение интервала компании с месяцем
    overlap_start = max(start_date, month_start)
    overlap_end = min(end_date, month_end)
    
    # Если пересечение есть
    if overlap_start <= overlap_end:
        return (overlap_end - overlap_start).days / (month_end - month_start).days
    else:
        return 0

for month in months:
    meta_data[month] = meta_data.apply(lambda row: calculate_exposure(row, month), axis=1)

meta_data.head()

Unnamed: 0,adv_campaign_id,start_date,end_date,goal_cost,goal_budget,location_id,logcat_id,2024-05,2024-06,2024-07,2024-08,2024-09,2024-10
0,2153,2024-09-21,2024-10-02,6.661659,9429.056096,70,59,0.0,0.0,0.0,0.0,0.310345,0.033333
1,3103,2024-09-10,2024-09-16,2.853378,3844.482933,30,40,0.0,0.0,0.0,0.0,0.206897,0.0
2,2816,2024-09-10,2024-09-17,3.05823,1455.156612,56,65,0.0,0.0,0.0,0.0,0.241379,0.0
3,3603,2024-09-10,2024-09-16,4.395015,2592.232475,30,50,0.0,0.0,0.0,0.0,0.206897,0.0
4,1328,2024-09-10,2024-09-16,3.891329,2836.139672,30,51,0.0,0.0,0.0,0.0,0.206897,0.0


## One Hot Encoding + Экспозиция

In [7]:
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_data = encoder.fit_transform(meta_data[['location_id', 'logcat_id']])
encoded_columns = encoder.get_feature_names_out(['location_id', 'logcat_id'])

encoded_df = pd.DataFrame(encoded_data, columns=encoded_columns, index=meta_data.index)
meta_data_encoded = pd.concat([meta_data, encoded_df], axis=1)
meta_data_encoded.drop(columns=['location_id', 'logcat_id', 'start_date', 'end_date'], inplace=True)

meta_data_encoded.head()


Unnamed: 0,adv_campaign_id,goal_cost,goal_budget,2024-05,2024-06,2024-07,2024-08,2024-09,2024-10,location_id_2,...,logcat_id_48,logcat_id_50,logcat_id_51,logcat_id_56,logcat_id_57,logcat_id_59,logcat_id_63,logcat_id_64,logcat_id_65,logcat_id_66
0,2153,6.661659,9429.056096,0.0,0.0,0.0,0.0,0.310345,0.033333,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,3103,2.853378,3844.482933,0.0,0.0,0.0,0.0,0.206897,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2816,3.05823,1455.156612,0.0,0.0,0.0,0.0,0.241379,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3603,4.395015,2592.232475,0.0,0.0,0.0,0.0,0.206897,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1328,3.891329,2836.139672,0.0,0.0,0.0,0.0,0.206897,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Пространство User

In [8]:
campaign_stats = train.groupby(['user_id', 'adv_campaign_id']).agg(
    impressions=('target', 'size'),  # количество показов (всего записей для пары user_id и adv_campaign_id)
    clicks=('target', 'sum')         # количество кликов (сумма target, где 1 - это клик)
).reset_index()

# Вычисляем CTR для каждой пары (user_id, adv_campaign_id)
campaign_stats['CTR'] = campaign_stats['clicks'] / campaign_stats['impressions']

campaign_stats.head()


Unnamed: 0,user_id,adv_campaign_id,impressions,clicks,CTR
0,1,2,2,0,0.0
1,1,41,1,0,0.0
2,1,57,1,0,0.0
3,1,70,3,0,0.0
4,1,72,2,0,0.0


In [12]:
campaign_stats.shape

(72533686, 5)

In [9]:
chunk_size = 10000 
result_chunks = []

# Читаем и разбиваем данные на порции
for i in range(0, len(campaign_stats), chunk_size):
    # Извлекаем порцию данных
    chunk = campaign_stats.iloc[i:i + chunk_size].copy()
    
    # Объединяем с meta_data_encoded
    chunk = chunk.merge(meta_data_encoded, on='adv_campaign_id')
    features = meta_data_encoded.drop(columns=['adv_campaign_id']).columns
    
    # Умножаем фичи на CTR
    chunk[features] = chunk[features].multiply(chunk['CTR'], axis=0)
    
    # Группируем по user_id и суммируем фичи
    weighted_chunk = chunk.groupby('user_id')[features].sum()
    
    # Добавляем результат в список
    result_chunks.append(weighted_chunk)

# Объединяем все обработанные порции в один DataFrame
weighted_features = pd.concat(result_chunks).groupby('user_id').sum().reset_index()

weighted_features.head()

Unnamed: 0,user_id,goal_cost,goal_budget,2024-05,2024-06,2024-07,2024-08,2024-09,2024-10,location_id_2,...,logcat_id_48,logcat_id_50,logcat_id_51,logcat_id_56,logcat_id_57,logcat_id_59,logcat_id_63,logcat_id_64,logcat_id_65,logcat_id_66
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
weighted_features.query('goal_budget>0')

Unnamed: 0,user_id,goal_cost,goal_budget,2024-05,2024-06,2024-07,2024-08,2024-09,2024-10,location_id_2,...,logcat_id_48,logcat_id_50,logcat_id_51,logcat_id_56,logcat_id_57,logcat_id_59,logcat_id_63,logcat_id_64,logcat_id_65,logcat_id_66
10,11,7.518995,6832.416626,0.0,0.0,0.0,0.000000,0.413793,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
16,17,3.266716,23036.878906,0.0,0.0,0.0,0.000000,0.241379,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
22,23,1.842519,10380.872070,0.0,0.0,0.0,0.000000,0.137931,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
57,58,3.382504,1526.442505,0.0,0.0,0.0,0.000000,0.206897,0.0,0.0,...,0.0,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
68,69,4.552533,8606.907227,0.0,0.0,0.0,0.000000,0.448276,0.0,0.0,...,0.0,0.000000,1.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3263589,3263597,3.212202,11882.625326,0.0,0.0,0.0,0.000000,0.155172,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3263593,3263601,6.230429,8425.365016,0.0,0.0,0.0,0.000000,0.300055,0.0,0.0,...,0.1,0.111111,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0
3263599,3263607,9.213250,24435.951172,0.0,0.0,0.0,0.266667,0.689655,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3263608,3263616,1.289778,1789.360229,0.0,0.0,0.0,0.016667,0.068966,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [13]:
weighted_features.to_csv("/Users/v.s.lavrentev/Documents/Прога/user.csv", index=False)

In [14]:
meta_data_encoded.to_csv("/Users/v.s.lavrentev/Documents/Прога/meta_data_encoded.csv", index=False)

## Матрица косинусной схожести

u - матрица с фичами по юзерам 

a - матрица с фичами по компаниям 

u' - вектор-столбец норм для юзеров 

a'- вектор-строка норм для реклам 


$F = U \cdot A^T$

F / = u' (поэлементно)

F /= a' (поэлементно)
