Мы случайно пока собирали наш бардак в файлы train.ipynb и test.ipynb видимо где-то открутили/прикрутили фич (потому что код из разных файлов кидали) и итоговая точность на public вместо 97.63 (до 23:59:59 25.10.2025) стала 97.65 (посылка в 00:52 26.10.2025). Старую модель уже не знаем как восстановить (мы сами не в курсе какие там были настройки). В общем надеемся, что из-за этой "оплошности" не будет нюансов и наше положение в итоговой таблице не изменится в обоих посылках. Немного мы опоздали с превращением наших файлов в итоговый формат, приносим извинения!

С уважением, команда rucode-2025-ai-0633

In [1]:
import numpy as np
import pandas as pd

from sklearn.base import TransformerMixin, BaseEstimator
import joblib

In [2]:
test_csv = pd.read_csv('interactions_private_test.csv')
items = pd.read_csv('items.csv')
users = pd.read_csv('users.csv')

In [3]:
test = test_csv.copy(deep=True)
test = test.merge(items, on='item_id', how='left')
test = test.merge(users, on='user_id', how='left')

# Определения функций для выгрузки моделей и пайплайна

In [4]:
def drop_columns(X, cols):
  X = X.copy()

  X = X.drop(cols, axis=1)
  return X

def cosine_lr(iter, max_iter=1000):
  min_lr = 0.0005
  max_lr = 0.1
  lr = min_lr + .5 * (max_lr - min_lr) * (1 + np.cos((iter / max_iter) * np.pi))

  return lr

def preprocessing_int_feature(X):
  X = X.copy()

  X['film_age'] = (X['release_year'].max() - X['release_year'])

  X['age_bin'] = pd.qcut(X['film_age'],
                         q=3,
                         labels=['new', 'mid_age_film', 'old'],
                         duplicates='drop')

  X['total_dur']= np.log1p(X['total_dur'] / 60000)
  X['total_dur_bin'] = pd.qcut(X['total_dur'],
                              q = 5, 
                              labels=['very_short', 'short', 'mid', 'long', 'very_long'],
                              duplicates='drop')


  X['totalD_ageR_Ratio'] = X['total_dur']/(X['age_rating'] + 1) 
  X['film_age'] = (X['release_year'].max() - X['release_year']) 
  X['is_old_small_watch'] = (X['total_dur_bin'].isin(['short', 'very_short']) & X['last_d_count_watch'] < 7)
  X['is_long_with_pop_actor'] = (X['total_dur_bin'].isin(['long', 'very_log']) & X['actors'] > X['actors'].quantile(.75)) 
  X['potential_power'] = X['actors'] * X['directors']



  for col in ['total_dur_bin', 'age_bin']:
    dummies = pd.get_dummies(X[col])
    X = pd.concat([X, dummies], axis=1)
    X = X.drop(col, axis=1)

  return X

def preprocessing_cat_feature(X):
  X = X.copy()

  X['content_type'] = X['content_type'].map({'film':1, 'series':0})
  X['sex'] = X['sex'].map({'Ж' : 1, 'М': 0, 'UNKNOWN':-1})

  age_map = {'age_35_44' : 40,
             'age_25_34' : 29,
             'age_55_64' : 60,
             'age_45_54' : 49,
             'age_18_24' : 21,
             'age_65_inf' : 65,
             'UNKNOWN' : 0}

  X['age'] = X['age'].map(age_map)

  income_map = {'income_0_20' : 10,
                'income_20_40' : 30,
                'income_40_60' : 50,
                'income_60_90' : 75,
                'income_90_150' : 120,
                'income_150_inf' : 120,
                'UNKNOWN' : 0}

  X['income'] = X['income'].map(income_map)


  max_dt = pd.to_datetime(X['last_watch_dt']).dt.normalize().max()
  X['last_watch_dt'] = pd.to_datetime(X['last_watch_dt']).dt.normalize()
  X['last_d_count_watch'] = (max_dt - X['last_watch_dt']).dt.days.astype(np.int64)
  X = X.drop('last_watch_dt', axis=1)

  X['exp_down_lst_day_wtch'] = np.exp(-X['last_d_count_watch'] / 30)

  return X

class FichaEncode(BaseEstimator, TransformerMixin):
  def __init__(self, cols=['actors']):
    self.cols = cols
    self.map_cols = { col:{} for i,col in enumerate(self.cols)}

  def fit(self, X, y=None):

    for col in self.cols:
      for members in X[col]:

        if members is np.nan:
          continue

        members = members.split(', ')
        for member in members:
          self.map_cols[col][member] = self.map_cols[col].get(member, 0) + 1

    return self

  def transform(self, X):


    for col in self.cols:
      col_lst = X[col].str.split(', ')
      sums = col_lst.apply(lambda actors: sum(self.map_cols[col].get(actor, 0) for actor in actors))
      X[col] = np.log1p(sums)

    return X


  def set_output(self, transform=None):
    return self


In [6]:
loaded_weights = joblib.load('model_weights.joblib')

model_xgb_loaded = loaded_weights['model_xgb']
model_lgbm_loaded = loaded_weights['model_lgbm']
preprocessing = loaded_weights['processing_pipeline'] 

In [7]:
data_test = preprocessing.transform(test)

In [9]:
predictions = (model_xgb_loaded.predict_proba(data_test) * 0.3 + 
               model_lgbm_loaded.predict_proba(data_test) * (1 - 0.3))

test_csv['watched_pct'] = predictions[:, 1]

# Если нужен ответ вида true/false 
# test_csv['watched_pct'] = predictions[:, 1] >= 0.5

test_csv.to_csv('output_model.csv', index=False)
