In [1]:
import os
from google.colab import drive
gdrive_dir = '/content/drive/'
drive.mount(gdrive_dir, force_remount=True)
import pandas as pd
import numpy as np
from scipy.special import expit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score

Mounted at /content/drive/


In [2]:
def transform_csv(initial_train_df, initial_geo_df, initial_referer_df, train_labels_df, flag):
  train_df=initial_train_df.reset_index()
  geo_df=initial_geo_df.reset_index()
  train_labels_df=train_labels_df.reset_index()
  referer_df=initial_referer_df.reset_index()
  referer_df.drop('index', axis=1, inplace=True)

  train_df=train_df.drop_duplicates(ignore_index=True)
  geo_df=geo_df.drop_duplicates(ignore_index=True)
  train_labels_df=train_labels_df.drop_duplicates(ignore_index=True)
  referer_df=referer_df.drop_duplicates(ignore_index=True)
  if not flag:
    #train_labels
    user_id=[]
    target=[]
    for i in range(len(train_labels_df)):
      text=train_labels_df['user_id;target'].iloc[i]
      arr=text.split(';')
      user_id.append(arr[0])
      target.append(arr[1])
    train_labels_df['user_id']=user_id
    train_labels_df['target']=target
    train_labels_df.drop(['index','user_id;target'], axis=1, inplace=True)
    targets_dist=dict(zip(train_labels_df['user_id'],train_labels_df['target']))
  #geo_df
  geo_id=[]
  region_id=[]
  for i in range(len(geo_df)):
    text=geo_df['geo_id;country_id;region_id;timezone_id'].iloc[i]
    arr=text.split(';')
    geo_id.append(arr[0])
    region_id.append(arr[2])
  geo_df['geo_id']=geo_id
  geo_df['region_id']=region_id
  geo_df['region_id'], _=pd.factorize(geo_df['region_id'])
  geo_df.drop(['geo_id;country_id;region_id;timezone_id','index'], axis=1, inplace=True)
  #referer_df
  components=[]
  https=[]
  for i in range(len(referer_df)):
    text=referer_df['component0;component1;component2;component3;component4;component5;component6;component7;component8;component9;referer'].iloc[i]
    arr=text.split(';')
    https.append(arr[-1])
    arr=list(map(int, arr[:-1]))
    components.append(arr)
  referer_df['https']=https
  referer_df['components']=components
  referer_df.drop('component0;component1;component2;component3;component4;component5;component6;component7;component8;component9;referer', axis=1, inplace=True)
  referer_df = referer_df.reset_index(drop=True)
  components_dist=dict(zip(referer_df['https'], referer_df['components']))

  #train_df
  if flag:
    user_id=[]
  else:
    targets=[]

  brw=[]
  region_id_train=[]
  components=[[] for _ in range(10)]
  for i in range(len(train_df)):
    text=train_df['level_0'].iloc[i]
    arr=text.split(';')

    if flag:
      user_id.append(arr[1])
    else:
      if arr[1] not in targets_dist:
        targets.append(-1)
      else:
        targets.append(int(targets_dist[arr[1]]))

    vec_comp=components_dist[arr[2]]
    for j in range(10):
      components[j].append(vec_comp[j])
    current_str=geo_df[geo_df['geo_id']==arr[3]]
    region_id_train.append(current_str['region_id'].iloc[0])
    brw.append(arr[4])

  for j in range(10):
      train_df[f'component_{j}'] = components[j]
  train_df['region_id']=region_id_train
  freq_map = train_df['region_id'].value_counts(normalize=True).to_dict()
  train_df['region_id'] = train_df['region_id'].map(freq_map)

  train_df['brw']=brw
  train_df=train_df.rename(columns={'level_2':'OS'})

  top_brw = train_df['brw'].value_counts().head(3).index
  train_df['brw_processed'] = train_df['brw'].where(train_df['brw'].isin(top_brw), 'other_brw')
  top_os = train_df['OS'].value_counts().head(3).index
  train_df['OS_processed'] = train_df['OS'].where(train_df['OS'].isin(top_os), 'other_os')
  categorical_cols = ['brw_processed', 'OS_processed']
  encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
  encoded_array = encoder.fit_transform(train_df[categorical_cols])
  encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(categorical_cols))
  train_df = pd.concat([train_df.drop(['brw', 'OS', 'brw_processed', 'OS_processed'], axis=1),encoded_df], axis=1)

  train_df.drop(['level_0','level_1','request_ts;user_id;referer;geo_id;user_agent'], axis=1, inplace=True)
  if flag:
    train_df['user_id']=user_id
  else:
    train_df['target']=targets
    train_df=train_df[train_df['target']!=-1]
  return train_df

In [3]:
class Log_regression:
  def __init__(self,x_train, y_train, x_val1, y_val1, x_val2, y_val2):
    self.__x_train=np.array(x_train)
    self.__y_train=np.array(y_train)
    self.__x_val1=np.array(x_val1)
    self.__y_val1=np.array(y_val1)
    self.__x_val2=np.array(x_val2)
    self.__y_val2=np.array(y_val2)
    self.__omega=np.zeros(len(self.__x_train[0]))
    self.__n_iter=200
    self.__threshold=0.5
    self.__alpha=None
    self.__tolerance=0.1
    self.__fit()

  def __fit(self):
    #fit alpha and omega
    alphas=np.logspace(-7,-4,4)
    omega_alpha_dist={}
    metric_alpha_dist={}
    metric_arr=[]
    for alpha in alphas:
      start_omega=np.zeros(len(self.__x_train[0]))
      current_omega=self.__train(start_omega, alpha)
      omega_alpha_dist[alpha]=current_omega
      y_probabilities=self.f(self.__x_val1, current_omega)
      metric=roc_auc_score(self.__y_val1, y_probabilities)
      metric_alpha_dist[metric]=alpha
      metric_arr.append(metric)
    metric_arr.sort()
    self.__alpha=metric_alpha_dist[metric_arr[-1]]
    self.__omega=omega_alpha_dist[self.__alpha]
    #fit threshold
    fpr, tpr, thresholds = roc_curve(self.__y_val2, self.f(self.__x_val2, self.__omega))
    youden_j = tpr - fpr
    self.__threshold = thresholds[np.argmax(youden_j)]
  def __train(self, current_omega,alpha):
    count_iter=0
    prev_L=0
    current_L=0
    while( ((abs(prev_L-current_L)>self.__tolerance)and(count_iter<self.__n_iter)) or (count_iter<2) ):
      if count_iter>0:
        prev_L=current_L
      current_L=self.L(current_omega, self.__x_train, self.__y_train)
      grad_L=self.dL_domega(current_omega, self.__x_train, self.__y_train)
      current_omega=current_omega-alpha*grad_L
      count_iter+=1
    return current_omega
  def L(self,omega,x,y):
    sigmoid_values = self.f(x, omega)
    clipped_values = np.clip(sigmoid_values, 1e-10, 1.0 - 1e-10)
    log_values1=np.log(clipped_values)
    comp1=y @ log_values1
    log_values2=np.log(1-clipped_values)
    comp2=(1-y) @ log_values2
    return -(comp1+comp2)
  def f(self,x,omega):
    return expit(x @ omega)
  def dL_domega(self,omega,x,y):
    grad=np.zeros(len(omega))
    for k in range(len(omega)):
      column_k=x[:,k]
      sigmoid_values = self.f(x, omega)
      xy1=column_k*y
      xy2=column_k*(1-y)
      comp1=xy1 @ (1-sigmoid_values)
      comp2=xy2 @ sigmoid_values
      grad[k]=comp2-comp1
    return grad
  def getter_alpha(self):
    return self.__alpha
  def getter_omega(self):
    return self.__omega
  def getter_threshold(self):
    return self.__threshold
  def predict(self, test_users_df, test_y, test_x):#test_users это dataframe юзеров, test_y список id юзеров из test_df, test_x это список значений фич сэмплов
    y_prob=self.f(test_x, self.__omega)
    temp_df = pd.DataFrame({'user_id': test_y, 'probability': y_prob})
    avg_prob = temp_df.groupby('user_id')['probability'].mean().reset_index()
    test_users_df = test_users_df.merge(avg_prob, on='user_id', how='left')
    test_users_df['prediction'] = np.where(test_users_df['probability'] >= self.__threshold, 1, 0)
    test_users_df.drop('probability', axis=1, inplace=True)
    return test_users_df

In [None]:
# geo_df_init=pd.read_csv('/content/drive/MyDrive/predict_gender/geo_info.csv')
# referer_df_init=pd.read_csv('/content/drive/MyDrive/predict_gender/referer_vectors.csv')
# train_labels_df=pd.read_csv('/content/drive/MyDrive/predict_gender/train_labels.csv')

In [None]:
# train_df_init=pd.read_csv('/content/drive/MyDrive/predict_gender/train.csv')
# data_set=transform_csv(train_df_init,geo_df_init, referer_df_init, train_labels_df,False)
# test_df_init=pd.read_csv('/content/drive/MyDrive/predict_gender/test.csv')
# test_df=transform_csv(test_df_init, geo_df_init, referer_df_init, train_labels_df, True)

In [4]:
test_users_df=pd.read_csv('/content/drive/MyDrive/predict_gender/test_users.csv')
test_users_df=test_users_df.drop_duplicates(ignore_index=True)
data_set=pd.read_csv('/content/drive/MyDrive/predict_gender/processed/dataset.csv')
test_df=pd.read_csv('/content/drive/MyDrive/predict_gender/processed/test_df.csv')

In [None]:
# pd.set_option('display.max_columns', None)

In [6]:
components = [f'component_{i}' for i in range(10)]
X_components = data_set[components].values
scaler = StandardScaler()
X_components_scaled = scaler.fit_transform(X_components)
other_features = data_set.drop(columns=['target'] + components).values
X = np.concatenate([X_components_scaled, other_features], axis=1)
Y=data_set['target'].to_numpy()

x_train, x_other1, y_train, y_other1 = train_test_split(
    X, Y,
    test_size=0.4,
    random_state=42
)
x_val1, x_other2, y_val1, y_other2 = train_test_split(
    x_other1, y_other1,
    test_size=0.625,
    random_state=42
)
x_val2, x_test, y_val2, y_test = train_test_split(
    x_other2, y_other2,
    test_size=0.4,
    random_state=42
)
#x_train y_train (60%); x_val1 y_val1 (15%); x_val2 y_val2 (15%), x_test y_test (10%)

In [7]:
my_log_reg=Log_regression(x_train, y_train, x_val1, y_val1, x_val2, y_val2)
#тест модели по метрике roc
y_prob=my_log_reg.f(x_test, my_log_reg._Log_regression__omega)
roc=roc_auc_score(y_test,y_prob)
print(roc)

np.float64(0.7211202197065136)

In [10]:
y_pred=np.where(y_prob >= my_log_reg._Log_regression__threshold, 1, 0)
f1=f1_score(y_test,y_pred)
print(f1)

0.636991586832546


In [11]:
components = [f'component_{i}' for i in range(10)]
X_components = test_df[components].values
scaler = StandardScaler()
X_components_scaled =scaler.fit_transform(X_components)
other_features =test_df.drop(columns=['user_id'] + components).values
test_x =np.concatenate([X_components_scaled, other_features], axis=1)
test_y =test_df['user_id'].to_numpy()
test_users_df=my_log_reg.predict(test_users_df,test_y,test_x)
test_users_df

Unnamed: 0,user_id,prediction
0,c2802dadd33d8ae09bb366bdd41212ea,0
1,e5b1988db74527ec092f28b0bbfdaac9,0
2,6ef1eedbdb72554e53e69782066065c5,0
3,7e057293ecae62985a327b7af51858ea,0
4,a27bd7ce8828497823fa8d5d05e7bbf7,0
...,...,...
84995,7f18ead960fd4762767a40e58c0f2237,1
84996,336f6e34fdaa6726c4881fe4f9576bce,0
84997,e8e2fe7f5a37fd89df87062da82aa891,0
84998,676e7b7340fbed94ee733109d09e4688,0


In [17]:
import pickle
from google.colab import files
with open('my_log_reg.pkl', 'wb') as file:
    pickle.dump(my_log_reg, file)
files.download('my_log_reg.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>