<a href="https://colab.research.google.com/github/amanteur/MLDM_KaggleProject/blob/main/MLDM_project_model_data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List
import random

%matplotlib inline

# Error handling

In [None]:
# PATHS
# Directory holding the preprocessed feature tables, and the file name of each split inside it.
PATH = '/content/drive/MyDrive/datasets/kaggle_musrec/preprocessed/'
FEATURES_PATHS = dict(
    train='train_expanded_result.csv',
    test='test_expanded_result.csv',
)

In [None]:
# Columns read from the expanded training table (features plus the `target` label).
cols = ['msno', 'song_id', 'source_system_tab', 'source_screen_name', 'source_type',
        'target', 'user_plays', 'song_plays', 'user_repeat_ratio', 'song_repeat_ratio', 'city', 'bd', 'gender', 'registered_via',
        'membership_days', 'artist_name', 'composer', 'lyricist', 'language', 'genre_ids',
        'song_length_cat', 'artist_name_number', 'composer_number', 'lyricist_number',
        'genre_number', 'name', 'decade', 'country', 'genre_first', 'source_type_ratio',
        'source_system_tab_ratio', 'artist_name_ratio', 'composer_ratio']
# The test table carries the same features without the label. Derive the list
# instead of repeating it so the two selections cannot drift apart.
test_cols = [name for name in cols if name != 'target']

In [None]:
# Load only the selected columns of the expanded training table.
train = pd.read_csv(
    f"{PATH}{FEATURES_PATHS['train']}",
    usecols=cols,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Load only the selected columns of the expanded test table (no `target` column).
test = pd.read_csv(
    f"{PATH}{FEATURES_PATHS['test']}",
    usecols=test_cols,
)

  interactivity=interactivity, compiler=compiler, result=result)


## Cleaning data

In [None]:
def preprocess(df, test=False):
  """Normalise the mixed-type columns of a feature table.

  The frame is modified in place and the same object is returned.

  * `language`, `song_length_cat` and `decade` have every value passed
    through `str`.
  * The four `*_number` count columns are parsed as numbers (unparseable
    entries become NaN) and capped at a per-column maximum.
  * With `test=True`, `genre_first` is additionally stringified and the two
    repeat-ratio columns are parsed as numbers (unparseable entries become NaN).

  Args:
    df: feature table containing the columns listed above.
    test: apply the extra test-table conversions when True.

  Returns:
    `df` itself, after the conversions.
  """
  text_columns = ('language', 'song_length_cat', 'decade')
  # Upper bound applied to each count column after numeric parsing.
  count_caps = {
      'artist_name_number': 4,
      'composer_number': 6,
      'lyricist_number': 4,
      'genre_number': 3,
  }

  for name in text_columns:
    df[name] = df[name].apply(str)

  for name, cap in count_caps.items():
    parsed = pd.to_numeric(df[name], errors='coerce')
    df[name] = parsed.clip(upper=cap)

  if not test:
    return df

  # Test-only conversions.
  df['genre_first'] = df['genre_first'].apply(str)
  for name in ('user_repeat_ratio', 'song_repeat_ratio'):
    df[name] = pd.to_numeric(df[name], errors='coerce')
  return df

In [None]:
# Clean the training table; with the default test=False, genre_first and the repeat ratios are left untouched.
train = preprocess(train)

In [None]:
# Clean the test table; test=True also stringifies genre_first and parses the two repeat ratios as numbers.
test = preprocess(test, test=True)

In [None]:
# Inspect the column dtypes of the training table after preprocessing.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7377418 entries, 0 to 7377417
Data columns (total 33 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   msno                     object 
 1   song_id                  object 
 2   source_system_tab        object 
 3   source_screen_name       object 
 4   source_type              object 
 5   target                   int64  
 6   user_plays               int64  
 7   song_plays               int64  
 8   user_repeat_ratio        float64
 9   song_repeat_ratio        float64
 10  city                     int64  
 11  bd                       int64  
 12  gender                   float64
 13  registered_via           int64  
 14  membership_days          int64  
 15  artist_name              object 
 16  composer                 object 
 17  lyricist                 object 
 18  language                 object 
 19  genre_ids                object 
 20  song_length_cat          object 
 21  artist_n

In [None]:
# Inspect the column dtypes of the test table after preprocessing.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556790 entries, 0 to 2556789
Data columns (total 32 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   msno                     object 
 1   song_id                  object 
 2   source_system_tab        object 
 3   source_screen_name       object 
 4   source_type              object 
 5   user_plays               int64  
 6   song_plays               int64  
 7   user_repeat_ratio        float64
 8   song_repeat_ratio        float64
 9   city                     int64  
 10  bd                       int64  
 11  gender                   float64
 12  registered_via           int64  
 13  membership_days          int64  
 14  artist_name              object 
 15  composer                 object 
 16  lyricist                 object 
 17  language                 object 
 18  genre_ids                object 
 19  song_length_cat          object 
 20  artist_name_number       float64
 21  composer

## Train/validation split

In [None]:
# Sequential (unshuffled) split: the first `alpha` share of the rows is kept
# for training, the remaining rows form the validation set.
alpha = 0.8
train_size = len(train)
pct = int(alpha * train_size)
train_split = train.iloc[:pct].copy()
val_split = train.iloc[pct:].copy()

In [None]:
# Remove the ratio features from the validation rows; they are rebuilt from
# the training split further down (see add_ratio_column).
val_split = val_split.drop(columns=[
    'user_repeat_ratio',
    'song_repeat_ratio',
    'source_type_ratio',
    'source_system_tab_ratio',
    'artist_name_ratio',
    'composer_ratio',
])

In [None]:
# Output directory for the v2 splits (train.csv / val.csv / test.csv are written here in the next cell).
SAVE_PATH = '/content/drive/MyDrive/datasets/kaggle_musrec/prepr_v2/'

In [None]:
# Persist the three v2 frames as CSV; the row index is written as the first column.
for _name, _frame in (('train.csv', train_split),
                      ('val.csv', val_split),
                      ('test.csv', test)):
  _frame.to_csv(SAVE_PATH + _name)

# Reading again

In [None]:
# Same v2 directory as in the saving section, re-declared here — presumably so this
# part of the notebook can be run on its own after a kernel restart (TODO confirm).
SAVE_PATH = '/content/drive/MyDrive/datasets/kaggle_musrec/prepr_v2/'

In [None]:
# Columns forced to str on load so they are not re-inferred as numbers.
_text_dtypes = {'language': str, 'song_length_cat': str, 'decade': str}

# The first CSV column is the saved row index.
train = pd.read_csv(SAVE_PATH + 'train.csv', dtype=dict(_text_dtypes), index_col=0)
val = pd.read_csv(SAVE_PATH + 'val.csv', dtype=dict(_text_dtypes), index_col=0)
# The test table additionally reads genre_first as str.
test = pd.read_csv(SAVE_PATH + 'test.csv',
                   dtype={'genre_first': str, **_text_dtypes},
                   index_col=0)

  mask |= (ar1 == a)


In [None]:
# Cast every object-typed column of `train`, and the same-named column of
# `val`, to pandas' category dtype.
_object_columns = [name for name, kind in train.dtypes.items() if kind == object]
for col in _object_columns:
  train[col] = train[col].astype('category')
  val[col] = val[col].astype('category')

In [None]:
# Cast every object-typed column of `test` to pandas' category dtype.
_object_columns = [name for name, kind in test.dtypes.items() if kind == object]
for col in _object_columns:
  test[col] = test[col].astype('category')

In [None]:
# Confirm the object columns of the training split are now category-typed.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5901934 entries, 0 to 5901933
Data columns (total 33 columns):
 #   Column                   Dtype   
---  ------                   -----   
 0   msno                     category
 1   song_id                  category
 2   source_system_tab        category
 3   source_screen_name       category
 4   source_type              category
 5   target                   int64   
 6   user_plays               int64   
 7   song_plays               int64   
 8   user_repeat_ratio        float64 
 9   song_repeat_ratio        float64 
 10  city                     int64   
 11  bd                       int64   
 12  gender                   float64 
 13  registered_via           int64   
 14  membership_days          int64   
 15  artist_name              category
 16  composer                 category
 17  lyricist                 category
 18  language                 category
 19  genre_ids                category
 20  song_length_cat         

In [None]:
# Confirm the object columns of the validation split are now category-typed.
val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1475484 entries, 5901934 to 7377417
Data columns (total 27 columns):
 #   Column              Non-Null Count    Dtype   
---  ------              --------------    -----   
 0   msno                1475484 non-null  category
 1   song_id             1475484 non-null  category
 2   source_system_tab   1475484 non-null  category
 3   source_screen_name  1475484 non-null  category
 4   source_type         1475484 non-null  category
 5   target              1475484 non-null  int64   
 6   user_plays          1475484 non-null  int64   
 7   song_plays          1475484 non-null  int64   
 8   city                1475484 non-null  int64   
 9   bd                  1475484 non-null  int64   
 10  gender              1475484 non-null  float64 
 11  registered_via      1475484 non-null  int64   
 12  membership_days     1475484 non-null  int64   
 13  artist_name         1475484 non-null  category
 14  composer            1475484 non-null  catego

## Ratio features for the validation split

In [None]:
# 'user_repeat_ratio', 'song_repeat_ratio', 'source_type_ratio', 'source_system_tab_ratio', 'artist_name_ratio', 'composer_ratio'
def add_ratio_column(val_df: pd.DataFrame, 
                     train_df: pd.DataFrame, 
                     target_col: str, 
                     id_col: list):
  """Attach `target_col` to `val_df` using values already present in `train_df`.

  Single key (`id_col` has one name): every row of `val_df` receives the
  value `train_df` holds for the same key (the last training row wins when a
  key repeats). Keys absent from `train_df` get the median of
  `train_df[target_col]`. `val_df` is modified in place and returned, and its
  index is kept.

  Several keys: rows are first matched on the full key combination (first
  training row per combination). Rows still missing a value fall back, key by
  key in the order given, to the per-key median of `target_col` in
  `train_df`, and finally to the overall median. A new frame produced by
  `merge` is returned (fresh 0..n-1 index); `val_df` is not modified.

  Args:
    val_df: frame to receive the column; must contain the key column(s).
    train_df: frame holding the key column(s) and `target_col`.
    target_col: name of the numeric column to transfer.
    id_col: list of key column names; a bare string is treated as one key.

  Returns:
    The frame with a float `target_col` and no missing values in it.

  Raises:
    ValueError: if `id_col` is empty.
  """
  if isinstance(id_col, str):
    id_col = [id_col]
  id_col = list(id_col)
  if not id_col:
    raise ValueError('id_col must name at least one key column')

  overall_median = train_df[target_col].median()

  if len(id_col) == 1:
    key = id_col[0]
    # Restrict the lookup table to keys that actually occur in val_df.
    shared_keys = set(train_df[key]).intersection(val_df[key])
    known = train_df.loc[train_df[key].isin(shared_keys), [key, target_col]]
    lookup = known.set_index(key)[target_col].to_dict()
    # Map on plain object values: mapping a categorical key can return a
    # categorical result, on which the median fill below fails (the median is
    # not one of its categories) or which leaves a non-float column behind.
    mapped = val_df[key].astype(object).map(lookup)
    val_df[target_col] = mapped.astype(float).fillna(overall_median)
    return val_df

  # Drop a stale copy of the column so a re-run does not make merge emit
  # `<target_col>_x` / `<target_col>_y` and then fail on the lookup below.
  base = val_df.drop(columns=[target_col], errors='ignore')
  exact = train_df.drop_duplicates(subset=id_col)[id_col + [target_col]]
  result = base.merge(exact, how='left', on=id_col)

  for col in id_col:
    per_key = train_df.groupby(col, observed=True)[target_col].median().reset_index()
    # Left merge on a unique key keeps row count and order, so the fallback
    # values line up positionally with `result`.
    fallback = result[[col]].merge(per_key, how='left', on=[col])
    result[target_col] = np.where(result[target_col].isna(),
                                  fallback[target_col],
                                  result[target_col])

  result[target_col] = result[target_col].fillna(overall_median)
  return result

In [None]:
# Ratio feature -> key column(s) it is looked up by.
col_dict = {
    'user_repeat_ratio': ['msno'],
    'song_repeat_ratio': ['song_id'],
}
# The remaining ratios are keyed per user and per value of the named field.
col_dict.update({
    f'{field}_ratio': ['msno', field]
    for field in ('source_type', 'source_system_tab', 'artist_name', 'composer')
})

In [None]:
# Rebuild each ratio feature on the validation frame from the training frame.
for ratio_name, key_columns in col_dict.items():
  val = add_ratio_column(val, train, ratio_name, key_columns)

## Saving again

In [None]:
# Output directory for the v3 artefacts, written after the ratio columns were added to the validation frame.
SAVE_PATH_v3 = '/content/drive/MyDrive/datasets/kaggle_musrec/prepr_v3/'

In [None]:
# Persist the three v3 frames as CSV; the row index is written as the first column.
for _name, _frame in (('train.csv', train),
                      ('val.csv', val),
                      ('test.csv', test)):
  _frame.to_csv(SAVE_PATH_v3 + _name)

# Reading again (second reload)

In [None]:
# Reload the frames that feed the model. They are read from SAVE_PATH_v3: the
# v2 directory (SAVE_PATH) holds the validation split *before* the ratio
# features were added, so reading it would drop those columns again.
_text_dtypes = {'language': str, 'song_length_cat': str, 'decade': str}

train = pd.read_csv(SAVE_PATH_v3 + 'train.csv', dtype=dict(_text_dtypes), index_col=0)
val = pd.read_csv(SAVE_PATH_v3 + 'val.csv', dtype=dict(_text_dtypes), index_col=0)
test = pd.read_csv(SAVE_PATH_v3 + 'test.csv',
                   dtype={'genre_first': str, **_text_dtypes},
                   index_col=0)

# CSV does not store the category dtype set earlier in the notebook, so the
# text columns come back as object; cast them again, as in the earlier cells.
for frame in (train, val, test):
  for col in [name for name, kind in frame.dtypes.items() if kind == object]:
    frame[col] = frame[col].astype('category')

# The ratio columns were appended to the end of `val`, so its column order
# differs from `train`. Put both frames in the same order so features line up
# by position (raises KeyError if `val` lacks a training column).
val = val[train.columns]

# Model

In [None]:
import lightgbm as lgb
import gc

In [None]:
# Separate the binary label from the features for both splits.
y_train = train['target'].values
X_train = train.drop(columns=['target'])

y_val = val['target'].values
X_val = val.drop(columns=['target'])

# The full frames are no longer needed; release them before building the
# LightGBM datasets.
del train, val
gc.collect()

d_train = lgb.Dataset(X_train, label=y_train)
d_val = lgb.Dataset(X_val, label=y_val)

In [None]:
print('Training LGBM model...')
# Binary objective ('application' is the parameter name used here), AUC as
# the validation metric, trees limited to depth 8 with up to 2**8 leaves.
params = {
    'learning_rate': 0.2,
    'application': 'binary',
    'max_depth': 8,
    'num_leaves': 2**8,
    'verbosity': 0,
    'metric': 'auc',
}

# 50 boosting rounds, reporting the validation metric every 5 rounds.
model = lgb.train(
    params,
    train_set=d_train,
    num_boost_round=50,
    valid_sets=d_val,
    verbose_eval=5,
)

Training LGBM model...
[5]	valid_0's auc: 0.530634
[10]	valid_0's auc: 0.608904
[15]	valid_0's auc: 0.623791
[20]	valid_0's auc: 0.629328
[25]	valid_0's auc: 0.644016
[30]	valid_0's auc: 0.66177
[35]	valid_0's auc: 0.663229
[40]	valid_0's auc: 0.6667
[45]	valid_0's auc: 0.665565
[50]	valid_0's auc: 0.665001


In [None]:
print('Making predictions and saving them...')
# X_val / y_val were already built when the LightGBM datasets were created,
# and `val` itself was deleted there to free memory. Re-deriving them from
# `val` raises NameError on a top-to-bottom run, so score the existing frame.
val_pred = model.predict(X_val)