In [1]:
# Guide: https://www.tensorflow.org/decision_forests/tutorials/beginner_colab

In [695]:
# Standard stack
import datetime
import pandas as pd
import numpy as np
from datetime import date
import re

# Visualization
from pandas_profiling import ProfileReport
#import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
import tensorflow as tf
import tensorflow_decision_forests as tfdf
try:
    from wurlitzer import sys_pipes
except:
    from colabtools.googlelog import CaptureLog as sys_pipes

# Scikit-learn packages
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold

# display
from IPython.core.magic import register_line_magic
from IPython.display import Javascript

# Constants

In [696]:
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"
META_PATH = "data/artists.csv"
label = "popularity_song"

# Load Data

In [697]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
meta = pd.read_csv(META_PATH)
submission = pd.read_csv(TEST_PATH)

In [698]:
meta.head()

Unnamed: 0,id,followers,genres,name,popularity
0,55CXG5KDJpRYwBopfYAJHa,21756,"['country blues', 'country rock', 'piedmont bl...",Jorma Kaukonen,40
1,08mjMUUjyTchMHCW7evc3R,640993,['turkish pop'],Hande Yener,62
2,3Ebn7mKYzD0L3DaUB1gNJZ,161509,"['celtic', 'irish folk']",Christy Moore,56
3,7GfaHcpmNcrcHoyGnOBsAz,9578,"['kindermusik', 'kleine hoerspiel']",Die Biene Maja,56
4,1DYXGLnfNDt8mO2aK9k83j,48876,"['opm', 'vispop']",Jay-R Siaboc,39


In [699]:
meta['genres'] = meta['genres'].str.replace("[", '')
meta['genres'] = meta['genres'].str.replace("]", '')
meta['genres'] = meta['genres'].str.replace("'", '')
meta['num_genres'] = meta['genres'].apply(lambda x: len(str(x).split(',')))

  meta['genres'] = meta['genres'].str.replace("[", '')
  meta['genres'] = meta['genres'].str.replace("]", '')


In [700]:
new = meta["genres"].str.split(",",expand = True)

In [701]:
new = new.rename(columns={0:'0',1:'1',2:'2',3:'3',4:'4',5:'5',6:'6',7:'7',8:'8',9:'9',10:'10',11:'11',12:'12',13:'13',14:'14',15:'15',16:'16'})

In [702]:
meta = pd.concat([meta,new], axis=1)

In [703]:
meta

Unnamed: 0,id,followers,genres,name,popularity,num_genres,0,1,2,3,...,7,8,9,10,11,12,13,14,15,16
0,55CXG5KDJpRYwBopfYAJHa,21756,"country blues, country rock, piedmont blues",Jorma Kaukonen,40,3,country blues,country rock,piedmont blues,,...,,,,,,,,,,
1,08mjMUUjyTchMHCW7evc3R,640993,turkish pop,Hande Yener,62,1,turkish pop,,,,...,,,,,,,,,,
2,3Ebn7mKYzD0L3DaUB1gNJZ,161509,"celtic, irish folk",Christy Moore,56,2,celtic,irish folk,,,...,,,,,,,,,,
3,7GfaHcpmNcrcHoyGnOBsAz,9578,"kindermusik, kleine hoerspiel",Die Biene Maja,56,2,kindermusik,kleine hoerspiel,,,...,,,,,,,,,,
4,1DYXGLnfNDt8mO2aK9k83j,48876,"opm, vispop",Jay-R Siaboc,39,2,opm,vispop,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17713,0qU8UdzIf1I2ZxyI5OHdut,1458,,Josef Strauss,35,1,,,,,...,,,,,,,,,,
17714,5Z4mEhHsWHyS10omStJ81u,25718,classic greek rock,Pavlos Sidiropoulos,38,1,classic greek rock,,,,...,,,,,,,,,,
17715,6OAUJXe2dPNUp5eMkh1ErT,224,indie boliviano,H3O,19,1,indie boliviano,,,,...,,,,,,,,,,
17716,786hGmAEXHUeCdKPAj3JIa,178395,"jawaiian, nz hip hop, nz reggae, polynesian pop",Katchafire,57,4,jawaiian,nz hip hop,nz reggae,polynesian pop,...,,,,,,,,,,


In [704]:
# Join Artists
def join_metadata(dataset1, dataset2):
    dataset1['id_artists'] = dataset1['id_artists'].str.replace("[", '')
    dataset1['id_artists'] = dataset1['id_artists'].str.replace("]", '')
    dataset1['id_artists'] = dataset1['id_artists'].str.replace("'", '')
    dataset1 = dataset1.merge(dataset2, left_on='id_artists', right_on='id', how='left', suffixes=('_song', '_artists'))
    return dataset1

train = join_metadata(train, meta)
test = join_metadata(test, meta)

  dataset1['id_artists'] = dataset1['id_artists'].str.replace("[", '')
  dataset1['id_artists'] = dataset1['id_artists'].str.replace("]", '')


In [705]:
train.head(1)

Unnamed: 0,id_song,name_song,popularity_song,duration_ms,artists,id_artists,danceability,energy,key,loudness,...,7,8,9,10,11,12,13,14,15,16
0,269,blun7 a swishland,63,167760.0,['tha Supreme'],19i93sA0D7yS9dYoVNBqAA,0.692,0.792,7,-5.984,...,,,,,,,,,,


# Preprocess

### Drop Columns

In [709]:
train.columns

Index(['name_song', 'popularity_song', 'duration_ms', 'artists',
       'danceability', 'energy', 'key', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'release_year', 'release_month', 'release_day', 'followers', 'genres',
       'name_artists', 'popularity_artists', 'num_genres', '0', '1', '2', '3',
       '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16'],
      dtype='object')

In [707]:
to_drop = ['name_song','id_song','id_artists','artists']

In [710]:
train = train.drop(to_drop,axis=1)
test = test.drop(to_drop,axis=1)
test = test.rename(columns={'popularity':'popularity_artists'})

KeyError: "['id_song' 'id_artists'] not found in axis"

### Missing Values

In [711]:
def inpute_missing(dataset):
    """ 
    Edit this to fix nulls. Default version replaces all int/float with 0
    """
    for col in dataset.columns:
        if dataset[col].dtype not in [str, object]:
            dataset[col] = dataset[col].fillna(dataset[col].mean())
    return dataset

train = inpute_missing(train)
test = inpute_missing(test)

In [712]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21000 entries, 0 to 20999
Data columns (total 39 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name_song           20999 non-null  object 
 1   popularity_song     21000 non-null  int64  
 2   duration_ms         21000 non-null  float64
 3   artists             21000 non-null  object 
 4   danceability        21000 non-null  float64
 5   energy              21000 non-null  float64
 6   key                 21000 non-null  int64  
 7   loudness            21000 non-null  float64
 8   speechiness         21000 non-null  float64
 9   acousticness        21000 non-null  float64
 10  instrumentalness    21000 non-null  float64
 11  liveness            21000 non-null  float64
 12  valence             21000 non-null  float64
 13  tempo               21000 non-null  float64
 14  release_year        21000 non-null  int64  
 15  release_month       21000 non-null  float64
 16  rele

### NLP

In [713]:
def nlp_transforms(dataset):
    """ 
    NLP tranforms here. Default, None...
    """
    return dataset

train = nlp_transforms(train)
test = nlp_transforms(test)

### Computations

In [714]:
train.columns

Index(['name_song', 'popularity_song', 'duration_ms', 'artists',
       'danceability', 'energy', 'key', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'release_year', 'release_month', 'release_day', 'followers', 'genres',
       'name_artists', 'popularity_artists', 'num_genres', '0', '1', '2', '3',
       '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16'],
      dtype='object')

In [715]:
def computation_transforms(dataset):
    """ 
    New Feature Engineering. Default, None...
    """
    date=dataset.apply(lambda x: datetime.date(int(x['release_year']), int(x['release_month']), int(x['release_day'])),axis=1)
    dataset['date']=date
    dataset['date'] = pd.to_datetime(dataset['date'])
    
    dataset['today'] =datetime.date.today()
    dataset['today'] = pd.to_datetime(dataset['today'])

    
    #dataset['new_feat_1'] = dataset['popularity_song'] - dataset['popularity_artists']
    
    #dataset['new_feat_2'] = dataset['today'] - dataset['date']
    #dataset['new_feat_2'] = dataset['new_feat_2'].dt.days
    
    dataset = dataset.drop(['date','today'], axis=1)
    return dataset

train = computation_transforms(train)
test = computation_transforms(test)

# Cross Validate

In [716]:
# The hyper-parameter templates of the Gradient Boosted Tree model.
print(tfdf.keras.GradientBoostedTreesModel.predefined_hyperparameters())

[HyperParameterTemplate(name='better_default', version=1, parameters={'growing_strategy': 'BEST_FIRST_GLOBAL'}, description='A configuration that is generally better than the default parameters without being more expensive.'), HyperParameterTemplate(name='benchmark_rank1', version=1, parameters={'growing_strategy': 'BEST_FIRST_GLOBAL', 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}, description='Top ranking hyper-parameters on our benchmark slightly modified to run in reasonable time.')]


In [717]:
models = {
    #'rf_default': tfdf.keras.RandomForestModel(),
    #'gbt_default': tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION),
    'gbt_tune1': tfdf.keras.GradientBoostedTreesModel(hyperparameter_template="benchmark_rank1",task = tfdf.keras.Task.REGRESSION),
    #'gbt_tune3': tfdf.keras.GradientBoostedTreesModel(num_trees=500,
    #                                                  growing_strategy="BEST_FIRST_GLOBAL",
    #                                                  max_depth=8,
    #                                                  split_axis="SPARSE_OBLIQUE",
    #                                                  categorical_algorithm="RANDOM",
    #                                                  early_stopping="LOSS_INCREASE"
    #                                                  ),
    }

In [718]:
train_df, test_df = train_test_split(train)
train_tf = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label=label)
test_tf = tfdf.keras.pd_dataframe_to_tf_dataset(test_df, label=label)

gbt_tune1
Running fold 1
Running fold 2
Running fold 3
Running fold 4
Running fold 5
Running fold 6
Running fold 7
Running fold 8
Running fold 9
Running fold 10
Cross-validated Score: 82.24455947875977 for model: gbt_tune1


# Test Best Models

In [1]:
train_df, test_df = train_test_split(train, random_state=4222, test_size=0.1)
train_tf = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label=label, task=tfdf.keras.Task.REGRESSION)
test_tf = tfdf.keras.pd_dataframe_to_tf_dataset(test_df, label=label, task=tfdf.keras.Task.REGRESSION)

NameError: name 'train_test_split' is not defined

In [720]:
evaluation = {}
for key in models:
    print(key)
    #Evaluate: metric logsloss: BinaryCrossentropy
    models[key].compile(metrics=["mse"])
    
    #with sys_pipes():
    models[key].fit(x=train_tf)
    evaluation[key] = models[key].evaluate(test_tf, return_dict=True)

gbt_tune1


In [212]:
evaluation = {}
for key in models:
    print(key)
    #Evaluate: metric logsloss: BinaryCrossentropy
    models[key].compile(metrics=["mse"])
    
    #with sys_pipes():
    models[key].fit(x=train_tf)
    evaluation[key] = models[key].evaluate(test_tf, return_dict=True)

gbt_default
gbt_tune1


# Train Final Model

In [591]:
#test =  test.drop('popularity')

In [592]:
train_tf = tfdf.keras.pd_dataframe_to_tf_dataset(train, label=label, task=tfdf.keras.Task.REGRESSION)

In [593]:
models['gbt_tune1']

<tensorflow_decision_forests.keras.GradientBoostedTreesModel at 0x7f7300560c70>

In [594]:
# A more complex, but possibly, more accurate model.
model = models['gbt_tune1']

model.compile(metrics=["mse"])
model.fit(train_tf)



<tensorflow.python.keras.callbacks.History at 0x7f73006e8910>

# Predictions

In [595]:
predictions = tfdf.keras.pd_dataframe_to_tf_dataset(test, task=tfdf.keras.Task.REGRESSION)

In [596]:
scores = model.predict(predictions)

In [597]:
scores

array([[ 2.3010674],
       [13.626199 ],
       [27.31776  ],
       ...,
       [-1.6375866],
       [14.879572 ],
       [ 5.4220276]], dtype=float32)

In [598]:
submission[label] = scores

In [599]:
submission = submission[['id','popularity_song']]

In [600]:
submission = submission.rename(columns={'popularity_song':'popularity'})

In [601]:
submission

Unnamed: 0,id,popularity
0,21594,2.301067
1,23733,13.626199
2,17440,27.317760
3,20756,30.838421
4,29495,22.767576
...,...,...
8995,27490,23.687746
8996,22759,25.410677
8997,1853,-1.637587
8998,1658,14.879572


In [602]:
submission.to_csv('submission.csv', index=False)

In [603]:
#temp = pd.read_csv('submission.csv')
#temp

In [604]:
!kaggle competitions submit -c sliced-s01e08-KJSEks -f submission.csv -m "Message"

100%|█████████████████████████████████████████| 135k/135k [00:00<00:00, 229kB/s]
Successfully submitted to SLICED s01e08