In [1]:
! pip install --no-deps autoxgb

Collecting autoxgb
  Downloading autoxgb-0.2.2-py3-none-any.whl (20 kB)
Installing collected packages: autoxgb
Successfully installed autoxgb-0.2.2


In [2]:
import warnings

from pathlib import Path
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('ggplot')

from autoxgb import AutoXGB

data = Path("/kaggle/input/song-popularity-prediction/")
train_df = pd.read_csv(data/'train.csv').sample(frac=1)
train_df = train_df.astype({'key': 'category', 'audio_mode': 'category'})

test_df = pd.read_csv(data/'test.csv')
test_df = test_df.astype({'key': 'category', 'audio_mode': 'category'})

In [3]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(random_state=0, max_iter=10, initial_strategy='mean')

train_im = pd.DataFrame(imputer.fit_transform(train_df))
test_im = pd.DataFrame(imputer.fit_transform(test_df))

train_im.columns = train_df.columns
test_im.columns = test_df.columns

train_df = train_im
test_df = test_im

working_dir = Path('/kaggle/working')

train_df.to_csv(working_dir/'train_impu.csv', index=False)
test_df.to_csv(working_dir/'test_impu.csv', index=False)

In [4]:
###############################################################################
### required parameters
###############################################################################

# path to training data

train_filename = '/kaggle/working/train_impu.csv'
# path to output folder to store artifacts
output =  '/kaggle/working/auto_xgb'

###############################################################################
### optional parameters
###############################################################################

# path to test data. if specified, the model will be evaluated on the test data
# and test_predictions.csv will be saved to the output folder
# if not specified, only OOF predictions will be saved
# test_filename = "test.csv"
test_filename ='/kaggle/working/test_impu.csv'

# task: classification or regression
# if not specified, the task will be inferred automatically
# task = "classification"
# task = "regression"
task = "classification"

# an id column
# if not specified, the id column will be generated automatically with the name `id`
# idx = "id"
idx = 'id'

# target columns are list of strings
# if not specified, the target column be assumed to be named `target`
# and the problem will be treated as one of: binary classification, multiclass classification,
# or single column regression
# targets = ["target"]
# targets = ["target1", "target2"]
targets = ['song_popularity']

# features columns are list of strings
# if not specified, all columns except `id`, `targets` & `kfold` columns will be used
# features = ["col1", "col2"]
features = ['song_duration_ms', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'audio_mode',
       'speechiness', 'tempo', 'time_signature', 'audio_valence']

# categorical_features are list of strings
# if not specified, categorical columns will be inferred automatically
# categorical_features = ["col1", "col2"]
# categorical_features = ['key','audio_mode']

# use_gpu is boolean
# if not specified, GPU is not used
# use_gpu = True
# use_gpu = False
use_gpu = True

# number of folds to use for cross-validation
# default is 5
num_folds = 5

# random seed for reproducibility
# default is 42
seed = 42

# number of optuna trials to run
# default is 1000
# num_trials = 1000
num_trials = 10

# time_limit for optuna trials in seconds
# if not specified, timeout is not set and all trials are run
# time_limit = None
time_limit = 600

# if fast is set to True, the hyperparameter tuning will use only one fold
# however, the model will be trained on all folds in the end
# to generate OOF predictions and test predictions
# default is False
# fast = False
fast = False

In [5]:
# Now its time to train the model!
axgb = AutoXGB(
    train_filename=train_filename,
    output=output,
    test_filename=test_filename,
    task=task,
    idx=idx,
    targets=targets,
    features=features,
#     categorical_features=categorical_features,
    use_gpu=use_gpu,
    num_folds=num_folds,
    seed=seed,
    num_trials=num_trials,
    time_limit=time_limit,
    fast=fast,
)
axgb.train()

2022-01-31 11:50:31,839 INFO Output directory: /kaggle/working/auto_xgb
2022-01-31 11:50:31,841 INFO Reading training data
2022-01-31 11:50:31,977 INFO Mem. usage decreased to 1.22 Mb (73.3% reduction)
2022-01-31 11:50:31,983 INFO Problem type: binary_classification
2022-01-31 11:50:32,025 INFO Mem. usage decreased to 0.29 Mb (73.2% reduction)
2022-01-31 11:50:32,026 INFO Creating folds
2022-01-31 11:50:32,054 INFO Encoding target(s)
2022-01-31 11:50:32,070 INFO Found 0 categorical features.
2022-01-31 11:50:32,235 INFO Model config: train_filename='/kaggle/working/train_impu.csv' test_filename='/kaggle/working/test_impu.csv' idx='id' targets=['song_popularity'] problem_type=<ProblemType.binary_classification: 1> output='/kaggle/working/auto_xgb' features=['song_duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'audio_mode', 'speechiness', 'tempo', 'time_signature', 'audio_valence'] num_folds=5 use_gpu=True seed=42 categorical_fe

In [6]:
test = pd.read_csv('/kaggle/working/auto_xgb/test_predictions.csv')
test

Unnamed: 0,id,0.0,1.0
0,0.0,0.593333,0.406667
1,1.0,0.530230,0.469770
2,2.0,0.729033,0.270967
3,3.0,0.740418,0.259582
4,4.0,0.632585,0.367415
...,...,...,...
9995,9990.0,0.651171,0.348829
9996,10000.0,0.689081,0.310919
9997,10000.0,0.614696,0.385304
9998,10000.0,0.583504,0.416496


In [7]:
test = pd.read_csv('/kaggle/working/auto_xgb/test_predictions.csv')
import numpy as np
sample_submission = pd.read_csv("../input/song-popularity-prediction/sample_submission.csv")
sample_submission['song_popularity'] = test['1.0']
sample_submission
# test['predictions'] = np.where(test['1.0'] >= 0.5 , 1,0)
# test.hist()

sample_submission.to_csv("submission.csv", index=False)

In [8]:
sample_submission

Unnamed: 0,id,song_popularity
0,0,0.406667
1,1,0.469770
2,2,0.270967
3,3,0.259582
4,4,0.367415
...,...,...
9995,9995,0.348829
9996,9996,0.310919
9997,9997,0.385304
9998,9998,0.416496
