In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from tqdm import tqdm

In [2]:
data = pd.read_csv("basic_audio.csv")
Y = data.year;
X = data.drop('year', axis = 1)
X = X.drop('idx', axis = 1)

In [3]:
# "Floor" the years
for item in tqdm(range(0, len(Y))):
    Y.iloc[item] = int(Y.iloc[item] / 10) * 10

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
100%|██████████| 7293/7293 [02:56<00:00, 41.31it/s]


In [4]:
random_state = 100
np.random.seed(random_state)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.2, random_state = random_state)

In [5]:
learning_range = [.01]
alpha_range = np.arange(.1, .9, .1)
lamda_range = np.arange(.1, .5, .1)
estimators_range = [25, 50]
max_depth_range = [20, 30]

params = dict(reg_alpha = alpha_range,
    reg_lambda = lamda_range,
    max_depth = max_depth_range,
    learning_rate = learning_range,
    n_estimators = estimators_range,
)

In [6]:
model = xgb.XGBClassifier(random_state = random_state)
grid_search = GridSearchCV(model, params, cv=5, verbose=1, n_jobs = 6)
grid_search.fit(x_train, y_train)
output = grid_search.predict(x_test)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   19.8s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:  1.6min
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:  4.1min
[Parallel(n_jobs=6)]: Done 640 out of 640 | elapsed:  6.6min finished


In [7]:
print(grid_search.best_params_)

{'learning_rate': 0.01, 'max_depth': 20, 'n_estimators': 50, 'reg_alpha': 0.5, 'reg_lambda': 0.4}


In [8]:
output = grid_search.predict(x_test)

In [9]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, output)

0.9280328992460589

In [10]:
output = grid_search.predict(X)

In [11]:
print(output)

[1990. 1960. 2000. ... 2000. 1990. 2000.]


In [12]:
accuracy_score(Y, output)

0.9635266694090223

In [18]:
from sklearn.metrics import mean_squared_error
mean_squared_error(Y, output) 

16.495269436445906

In [13]:
output_prob = grid_search.predict_proba(X)

In [14]:
output_year = pd.DataFrame(output, columns = ['year_predict'])
prob_df = pd.DataFrame(output_prob)

In [15]:
combined_output = prob_df.join(output_year)

In [16]:
print(combined_output)

             0         1         2         3         4         5         6  \
0     0.073057  0.073096  0.073097  0.073176  0.073114  0.073381  0.073323   
1     0.085356  0.085402  0.085403  0.085496  0.224500  0.085735  0.085667   
2     0.073113  0.073152  0.073153  0.073232  0.073170  0.073437  0.073379   
3     0.071891  0.071929  0.071930  0.072009  0.071947  0.090648  0.331908   
4     0.073257  0.073296  0.073297  0.073377  0.073314  0.073582  0.073523   
5     0.073257  0.073296  0.073297  0.073377  0.073314  0.073582  0.073523   
6     0.073241  0.073281  0.073281  0.073361  0.073299  0.073567  0.338143   
7     0.073257  0.073296  0.073297  0.073377  0.073314  0.073582  0.073523   
8     0.072487  0.072526  0.072527  0.072606  0.072544  0.074150  0.072751   
9     0.072009  0.072047  0.072048  0.072126  0.072065  0.072328  0.073521   
10    0.073268  0.073307  0.100961  0.073387  0.073325  0.075916  0.290845   
11    0.073254  0.073293  0.073294  0.073374  0.073311  0.073579

In [17]:
combined_output.to_csv("xgboost_basic_audio.csv", index = None, header=True)