In [13]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import gc

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics, model_selection

from lightgbm import LGBMRegressor

%matplotlib inline

In [2]:
data = pd.read_csv('dataset/abalone.data',header=None,names=['Sex','Length','Diameter','Height','Whole-weight','Shucked-weight','Viscera-weight','Shell-weight','Rings'])
data.loc[(data['Sex']=="M"),'Sex']=0
data.loc[(data['Sex']=="F"),'Sex']=1
data.loc[(data['Sex']=="I"),'Sex']=2
n = len(data)+1
nlist = range(1,n)
data['id'] = nlist
# data['Rings'] = np.log(data['Rings'])
data

Unnamed: 0,Sex,Length,Diameter,Height,Whole-weight,Shucked-weight,Viscera-weight,Shell-weight,Rings,id
0,0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15,1
1,0,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7,2
2,1,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9,3
3,0,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10,4
4,2,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7,5
...,...,...,...,...,...,...,...,...,...,...
4172,1,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11,4173
4173,0,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10,4174
4174,0,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9,4175
4175,1,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10,4176


In [3]:
col_features = ['id','Length','Diameter','Height','Whole-weight','Shucked-weight','Viscera-weight','Shell-weight','Rings']
data = data[col_features]
# data_features = data[col_features]
# data_label = data['Rings']
# train_x,test_x,train_y,test_y = train_test_split(data_features,data_label,test_size=0.2,random_state=0)
train,test = train_test_split(data,test_size=0.2,random_state=0)

In [4]:
print('Train data shape', train.shape)
print('Test data shape', test.shape)

Train data shape (3341, 9)
Test data shape (836, 9)


In [5]:
y = np.log(train['Rings']+1).values.astype(np.float)
id_test = np.array(test['id'])

In [6]:
df = train.append(test, ignore_index=True)
del test, train
gc.collect()

print('Merged data shape', df.shape)

Merged data shape (4177, 9)


In [7]:
df.drop(labels=['id','Rings'],axis=1,inplace=True)
feature_list = df.columns.tolist()

In [8]:
df

Unnamed: 0,Length,Diameter,Height,Whole-weight,Shucked-weight,Viscera-weight,Shell-weight
0,0.180,0.135,0.080,0.0330,0.0145,0.0070,0.0100
1,0.215,0.150,0.055,0.0410,0.0150,0.0090,0.0125
2,0.660,0.530,0.170,1.3905,0.5905,0.2120,0.4530
3,0.715,0.525,0.200,1.8900,0.9500,0.4360,0.4305
4,0.595,0.455,0.155,1.0410,0.4160,0.2105,0.3650
...,...,...,...,...,...,...,...
4172,0.610,0.475,0.140,1.1330,0.5275,0.2355,0.3500
4173,0.410,0.325,0.120,0.3745,0.1580,0.0810,0.1250
4174,0.445,0.345,0.105,0.4090,0.1675,0.1015,0.1170
4175,0.540,0.435,0.180,0.9960,0.3835,0.2260,0.3250


In [9]:
print('train-test split')
df_train,df_test = df.iloc[:len(y)], df.iloc[len(y):]
del df
gc.collect()

print('train-validation split \n')
X = df_train.values
X_train,X_valid, y_train, y_valid = model_selection.train_test_split(X,y,test_size=0.2,random_state=42)
X_test = df_test.values

del df_train,df_test
gc.collect()

print('train shape',X_train.shape)
print('Validation shape',X_valid.shape)
print('Test shape',X_test.shape)

train-test split
train-validation split 

train shape (2672, 7)
Validation shape (669, 7)
Test shape (836, 7)


In [10]:
seed = 42

gbmr = LGBMRegressor(
#     exec_path='/path/to/your/LightGBM/lightgbm', # change this to your LighGBM path
    config='',
    application='regression',
    num_iterations=500,
    learning_rate=0.1,
    num_leaves=10,
    tree_learner='serial',
    num_threads=4,
    min_data_in_leaf=10,
    metric='l2',
    feature_fraction=1.0,
    feature_fraction_seed=seed,
    bagging_fraction=1.0,
    bagging_freq=0,
    bagging_seed=seed,
    metric_freq=1,
    early_stopping_round=10
)

In [11]:
gbmr.fit(X_train, y_train, test_data=[(X_valid, y_valid)])
print("Mean Square Error:", metrics.mean_absolute_error(y_true=(np.exp(y_valid)-1), y_pred=(np.exp(gbmr.predict(X_valid))-1)))

TypeError: fit() got an unexpected keyword argument 'test_data'