In [1]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [2]:
###########################################
#             Preparing Data              #
###########################################

file = open('C:/Users/artur/Downloads/database.csv')
original_file = file.read()

rowsplit_data = original_file.splitlines()
data_examples = np.array([rows.split(',') for rows in rowsplit_data])
data_attributes, data_examples = data_examples[0], data_examples[1:]

print('Original')
print('data_examples shape:', data_examples.shape)
print('data_attributes shape:', data_attributes.shape, end='\n\n')
print(*enumerate(data_attributes), sep='\n', end='\n\n\n')


##################################
#     Forming 'year' feature     #
##################################

def isdigit(s):
    try: 
        int(s)
        return True
    except ValueError:
        return False

years = np.array([year[6:] for year in data_examples[:, 0]])
indicis = []
for i, year in enumerate(years):
    if not isdigit(year):
        indicis.append(i)

data_examples = np.delete(data_examples, indicis, 0)
years = np.delete(years, indicis, 0)

years = np.array([int(year) for year in years])


##################################
#         Clearing Data          #
##################################

unneeded_info = [0, 1, 2, 3, 6, 7, 8, 10, 11, 12, 13, 14, 16]

target = np.array([float(d) for d in data_examples[:, 8]])

latitude = np.array([float(lat) for lat in data_examples[:, 2]])
longitude = np.array([float(lon) for lon in data_examples[:, 3]])

sin_latitude = np.sin(latitude)
sin_longitude = np.sin(longitude)

cos_latitude = np.cos(latitude)
cos_longitude = np.cos(longitude)

data_examples = np.delete(data_examples, unneeded_info, 1)
data_attributes = np.delete(data_attributes, unneeded_info)

data_attributes = np.array(('Years', 'Sin_latitude', 'Sin_longitude', 'Cos_latitude', 'Cos_longitude', *data_attributes))
data_examples = np.column_stack((years, sin_latitude, sin_longitude, cos_latitude, cos_longitude, data_examples))

coder = LabelEncoder()

data_examples[:, 5] = coder.fit_transform(data_examples[:, 5])
data_examples[:, 7] = coder.fit_transform(data_examples[:, 7])
data_examples[:, 8] = coder.fit_transform(data_examples[:, 8])
data_examples[:, 9] = coder.fit_transform(data_examples[:, 9])
data_examples[:, 10] = coder.fit_transform(data_examples[:, 10])
data_examples[:, 11] = coder.fit_transform(data_examples[:, 11])
data_examples[:, 12] = coder.fit_transform(data_examples[:, 12])

print('Cleaned')
print('data_examples shape:', data_examples.shape)
print('data_attributes shape:', data_attributes.shape, end='\n\n')
print(*enumerate(data_attributes), sep='\n')

Original
data_examples shape: (23412, 21)
data_attributes shape: (21,)

(0, 'Date')
(1, 'Time')
(2, 'Latitude')
(3, 'Longitude')
(4, 'Type')
(5, 'Depth')
(6, 'Depth Error')
(7, 'Depth Seismic Stations')
(8, 'Magnitude')
(9, 'Magnitude Type')
(10, 'Magnitude Error')
(11, 'Magnitude Seismic Stations')
(12, 'Azimuthal Gap')
(13, 'Horizontal Distance')
(14, 'Horizontal Error')
(15, 'Root Mean Square')
(16, 'ID')
(17, 'Source')
(18, 'Location Source')
(19, 'Magnitude Source')
(20, 'Status')


Cleaned
data_examples shape: (23409, 13)
data_attributes shape: (13,)

(0, 'Years')
(1, 'Sin_latitude')
(2, 'Sin_longitude')
(3, 'Cos_latitude')
(4, 'Cos_longitude')
(5, 'Type')
(6, 'Depth')
(7, 'Magnitude Type')
(8, 'Root Mean Square')
(9, 'Source')
(10, 'Location Source')
(11, 'Magnitude Source')
(12, 'Status')


In [14]:
model = XGBRegressor(nthread=-1)
model.fit(X=data_examples, y=target)

mean_squared_error(target, model.predict(data_examples))

0.1455959475973734

In [13]:
######################################
#   Here error was almost the same   #
######################################

# parameters = {'gamma':[i/10.0 for i in range(3,6)], 
#               'subsample':[i/10.0 for i in range(6,9)],
#               'colsample_bytree':[i/10.0 for i in range(6,9)], 
#               'max_depth': [2,3,4]}

# clf = GridSearchCV(model, parameters)
# clf.fit(X=data_examples, y=target)

# mean_squared_error(target, clf.best_estimator_.predict(data_examples))