In [32]:
import sys
import time
import pandas as pd
from xgboost import XGBRegressor
from matplotlib import pyplot as plt
import os
import pickle
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
start_time = time.time()

d_set = "294_satellite_image.tsv"
s_factor = 0.25
model_file = "xgboost_model.pkl"
cv_split = 5

regressor = Pipeline([
  ('regression', XGBRegressor(warm_start=True))
])

parameters = [{ \
    #'regression__max_depth': [1, 2, 3, 4, 5], \
    #'regression__n_estimators': [10, 50, 100, 200], \
    #'regression__booster': ['gbtree', 'gblinear', 'dart'], \
    'regression__random_state': [3111696] \
}]

dataset_accuracies = list()
r2_scores = list()

data_path = "data/" + d_set
df = pd.read_csv(data_path, sep="\t")

# randomize the dataframe
df.sample(frac=1)

# divide datasets
n_rows = len(df)
test_dataset = df[: 200]
shortened_dataset1 = df[200: int(s_factor*n_rows)]
shortened_dataset2 = df[int(s_factor*n_rows):]

def train_model(dataset_frame, retrain):
    label = dataset_frame["target"].copy()
    data = dataset_frame.drop("target", axis=1)
    print("Dataset size: %d" % len(label))
    if retrain is False:
        optimized_regressor = GridSearchCV(regressor, parameters, \
            cv=KFold(n_splits=cv_split, shuffle=True, random_state=3111696), \
            error_score=0,
            scoring='r2',
            refit=True
        )
        optimized_regressor.fit(data, label)
        best_regressor = optimized_regressor.best_estimator_
        with open(model_file, "wb") as output_handler:
            pickle.dump(best_regressor, output_handler, pickle.HIGHEST_PROTOCOL)
    else:
        with open(model_file, 'rb') as model_handler:
            best_regressor = pickle.load(model_handler)
        best_regressor.fit(data, label)
    
    # evaluate on test data
    test_label = test_dataset["target"].copy()
    test_data = test_dataset.drop("target", axis=1)
    predictions = best_regressor.predict(test_data)

    r_squared_score = r2_score(test_label, predictions)
    print("Test R-square: %0.2f" % r_squared_score)
    
print("Start first training...")
train_model(shortened_dataset2, False)
print('Training finished')
print("")
print("Start training with new dataset...")
train_model(shortened_dataset1, True)
print('Re-training finished')

end_time = time.time()
print('Total time taken: %d seconds' % int(end_time - start_time))

Start first training...
Dataset size: 4827
Test R-square: 0.52
Training finished

Start training with new dataset...
Dataset size: 1408
Test R-square: 0.28
Re-training finished
Total time taken: 1 seconds
