In [48]:
# Global imports
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm # We will use svm for that
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [49]:
# Internal imports
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from internal.config import config
from internal.dataset_generator import csv_interactor_with_features

In [50]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))


In [51]:
def change_dataset(dataset):
    N_rows = len(dataset)
    N_cols = len(dataset.columns)
    dataset.columns = range(len(dataset.columns))
    first_part = np.array(dataset.iloc[:, :(N_cols // 2)])
    second_part = np.array(dataset.iloc[:, (N_cols // 2):])
    changed_dataset = np.array(np.zeros(first_part.shape))
    
    for i in range(first_part.shape[0]):
        for j in range(first_part.shape[1]):
            if first_part[i,j] > 0 and second_part[i,j] > 0:
                changed_dataset[i, j] = 1.0
            elif second_part[i,j] == 0 and first_part[i, j] == 0:
                changed_dataset[i, j] = 0.0
            else:
                changed_dataset[i, j] = -1.0 
    dataset = pd.DataFrame(changed_dataset)
    return dataset

In [52]:
def read_preprocess_dataset(number_of_rows, number_of_features, max_columns=44541):
    offset = int((max_columns - 3)/2) + 1 #The offset on which we need to move to delete 2nd title
    print(offset)
    inputs = pd.read_csv(config.dataset_with_features_path(), header=None, nrows=number_of_rows, 
                          usecols=list(range(1,number_of_features+1)) + \
                          list(range(offset+1, offset+number_of_features+1)) + [max_columns-1])
    inputs = inputs.sample(frac=1)
    outputs = inputs.loc[:, max_columns-1].to_numpy().astype(np.float32)
    inputs.drop(columns=[max_columns-1], inplace=True)
    inputs = change_dataset(inputs)               
    return (inputs, outputs)

In [53]:
# Reading the dataset
number_of_features = 1000 # number of features after change_dataset
number_of_rows = 10000
inputs, outputs = read_preprocess_dataset(number_of_rows, number_of_features)

22270


In [54]:
#data scaling, mandatory for SVM
std_scaler = StandardScaler()
Stand_Sc = std_scaler.fit_transform(inputs)

In [55]:
# Training our model
params = {'subsample': 0.8, 'min_child_weight': 5, 'max_depth': 5, 'gamma': 10, 'colsample_bytree': 0.8}
errors = []
for i in range(5000, 5000, 50000):
    train_fraction = 0.2
    X_train, X_test, y_train, y_test = train_test_split(inputs[:i], outputs[:i], test_size=train_fraction)
    print(len(X_train))
    clf = XGBRegressor(**params)
    start_time = timer(None)
    clf.fit(X_train, y_train)
    timer(start_time) 
    y_pred = clf.predict(X_test)
    print(mean_squared_error(y_pred, y_test), mean_squared_error(clf.predict(X_train), y_train), 
          mean_squared_error(clf.predict(inputs[i:]), outputs[i:]))

4000

 Time taken: 0 hours 0 minutes and 6.25 seconds.
0.6176883417004281 0.3101392100506049 0.5756502901382156


In [56]:
# Training our model
params = {'subsample': 0.8, 'min_child_weight': 5, 'max_depth': 5, 'gamma': 10, 'colsample_bytree': 0.8}
errors = []
for i in range(5000, 40001, 50000):
    train_fraction = 0.2
    X_train, X_test, y_train, y_test = train_test_split(inputs[:i], outputs[:i], test_size=train_fraction)
    print(len(X_train))
    clf = RandomForestRegressor()
    start_time = timer(None)
    clf.fit(X_train, y_train)
    timer(start_time) 
    y_pred = clf.predict(X_test)
    print(mean_squared_error(y_pred, y_test), mean_squared_error(clf.predict(X_train), y_train), 
          mean_squared_error(clf.predict(inputs[i:]), outputs[i:]))

4000

 Time taken: 0 hours 0 minutes and 13.23 seconds.
0.6138261151757328 0.09221876699654362 0.5622125762117824


In [59]:
print(sorted(clf.feature_importances_))

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [113]:
for i,j in zip(y_pred, y_test):
    print(i, j)

3.6060047 2.0
1.8880942 2.0
2.733419 3.0
2.5973287 3.0
3.59813 4.0
1.974803 1.0
2.5741582 3.0
4.226492 4.0
2.9983518 4.0
2.7594357 3.0
3.197461 3.0
2.478446 2.0
2.2801292 2.0
2.7210584 3.0
2.2329588 2.0
3.8094785 5.0
1.8602926 2.0
3.6576495 4.0
3.4507933 3.0
4.584691 4.0
4.0002317 3.0
2.1319253 2.0
3.3062742 3.0
2.7764204 4.0
1.6420372 2.0
3.3635504 5.0
1.6586932 1.0
2.3346493 3.0
3.3723502 4.0
2.1129692 2.0
4.3044486 5.0
2.8108299 3.0
4.9488516 5.0
4.2820773 5.0
3.0398276 2.0
3.0490582 2.0
2.8946447 3.0
1.2213256 1.0
3.755696 3.0
1.6119981 1.0
3.3738008 3.0
3.4834087 3.0
2.1643243 3.0
2.4785438 2.0
2.5699825 2.0
2.825207 4.0
1.003238 2.0
3.037793 3.0
2.9981644 4.0
3.1515718 1.0
2.6038659 2.0
2.1894054 2.0
3.00308 4.0
2.872859 3.0
3.115599 4.0
4.254298 5.0
2.59895 3.0
3.9607754 3.0
1.5872366 1.0
1.3502702 1.0
2.4950762 3.0
1.4218048 1.0
2.434373 3.0
3.4969525 3.0
2.2271771 2.0
2.794819 2.0
3.3215675 5.0
3.0143216 3.0
3.1839585 4.0
1.4878354 3.0
2.8067083 3.0
4.919351 4.0
4.2130356 5.0


2.1313026 2.0
3.0380821 3.0
2.8742175 1.0
3.9962218 5.0
4.268556 5.0
2.333674 3.0
3.2170527 3.0
2.074114 1.0
4.1998324 3.0
3.9944742 3.0
2.7710512 2.0
3.07849 3.0
3.2195985 4.0
3.2748532 3.0
2.9325354 3.0
2.9661748 2.0
3.0959697 3.0
1.9758387 2.0
1.8746529 2.0
3.4012597 3.0
3.3985271 4.0
2.4524944 1.0
4.1299224 4.0
2.4430234 2.0
2.6875722 3.0
3.2545996 3.0
3.4236982 4.0
2.9119077 2.0
3.0428007 2.0
3.1090596 4.0
3.0121708 2.0
3.7527637 4.0
1.9241726 1.0
1.2713276 1.0
3.2600565 3.0
4.1860642 5.0
3.1864316 4.0
3.4393375 3.0
3.7692547 3.0
3.1221309 3.0
1.5531416 2.0
2.913312 3.0
2.593977 1.0
3.0097148 3.0
3.0306947 4.0
2.2481143 2.0
1.2719412 1.0
3.203743 2.0
2.931835 3.0
2.6597614 3.0
2.4048307 3.0
3.2709577 3.0
2.8570716 3.0
3.3225784 3.0
3.8355093 4.0
4.0023084 4.0
2.8855095 3.0
4.594269 5.0
3.5195923 3.0
2.6945128 2.0
2.9120347 3.0
2.7040038 3.0
1.8825551 1.0
3.7839992 5.0
2.4296439 2.0
3.58839 4.0
2.4682329 3.0
2.2703438 1.0
2.8424695 3.0
2.1849718 1.0
3.195039 3.0
3.0051363 4.0
4.147

3.9952087 3.0
2.7188668 3.0
3.695723 4.0
3.1910658 3.0
2.2783818 3.0
3.3619049 3.0
3.3137395 4.0
3.882178 1.0
2.96084 3.0
1.694426 2.0
2.9494665 2.0
2.2862597 2.0
2.8299825 4.0
2.6151292 3.0
3.063267 4.0
2.3210557 2.0
2.465502 2.0
2.7486587 4.0
2.1769643 2.0
1.7109028 2.0
2.5885746 2.0
1.9956212 2.0
3.1248398 4.0
3.1755996 2.0
2.5603704 1.0
3.1595445 4.0
3.1660686 3.0
4.0171843 5.0
1.806601 2.0
1.7856692 3.0
2.46962 2.0
3.1986094 3.0
2.8361948 3.0
2.789898 3.0
2.8997803 3.0
2.0602438 2.0
2.2828758 3.0
4.2989635 4.0
3.3945887 3.0
3.045709 4.0
3.9176943 4.0
3.1233053 4.0
2.7956839 3.0
2.944209 3.0
4.0905185 5.0
3.1066663 3.0
2.7278335 3.0
2.5215895 3.0
2.6712604 1.0
1.6807404 1.0
3.967357 3.0
3.4398615 3.0
2.6403036 3.0
3.5423331 4.0
2.836276 3.0
2.8235269 5.0
2.6919804 3.0
4.0123982 4.0
4.431856 6.0
4.8123 5.0
2.812691 2.0
3.2546303 4.0
2.3417146 2.0
2.824616 2.0
2.4556153 1.0
2.4704287 2.0
3.262345 4.0
2.160694 3.0
4.0980186 5.0
1.730012 2.0
3.150118 4.0
4.4971743 5.0
3.040783 3.0
2.69

In [114]:
y_pred.mean()

2.900883

# Grid Search

In [17]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [19]:
xgb = XGBRegressor(learning_rate=0.02, n_estimators=600,
                    silent=True, nthread=1)

In [23]:
folds = 2
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb,  n_jobs=-1,
                                   cv=skf.split(inputs,outputs), verbose=3, random_state=1001 )


start_time = timer(None)
random_search.fit(inputs, outputs)
timer(start_time) 

Fitting 2 folds for each of 5 candidates, totalling 10 fits
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 1/2] END colsample_bytree=0.8, gamma=1.5, max_depth=5, min_child_weight=1, subsample=0.6;, score=0.447 total time= 3.1min
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, 

In [24]:
print(random_search.best_params_)

{'subsample': 0.8, 'min_child_weight': 5, 'max_depth': 5, 'gamma': 1, 'colsample_bytree': 0.8}


In [26]:
best_estimator = random_search.best_estimator_

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 2/2] END colsample_bytree=1.0, gamma=5, max_depth=3, min_child_weight=5, subsample=1.0;, score=0.403 total time= 2.3min
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 1/2] END colsample_bytree=0.8, gamma=1, max_depth=5, min_child_weight=5, subsample=0.8;, score=0.448 total time= 3.2min
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bind

In [None]:
mean_squared_error.predict()

# Sandbox

In [91]:
a

Unnamed: 0,0,1,2,3,4
2,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0
