In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import random
import numpy as np

# Render DataFrames without truncating rows or columns.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Seed the stdlib `random` module and NumPy's global RNG.
# NOTE(review): the two seed values differ (1678839 vs 16783983) — confirm this is intended.
random.seed(1678839)
np.random.seed(16783983)

In [2]:
#train=pd.read_csv("original datasets/train.csv")
#test=pd.read_csv("original datasets/test.csv")

In [3]:
# Load the train/test splits from the "combined_interpol_knn3" dataset folder.
# NOTE(review): the folder name suggests an interpolated + KNN-imputed variant of the
# original CSVs — confirm against the preprocessing notebook.
train=pd.read_csv("datasets/combined_interpol_knn3/train.csv")
test=pd.read_csv("datasets/combined_interpol_knn3/test.csv")

In [4]:
# Distinct galaxy labels seen in each split.
galaxies_train = train.galaxy.unique().tolist()
galaxies_test = test.galaxy.unique().tolist()

# Galaxies present in BOTH splits, in order of first appearance in `train`.
# Iterating a set intersection has no stable order (for string labels it can change
# between interpreter sessions), and the downstream per-galaxy train/validation
# splits draw from the global NumPy RNG in iteration order — so an unordered
# list makes results irreproducible even with fixed seeds.
_galaxies_test_lookup = set(galaxies_test)
common_galaxies = [galaxy for galaxy in galaxies_train if galaxy in _galaxies_test_lookup]

In [5]:
def divide_galaxies(common_galaxies, train):
    """Split galaxies by whether their target rose between first and last year.

    For every galaxy, the rows of ``train`` belonging to it are ordered by
    ``'galactic year'`` and the ``'y'`` value of the last row is compared with
    the ``'y'`` value of the first row.

    Parameters
    ----------
    common_galaxies : iterable
        Galaxy labels to classify; each must occur in ``train.galaxy``.
    train : pandas.DataFrame
        Frame with at least the columns ``'galaxy'``, ``'galactic year'`` and
        ``'y'`` (in any column order).

    Returns
    -------
    (list, list)
        ``(increasing_galaxies, decreasing_galaxies)``. A galaxy whose last
        ``y`` is strictly greater than its first ``y`` is "increasing"; every
        other galaxy (including flat or NaN comparisons) is "decreasing".

    Raises
    ------
    IndexError
        If a galaxy has no rows in ``train``.
    """
    increasing_galaxies = []
    decreasing_galaxies = []

    for galaxy in common_galaxies:
        history = train.loc[train.galaxy == galaxy, :].sort_values('galactic year')

        # Select 'y' by name rather than by "last column" position, so the
        # function does not depend on the column layout of `train`.
        y_over_time = history['y']
        first_y = y_over_time.iloc[0]
        last_y = y_over_time.iloc[-1]

        if last_y > first_y:
            increasing_galaxies.append(galaxy)
        else:
            decreasing_galaxies.append(galaxy)

    return increasing_galaxies, decreasing_galaxies
            

In [None]:
def ml_loop(galaxy_data, test_data, param, test_size=0.2,
            early_stopping_rounds=1, random_state=None):
    """Fit an XGBoost regressor on one galaxy and predict that galaxy's test rows.

    The galaxy's rows are split into a training and a validation part; the
    validation part drives early stopping.

    Parameters
    ----------
    galaxy_data : pandas.DataFrame
        Training rows; must contain the columns ``'y'`` (target) and
        ``'galaxy'``. All remaining columns are used as features.
    test_data : pandas.DataFrame
        Rows to predict; must contain ``'galaxy'``. It is not modified.
    param : dict
        Keyword arguments forwarded to ``xgb.XGBRegressor``.
    test_size : float, default 0.2
        Fraction of ``galaxy_data`` held out for validation.
    early_stopping_rounds : int, default 1
        Rounds without validation improvement before training stops.
    random_state : int or None, default None
        Seed for the train/validation shuffle. ``None`` keeps the previous
        behaviour of drawing from NumPy's global RNG.

    Returns
    -------
    (pandas.DataFrame, dict)
        ``test_data`` without ``'galaxy'`` and with the predictions appended
        as column ``'y'``, and the ``evals_result()`` history, where
        ``'validation_0'`` is the training split and ``'validation_1'`` the
        held-out split.
    """
    labels = galaxy_data['y']
    features = galaxy_data.drop(columns=['y', 'galaxy'])
    test_features = test_data.drop(columns='galaxy')

    X_train, X_valid, Y_train, Y_valid = train_test_split(
        features,
        labels,
        test_size=test_size,
        shuffle=True,
        random_state=random_state,
    )

    model = xgb.XGBRegressor(**param)
    # NOTE(review): newer XGBoost releases deprecate passing `eval_metric` and
    # `early_stopping_rounds` to fit() in favour of the constructor — revisit
    # when upgrading the library.
    model.fit(
        X_train,
        Y_train,
        eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
        eval_metric='rmse',
        early_stopping_rounds=early_stopping_rounds,
        verbose=False,
    )
    progress = model.evals_result()

    # After early stopping the sklearn wrapper's predict() already limits
    # itself to the best iteration, so the removed-in-2.0 `ntree_limit` /
    # `best_ntree_limit` pair is not needed.
    y_pred = model.predict(test_features)

    return test_features.assign(y=y_pred), progress
    

In [None]:
def train_f(common_galaxies, train, test, param):
    """Train one model per galaxy and assemble the predictions for ``test``.

    Parameters
    ----------
    common_galaxies : iterable
        Galaxy labels to process; each is expected to have rows in both
        ``train`` and ``test``.
    train, test : pandas.DataFrame
        Frames with a ``'galaxy'`` column; ``train`` also carries ``'y'``.
    param : dict
        Keyword arguments passed through ``ml_loop`` to ``xgb.XGBRegressor``.

    Returns
    -------
    (pandas.DataFrame, dict)
        The predicted test rows of all processed galaxies (index taken from
        ``test``, sorted by index, column ``'y'`` holding the prediction), and
        a dict mapping galaxy -> ``[train_rmse, eval_rmse]`` taken from the
        last recorded boosting round.
    """
    performance = {}
    predicted_galaxies = []

    # ml_loop drops 'galaxy'; put it back where it sits in `test` so the
    # result keeps test's column order (previously hard-coded to position 1).
    galaxy_position = test.columns.get_loc('galaxy')

    for galaxy in common_galaxies:
        train_subset = train.loc[train.galaxy == galaxy, :]
        test_subset = test.loc[test.galaxy == galaxy, :]

        test_subset_predicted, progress = ml_loop(train_subset, test_subset, param)

        performance[galaxy] = [
            progress['validation_0']['rmse'][-1],  # training split
            progress['validation_1']['rmse'][-1],  # held-out split
        ]

        test_subset_predicted.insert(galaxy_position, 'galaxy', galaxy)
        predicted_galaxies.append(test_subset_predicted)

    if not predicted_galaxies:
        # pd.concat([]) raises ValueError; return an empty, correctly shaped
        # frame instead so an empty galaxy group does not abort the run.
        empty_predictions = test.iloc[:0].assign(y=pd.Series(dtype=float))
        return empty_predictions, performance

    test_predicted = pd.concat(predicted_galaxies)

    return test_predicted.sort_index(), performance
            

In [None]:
# Partition the shared galaxies by whether y rose between their first and last galactic year.
# NOTE(review): the trailing "increasing == bad" remark is the original author's; its
# domain meaning is not visible in this notebook.
increasing_galaxies, decreasing_galaxies = divide_galaxies(common_galaxies, train) #increasing == bad

In [None]:
# Keyword arguments for xgb.XGBRegressor (unpacked in ml_loop) used for the
# "increasing" galaxy group. Differs from paramDecreasing only in
# 'importance_type'.
paramIncreasing = {
    'max_depth': 25,
    'n_estimators': 5000,  # upper bound; early stopping in ml_loop usually ends sooner
    'gamma': 0,
    'eta': 0.3,
    'objective': 'reg:squarederror',
    'min_child_weight': 0.1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'importance_type': 'weight',
    'subsample': 1,
    'lambda': 0.01,
    'num_parallel_tree': 1,
    'random_state': 12,
}

In [None]:
# Fit one model per "increasing" galaxy; returns the predicted test rows and a
# galaxy -> [train_rmse, eval_rmse] dict.
test_pred_Inc, performance_Inc = train_f(increasing_galaxies, train, test, paramIncreasing)

In [None]:
# Turn the galaxy -> [train_rmse, eval_rmse] dict into a table, worst validation
# RMSE first.
# NOTE: this rebinds `performance_Inc` from dict to DataFrame, so the cell can
# only be run once per train_f() call.
performance_Inc = (
    pd.DataFrame
    .from_dict(performance_Inc, orient='index', columns=['train_rmse', 'eval_rmse'])
    .sort_values('eval_rmse', ascending=False)
)

In [None]:
# NOTE(review): head(-1) returns every row EXCEPT the last one (after the descending
# sort, that is the galaxy with the lowest eval_rmse) — confirm this is intended.
performance_Inc.head(-1)

In [None]:
# Summary statistics of train/eval RMSE across the "increasing" galaxies.
performance_Inc.describe()

In [None]:
# Keyword arguments for xgb.XGBRegressor (unpacked in ml_loop) used for the
# "decreasing" galaxy group. Differs from paramIncreasing only in
# 'importance_type'.
paramDecreasing = {
    'max_depth': 25,
    'n_estimators': 5000,  # upper bound; early stopping in ml_loop usually ends sooner
    'gamma': 0,
    'eta': 0.3,
    'objective': 'reg:squarederror',
    'min_child_weight': 0.1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'importance_type': 'cover',
    'subsample': 1,
    'lambda': 0.01,
    'num_parallel_tree': 1,
    'random_state': 12,
}

In [None]:
# Fit one model per "decreasing" galaxy; returns the predicted test rows and a
# galaxy -> [train_rmse, eval_rmse] dict.
test_pred_Dec, performance_Dec = train_f(decreasing_galaxies, train, test, paramDecreasing)

In [None]:
# Turn the galaxy -> [train_rmse, eval_rmse] dict into a table, worst validation
# RMSE first.
# NOTE: this rebinds `performance_Dec` from dict to DataFrame, so the cell can
# only be run once per train_f() call.
performance_Dec = (
    pd.DataFrame
    .from_dict(performance_Dec, orient='index', columns=['train_rmse', 'eval_rmse'])
    .sort_values('eval_rmse', ascending=False)
)

In [None]:
# NOTE(review): head(-1) returns every row EXCEPT the last one (after the descending
# sort, that is the galaxy with the lowest eval_rmse) — confirm this is intended.
performance_Dec.head(-1)

In [None]:
# Summary statistics of train/eval RMSE across the "decreasing" galaxies.
performance_Dec.describe()

In [None]:
###########################################################################################

In [None]:
# Work on a copy so the loaded `test` frame stays untouched while predictions are merged in.
test_pr=test.copy()

In [None]:
# Placeholder prediction for every test row; rows whose galaxy was not modelled keep it.
# Initialised as a float so the float predictions written later do not force an
# int -> float upcast of the column (newer pandas versions warn about that).
test_pr['y'] = 0.0

In [None]:
# Write the "decreasing" predictions into the result frame. The index of
# test_pred_Dec holds row LABELS of `test`, so use label-based .loc (not positional
# .iloc) and touch only the 'y' column instead of overwriting whole rows by position.
test_pr.loc[test_pred_Dec.index, 'y'] = test_pred_Dec['y']

In [None]:
# Write the "increasing" predictions into the result frame. The index of
# test_pred_Inc holds row LABELS of `test`, so use label-based .loc (not positional
# .iloc) and touch only the 'y' column instead of overwriting whole rows by position.
test_pr.loc[test_pred_Inc.index, 'y'] = test_pred_Inc['y']

In [None]:
# NOTE(review): head(-1) shows all rows but the last; with display.max_rows=None this
# renders nearly the whole frame — consider head() if only a preview is wanted.
test_pr.head(-1)

In [None]:
# Original test frame, for side-by-side comparison with test_pr.
# NOTE(review): head(-1) shows all rows but the last — see the remark on test_pr.head(-1).
test.head(-1)

In [None]:
# Persist the test rows with the predicted 'y' column; the DataFrame index is not written.
test_pr.to_csv("test_predicted.csv", index=False)