# **ENV**

In [1]:
# Install the pinned CatBoost release; %pip targets the environment of the running kernel.
%pip install -q catboost==0.24.4

[K     |████████████████████████████████| 65.7 MB 7.7 kB/s 
[?25h

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Copy the CSV data files from Google Drive into the working directory
# (alternatively, upload them directly).
!cp '/content/drive/MyDrive/Economic_Well_Being_Prediction/data/Train.csv' .
!cp '/content/drive/MyDrive/Economic_Well_Being_Prediction/data/Test.csv' .
!cp '/content/drive/MyDrive/Economic_Well_Being_Prediction/data/SampleSubmission.csv' .

# **IMPORTS**

In [5]:
import numpy as np
import pandas as pd

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge ,LinearRegression
from tqdm import tqdm_notebook

from sklearn.linear_model import LogisticRegression , Ridge
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import KFold,StratifiedKFold ,GroupKFold
from sklearn.metrics import mean_squared_error

# Plot Packages
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-darkgrid')

import warnings
warnings.filterwarnings('ignore')

# **Modular Work**

In [9]:
class EconomicWellBeing:
    """Feature engineering, K-fold training, stacking and submission export.

    ``Train.csv`` and ``Test.csv`` are read from the current working directory
    every time :meth:`get_processed_data` is called (directly or through the
    other methods), so no state has to be prepared before calling any method.
    """

    # Columns that are never handed to the models.
    NON_FEATURE_COLUMNS = ('ID', 'country', 'urban_or_rural', 'Target', 'year')

    # Default range applied to submitted predictions.
    # NOTE(review): the original comment described these as "8 percentiles and
    # max predictions" -- confirm how the bounds were derived.
    DEFAULT_CLIP_MIN = 0.141022
    DEFAULT_CLIP_MAX = 0.808657

    @staticmethod
    def _rmse(y_true, y_pred):
        """Return the root mean squared error between two equally long 1-D inputs.

        Implemented with numpy so it does not depend on the ``squared=False``
        argument of ``sklearn.metrics.mean_squared_error``, which newer
        scikit-learn releases no longer accept.
        """
        diff = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
        return float(np.sqrt(np.mean(diff ** 2)))

    def get_processed_data(self):
        """Load both CSV files and build the engineered feature set.

        Features are computed on the concatenation of train and test rows, so
        the count features and the ``night_bin`` quantile edges use both sets.

        Returns:
            tuple: ``(train, test, features)`` where ``train`` and ``test`` are
            DataFrames holding the original and engineered columns, and
            ``features`` is the list of column names used as model inputs.
        """
        self.train_path = 'Train.csv'
        self.test_path = 'Test.csv'

        train = pd.read_csv(self.train_path)
        test = pd.read_csv(self.test_path)
        n_train = len(train)

        data = pd.concat([train, test]).reset_index(drop=True)

        ## Count of unique features
        for column in ['country', 'year', 'urban_or_rural']:
            data['count_' + column] = data[column].map(data[column].value_counts())

        ## Combination features
        data['all_ghsl'] = (data['ghsl_built_1975_to_1990'] + data['ghsl_built_pre_1975']
                            + data['ghsl_built_1990_to_2000'] + data['ghsl_built_2000_to_2014'])
        data['all_landcover_fraction'] = data['landcover_crops_fraction'] + data['landcover_urban_fraction']
        data['all_waters'] = (data['landcover_water_permanent_10km_fraction']
                              + data['landcover_water_seasonal_10km_fraction'])

        data['night_bin'] = pd.qcut(data['nighttime_lights'], 8, labels=False, duplicates='drop')

        # Split by position rather than by ID membership: the train rows are the
        # first ``n_train`` rows of the concatenation, so a test row that shares
        # an ID with a train row can no longer end up in the training frame.
        train = data.iloc[:n_train].copy()
        test = data.iloc[n_train:].copy()

        features = [x for x in train.columns if x not in self.NON_FEATURE_COLUMNS]
        return train, test, features

    def visualizer(self, continuous_feature1, continuous_feature2, data=None):
        """Draw two side-by-side scatter plots of a feature against ``Target``.

        Args:
            continuous_feature1: Column plotted on the left axis.
            continuous_feature2: Column plotted on the right axis.
            data: DataFrame containing both columns and ``Target``. When
                omitted, the processed training frame is loaded; the method
                previously read an undefined global and could not run.
        """
        if data is None:
            data, _, _ = self.get_processed_data()

        fig, axes = plt.subplots(ncols=2, figsize=(24, 6), dpi=100, constrained_layout=True)
        title_size = 18
        label_size = 18

        for ax, feature in zip(axes, (continuous_feature1, continuous_feature2)):
            sns.scatterplot(x=data[feature], y=data['Target'], ax=ax)
            ax.set_title(f'{feature} vs Target', size=title_size, pad=title_size)
            ax.set_xlabel('')
            ax.set_ylabel('')
            ax.tick_params(axis='x', labelsize=label_size)
            ax.tick_params(axis='y', labelsize=label_size)

        plt.show()

    def get_model(self, Name='catboost'):
        """Return a fresh, unfitted regressor.

        Args:
            Name: ``'catboost'`` or ``'lgbm'``; any other value (the notebook
                uses ``'xgb'``) yields the XGBoost regressor.
        """
        if Name == 'catboost':
            return CatBoostRegressor(learning_rate=0.1, max_depth=4, iterations=2000,
                                     colsample_bylevel=0.6, reg_lambda=4, subsample=0.85)
        elif Name == 'lgbm':
            return LGBMRegressor(**{'objective': 'regression', 'boosting_type': 'gbdt', 'metric': 'rmse',
                                    'learning_rate': 0.05, 'num_iterations': 1500, 'max_depth': 4,
                                    'num_leaves': 150, 'max_bins': 85, 'min_data_in_leaf': 30,
                                    'reg_lambda': 75})
        else:
            return XGBRegressor(objective='reg:tweedie', eval_metric='rmse', n_estimators=2000,
                                learning_rate=0.05, max_depth=4)

    def CrossValidationTraining(self, KFOLD, EarlyStopping=50, Model_Name='catboost'):
        """Train one model per fold and collect out-of-fold and test predictions.

        Args:
            KFOLD: Splitter exposing ``split(X, y)`` and ``n_splits``.
            EarlyStopping: Passed to ``fit`` as ``early_stopping_rounds``.
            Model_Name: Forwarded to :meth:`get_model`.

        Returns:
            tuple: ``(test_predictions, oofs)`` -- test predictions averaged
            over the folds, and one out-of-fold prediction per training row.
        """
        train, test, features = self.get_processed_data()

        self.X = train[features]
        self.y = train['Target']
        self.test = test[features]

        self.folds = KFOLD
        self.Model_Name = Model_Name

        oofs = np.zeros(len(self.X))
        test_predictions = np.zeros(len(self.test))

        for fold_, (trn_idx, val_idx) in enumerate(self.folds.split(self.X, self.y)):
            X_trn, y_trn = self.X.iloc[trn_idx], self.y.iloc[trn_idx]
            X_val, y_val = self.X.iloc[val_idx], self.y.iloc[val_idx]

            # NOTE(review): newer xgboost / lightgbm releases may no longer accept
            # early_stopping_rounds / verbose in fit() -- confirm installed versions.
            clf = self.get_model(Name=self.Model_Name)
            clf.fit(X_trn, y_trn, eval_set=[(X_val, y_val)],
                    verbose=0, early_stopping_rounds=EarlyStopping)

            vp = clf.predict(X_val)
            oofs[val_idx] = vp
            val_score = self._rmse(y_val, vp)
            print(4*'-- -- -- --')
            print(f'Fold {fold_+1} Val score: {val_score}')
            print(4*'-- -- -- --')

            # Each fold contributes an equal share of the final test prediction.
            test_predictions += clf.predict(self.test) / self.folds.n_splits

        print()
        print(3*'###', 10*"^", 3*'###')
        print(self._rmse(self.y, oofs))
        return test_predictions, oofs

    def create_submission(self, predictions, dir_path,
                          clip_min=DEFAULT_CLIP_MIN, clip_max=DEFAULT_CLIP_MAX):
        """Write clipped test predictions to ``f'{dir_path}.csv'``.

        Args:
            predictions: One prediction per test row, in test-file order.
            dir_path: Output path without the ``.csv`` extension.
            clip_min: Lower bound applied to the predictions.
            clip_max: Upper bound applied to the predictions.

        Returns:
            None (the result of ``DataFrame.to_csv`` when given a path).

        Raises:
            ValueError: If the number of predictions differs from the number
                of test rows.
        """
        _, test, _ = self.get_processed_data()
        if len(predictions) != len(test):
            raise ValueError(
                f'expected {len(test)} predictions (one per test row), got {len(predictions)}')

        submission = pd.DataFrame({
            'ID': test['ID'].values,
            'Target': np.clip(predictions, clip_min, clip_max),
        })
        return submission.to_csv(f'{dir_path}.csv', index=False)

    def StackingData(self, test_predictions: list, oof_predictions: list):
        """Assemble the meta-model frames from base-model predictions.

        Both lists must be ordered ``[catboost, lgbm, xgb]``.

        Returns:
            tuple: ``(stacking_train, stacking_test)``; the training frame also
            carries the ``Target`` column.
        """
        train, _, _ = self.get_processed_data()
        self.y = train['Target']

        names = ['preds_cat', 'preds_lgb', 'preds_xgb']
        stacking_train = pd.DataFrame(
            {name: np.asarray(oof_predictions[i]) for i, name in enumerate(names)})
        # Assign by position: the predictions are positional arrays, so the
        # target must not be re-aligned on the pandas index.
        stacking_train['Target'] = self.y.values

        stacking_test = pd.DataFrame(
            {name: np.asarray(test_predictions[i]) for i, name in enumerate(names)})

        return stacking_train, stacking_test

    def StackingRegressor(self, KFOLD, test_predictions: list, oof_predictions: list):
        """Fit a Ridge meta-model per fold on the base-model predictions.

        The same clipping is applied to out-of-fold and test predictions, so
        the reported score reflects the post-processing used on the test set.

        Args:
            KFOLD: Splitter exposing ``split(X, y)``.
            test_predictions: Base-model test predictions ``[catboost, lgbm, xgb]``.
            oof_predictions: Base-model out-of-fold predictions, same order.

        Returns:
            tuple: ``(oof_stack, stacked_test_predictions)`` where the test
            predictions are the mean over the per-fold meta-models.
        """
        stacking_train, stacking_test = self.StackingData(test_predictions, oof_predictions)

        cols = ['preds_cat', 'preds_xgb', 'preds_lgb']
        X = stacking_train[cols].values
        y = stacking_train['Target'].values
        Test = stacking_test[cols].values

        oof_stack = np.zeros(len(X))
        final_preds = []

        for train_index, test_index in KFOLD.split(X, y):
            model = Ridge(alpha=0.01, random_state=42)
            model.fit(X[train_index], y[train_index])

            oof_stack[test_index] = np.clip(model.predict(X[test_index]), a_min=0, a_max=500)
            final_preds.append(np.clip(model.predict(Test), a_min=0, a_max=500))

        print(2*'--------------------------------------')
        print('STACKING RMSE', self._rmse(y, oof_stack))

        return oof_stack, np.mean(final_preds, axis=0)

In [10]:
# One pipeline object and one seeded 10-fold splitter, reused by every training and stacking call below.
economic_well_being = EconomicWellBeing()
folds = KFold(n_splits=10, shuffle=True, random_state=2021)

In [11]:
# CatBoost base learner: averaged test predictions plus out-of-fold predictions.
pred_catboost, oof_catboost = economic_well_being.CrossValidationTraining(
    KFOLD=folds,
    EarlyStopping=50,
    Model_Name='catboost',
)

-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 1 Val score: 0.08740575480780272
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 2 Val score: 0.09228390195020524
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 3 Val score: 0.08252703355562416
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 4 Val score: 0.08545380313323937
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 5 Val score: 0.08611435262214312
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 6 Val score: 0.08526243410953432
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 7 Val score: 0.08786499994974374
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 8 Val score: 0.08472134639129786
-- -- -- ---- -- -- -

In [13]:
# LightGBM base learner: averaged test predictions plus out-of-fold predictions.
pred_lgbm, oof_lgbm = economic_well_being.CrossValidationTraining(
    KFOLD=folds,
    EarlyStopping=50,
    Model_Name='lgbm',
)

-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 1 Val score: 0.08817126840680875
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 2 Val score: 0.09282579687752102
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 3 Val score: 0.08235600548633652
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 4 Val score: 0.08615323141837544
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 5 Val score: 0.08590692024041607
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 6 Val score: 0.0850538550185027
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 7 Val score: 0.08790611641387264
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 8 Val score: 0.0849329884150518
-- -- -- ---- -- -- ---

In [14]:
# XGBoost base learner (any name other than 'catboost'/'lgbm' selects it), with a longer early-stopping patience.
pred_xgb, oof_xgb = economic_well_being.CrossValidationTraining(
    KFOLD=folds,
    EarlyStopping=100,
    Model_Name='xgb',
)

-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 1 Val score: 0.0883724630791374
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 2 Val score: 0.09255385935372096
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 3 Val score: 0.08368152932395757
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 4 Val score: 0.0876399942933459
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 5 Val score: 0.0869107384396163
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 6 Val score: 0.0857613829684683
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 7 Val score: 0.08789134829700239
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
-- -- -- ---- -- -- ---- -- -- ---- -- -- --
Fold 8 Val score: 0.08503122458296207
-- -- -- ---- -- -- ---- 

In [15]:
# Both lists must follow the order catboost, lgbm, xgb: StackingData reads them by position.
oofs_pred = [oof_catboost, oof_lgbm, oof_xgb]
test_pred = [pred_catboost, pred_lgbm, pred_xgb]
oof_stack, stack_preds = economic_well_being.StackingRegressor(
    KFOLD=folds,
    test_predictions=test_pred,
    oof_predictions=oofs_pred,
)

----------------------------------------------------------------------------
STACKING RMSE 0.08598758876647195


In [16]:
# Writes the clipped stacked predictions to 'Winning_Solution.csv' (the '.csv' suffix is appended by the method).
economic_well_being.create_submission(stack_preds,'Winning_Solution')