In [1]:
import os
import time
from math import sqrt
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Load Data

In [2]:
# Read the competition's training and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/duth-dbirlab2-1/train.csv',
        '../input/duth-dbirlab2-1/test.csv',
    )
)

## Encode Categorical

Χρήση παραλλαγής της κωδικοποίησης categorical features, ώστε να έχουμε πιο εύλογη αντικατάσταση των ειδών του εδάφους και όχι τυχαία ανάθεση τιμών

In [None]:
# Custom substrate labelling
# Hand-assigned integer code for every known `substrateType` label. These replace
# the arbitrary alphabetical codes a generic label encoder would produce.
SUBSTRATE_CODES = {
    "Unknown": 0,
    "Rock or other hard substrata": 1,
    'Coarse and mixed sediment': 2,
    'Fine mud': 3,
    'Sandy mud': 4,
    'Muddy sand': 5,
    'Sand': 6,
    'Cymodocea nodosa meadows': 7,
    'Posidonia oceanica meadows': 8,
}


def encode_substrate(df):
    """Replace the `substrateType` labels of `df` with their SUBSTRATE_CODES value.

    The frame is modified in place and also returned. Values that are not keys of
    SUBSTRATE_CODES (including codes written by a previous run of this cell) are
    left unchanged, so the operation is safe to repeat. Frames without a
    `substrateType` column are returned untouched.
    """
    if 'substrateType' in df.columns:
        df['substrateType'] = df['substrateType'].map(
            lambda label: SUBSTRATE_CODES.get(label, label)
        )
    return df


# Both frames must share the same coding: a model trained on these codes would
# otherwise see unrelated numbers for the same substrate at prediction time.
for frame in (train, test):
    encode_substrate(frame)
      

Πρόχειρη αντιμετώπιση των outliers: οι τιμές με πολύ μεγάλη απόλυτη τιμή, δηλαδή μεγαλύτερη του 10^5, αντικαθίστανται με τη διάμεσο της στήλης τους

In [None]:
# Values whose magnitude exceeds this limit are treated as outliers.
OUTLIER_LIMIT = 1e5

# Row positions of every replaced value, collected column by column
# (a row appears once per column in which it held an outlier).
excludeList = []
# Only numeric columns can be compared against the limit; text columns are skipped.
for col in train.select_dtypes(include='number').columns:
    column = train[col]
    is_outlier = column.abs() > OUTLIER_LIMIT
    if not is_outlier.any():
        continue
    excludeList.extend(np.flatnonzero(is_outlier.to_numpy()).tolist())
    # The median is taken over the column as it was before any replacement,
    # outliers included. Assigning a new column (instead of writing through
    # `.values`) guarantees the frame itself is updated; an integer column is
    # upcast to float when the median is fractional.
    train[col] = column.where(~is_outlier, np.median(column.to_numpy()))
print(excludeList)
print(len(excludeList))

Παραλλαγή της κλασικής μεθόδου feature engineering binning, προσαρμοσμένη στα δεδομένα μας

In [None]:
#custom binning
def customBin(element,df,lb,ub):
    """Build a 0/1 indicator column telling whether `df[element]` lies strictly inside (lb, ub).

    Parameters
    ----------
    element : str
        Name of the column of `df` to bin.
    df : pandas.DataFrame
        Frame holding the column; it is not modified.
    lb, ub : number
        Exclusive lower and upper bounds. A value equal to either bound, or a
        missing value, yields 0. If `lb >= ub` no value can qualify and the
        whole column is 0.

    Returns
    -------
    pandas.DataFrame
        Single integer column named ``element + "_cb"``, carrying the same index
        as `df` so that ``pd.concat([df, result], axis=1)`` lines up row by row
        whatever index `df` has.
    """
    values = df[element]
    inside = (values > lb) & (values < ub)
    return inside.astype(int).to_frame(name=element + "_cb")

Ακολουθεί και η εφαρμογή της μεθόδου binning σε συγκεκριμένες μόνο στήλες του dataset, όπως καταλήξαμε ότι συνιστάται από την έρευνά μας

In [None]:
print(train.shape)

# (column, lower bound, upper bound) for every indicator feature to create.
# Bounds are exclusive and must be given as lower < upper: the bathymetry
# range is written as (-400, 0) because the reverse order can never match.
BIN_SPECS = [
    ('temperatureSurface_quantile_5', 6, 16),
    ('Center Lat', 32.5, 46),
    ('majorRiversScale', 6, 10),
    ('Center Long', -10, 26),
    ('dissolvedOxygen100_300_Moving_skew_3_mean', -10, -4),
    ('dissolvedOxygenSurface_Moving_skew_6_mean', -5, -2),
    ('nitrate100_300_Expanding_skew_mean', -10, -4),
    ('secchiDiskDepth_Moving_skew_3_mean', -150, -80),
    ('bathymetry', -400, 0),
]


def _with_bin_flags(df):
    """Return `df` extended with one `<column>_cb` indicator per BIN_SPECS entry.

    Indicators that already exist in `df` are not added again, so re-running
    the cell does not create duplicate columns.
    """
    flags = [
        customBin(column, df, lb, ub)
        for column, lb, ub in BIN_SPECS
        if column + '_cb' not in df.columns
    ]
    return pd.concat([df] + flags, axis=1)


# The same specs are applied to both frames so their feature columns match.
train = _with_bin_flags(train)
test = _with_bin_flags(test)


Στη συνέχεια της εφαρμογής του feature selection μας, θα επιλέξουμε να κρατήσουμε μόνο συγκεκριμένες στήλες από το dataframe μας. Αρχικά κάνουμε drop όσες στήλες έχουν numerical correlation (κατ' απόλυτη τιμή) με το ζητούμενό μας μικρότερο του 0.3

In [None]:
# Columns whose absolute correlation with the target is below this are excluded.
MIN_ABS_CORR = 0.3

num = train.select_dtypes(exclude='object')
# Correlate every column with the target directly; the full column-by-column
# correlation matrix is never needed here.
target_corr = num.corrwith(num['Overall Probability']).abs().sort_values(ascending=True)

# Columns with an undefined (NaN) correlation do not satisfy `< MIN_ABS_CORR`
# and are therefore not listed. The target itself correlates 1.0 with itself.
cols_to_exclude = list(target_corr[target_corr < MIN_ABS_CORR].index)
print(len(cols_to_exclude))

Σε συνέχεια του προηγούμενου block, επιλέγουμε να ρίξουμε ενδεικτικά τις 1000 στήλες με τις μικρότερες τυπικές αποκλίσεις

In [None]:
# How many of the lowest-variance columns to add to the exclusion list.
N_LOW_STD = 1000

# Standard deviation of every numeric column, indexed by column name.
stds_train = train.select_dtypes(include='number').std(axis=0)

# Pick the columns by label rather than by position: positions inside
# `stds_train` do not match positions inside `train.columns` whenever a
# non-numeric column is present. `sort_values` places NaN deviations last,
# so they are only reached if fewer than N_LOW_STD valid columns exist.
low_std_cols = stds_train.sort_values(ascending=True).index[:N_LOW_STD]
cols_to_exclude.extend(low_std_cols)


In [None]:
# Integer-encode the remaining text columns of both frames.
# One encoder is fitted per column on the values of train AND test together,
# so a given category receives the same code in both frames.
for c in dict.fromkeys([*train.columns, *test.columns]):
    if c == 'obs_id':
        continue
    to_encode = []
    for df in (train, test):
        if c not in df.columns or df[c].dtype != 'object':
            continue
        as_number = pd.to_numeric(df[c], errors='coerce')
        if as_number.notna().all():
            # The column already holds only numeric codes (for example codes
            # assigned by hand earlier): keep those values, just fix the dtype.
            df[c] = as_number
        else:
            to_encode.append(df)
    if not to_encode:
        continue
    lbl = LabelEncoder()
    # Values are compared as strings so mixed-type columns sort consistently.
    lbl.fit(pd.concat([df[c] for df in to_encode]).astype(str))
    for df in to_encode:
        df[c] = lbl.transform(df[c].astype(str))

Ακολουθεί μία μέθοδος optimization των μεγεθών dataframes, για βελτιστοποίηση της προσπέλασης τους

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of `df` to the narrowest dtype that holds their values.

    The frame is modified in place and also returned. Per column:

    * signed integers   -> smallest of int8 / int16 / int32 / int64
    * unsigned integers -> smallest of uint8 / uint16 / uint32 / uint64
    * floats            -> smallest of float16 / float32 / float64 whose range
      contains the column's min and max (narrowing a float reduces precision)
    * object            -> ``category``
    * anything else (bool, datetime, categorical, extension dtypes) and columns
      whose min/max is missing are left unchanged

    Range checks are inclusive, so a column whose extreme equals a type's limit
    (e.g. 127 for int8) still fits that type. Memory use before and after is
    printed in MB.
    """
    def _narrowest(lo, hi, candidates, limits):
        # First candidate dtype (ordered narrow -> wide) whose range covers [lo, hi].
        for candidate in candidates:
            bounds = limits(candidate)
            if bounds.min <= lo and hi <= bounds.max:
                return candidate
        return None

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Dispatch on the numpy kind code instead of the dtype's name prefix,
        # which misclassified 'uint*' and 'bool' columns as floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if col_type.kind == 'i':
            target = _narrowest(c_min, c_max,
                                (np.int8, np.int16, np.int32, np.int64), np.iinfo)
        elif col_type.kind == 'u':
            target = _narrowest(c_min, c_max,
                                (np.uint8, np.uint16, np.uint32, np.uint64), np.iinfo)
        else:
            target = _narrowest(c_min, c_max,
                                (np.float16, np.float32, np.float64), np.finfo)

        # `target` is None when no candidate covers the range (e.g. infinities).
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

## Train Models

In [None]:
# Some useful parameters which will come in handy later on
ntrain = train.shape[0] # or len(train)
ntest = test.shape[0] # or len(test)
SEED = 3999 # for reproducibility
NFOLDS = 10 # set folds for out-of-fold prediction
folds = KFold(n_splits= NFOLDS, random_state=SEED, shuffle=True)

In [None]:
# cols_to_exclude = ['obs_id','Overall Probability','median','quantile_1','quantile_10','quantile_90','quantile_99']
# Name fragments that mark a column for removal, in addition to the column
# names already collected in `cols_to_exclude`. Added only when missing so that
# re-running this cell does not keep growing the list.
NAME_PATTERNS = [
    'skew',
    'std',
    'kurt',
    'median',
    'dissolvedOxygenSurface_mad',
    'euphoticDepth_mad',
    'euphoticDepthiqr',
    'euphoticDepthiqr1',
    'quantile',
]
for pattern in NAME_PATTERNS:
    if pattern not in cols_to_exclude:
        cols_to_exclude.append(pattern)

# Never features: the row identifier and the prediction target.
cols_to_exclude_default = ['obs_id','Overall Probability']

# Every entry of `cols_to_exclude` is matched as a SUBSTRING of the column
# name, so excluding 'X' also removes 'X_cb', 'X_mean', and so on.
exclusion_fragments = tuple(dict.fromkeys(cols_to_exclude))
dftc = [
    c for c in train.columns
    if c not in cols_to_exclude_default
    and not any(fragment in c for fragment in exclusion_fragments)
]

# Features that are kept regardless of the filters above. Each is added only
# if the filter removed it, so no feature column appears twice.
FORCED_FEATURES = [
    'distanceToCoast',
    'majorRiversScale',
    'temperatureSurface_quantile_5_cb',
    'Center Lat_cb',
    'Center Long_cb',
    'majorRiversScale_cb',
]
for forced in FORCED_FEATURES:
    if forced not in dftc:
        dftc.append(forced)


# r_train = reduce_mem_usage(train)
# r_test = reduce_mem_usage(test)

y_train = train['Overall Probability'].to_numpy()  # target as a 1-D numpy array
x_train = train[dftc].values  # feature matrix as a numpy array
x_test = test[dftc].values
print(x_train.shape)

In [None]:
# Show the final list of feature column names passed to the model.
print(dftc)

In [None]:
def train_model(X_train, X_test, Y_train, folds=5, model_type='lgb',plot_feature_importance=True, feature_names=None):
    """Train one model per cross-validation fold and average its test predictions.

    Parameters
    ----------
    X_train : array-like, shape (n_train, n_features)
        Training feature matrix.
    X_test : array-like, shape (n_test, n_features)
        Feature matrix to predict.
    Y_train : array-like, shape (n_train,)
        Training target.
    folds : int or splitter object
        Either a number of folds (a shuffled ``KFold`` is created) or any object
        exposing ``split(X, y)`` that yields (train_index, valid_index) pairs.
    model_type : {'lgb', 'linear', 'rf'}
        Which regressor to fit in every fold.
    plot_feature_importance : bool
        For 'lgb' only: draw a bar plot of the 50 features with the highest
        mean importance over the folds.
    feature_names : sequence of str, optional
        Labels for the importance table. When omitted, the notebook-level
        `dftc` list is used if its length matches the number of features,
        otherwise generic ``f0, f1, ...`` names are generated.

    Returns
    -------
    oof : numpy.ndarray, shape (n_train,)
        Out-of-fold prediction for every training row.
    prediction : numpy.ndarray, shape (n_test,)
        Test prediction averaged over the folds that were run.

    Raises
    ------
    ValueError
        If `model_type` is not one of the supported values.

    Notes
    -----
    Only the 'lgb' predictions are clipped to [0, 1]; 'linear' and 'rf'
    predictions are returned as produced. The per-fold RMSE mean and standard
    deviation are printed.
    """
    if model_type not in ('linear', 'rf', 'lgb'):
        raise ValueError("model_type must be 'linear', 'rf' or 'lgb', got {!r}".format(model_type))

    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    Y_train = np.asarray(Y_train)

    # The documented default is an integer, which has no .split(); build a splitter.
    if isinstance(folds, (int, np.integer)):
        folds = KFold(n_splits=int(folds), shuffle=True, random_state=0)

    if feature_names is None:
        notebook_features = globals().get('dftc')
        if notebook_features is not None and len(notebook_features) == X_train.shape[1]:
            feature_names = list(notebook_features)
        else:
            feature_names = ['f{}'.format(i) for i in range(X_train.shape[1])]

    # Same settings for every fold, so built once outside the loop.
    # NOTE(review): several keys look like LightGBM aliases of one another
    # (lambda_l1 / reg_alpha, min_sum_hessian_in_leaf / min_child_weight,
    # bagging_fraction / subsample) with differing values - confirm which of
    # each pair is meant to apply.
    lgb_params = {
                 'num_leaves':20,
                 'min_data_in_leaf': 20,
                 'min_sum_hessian_in_leaf': 11,
                 'objective': 'regression',
                 'max_depth': 20,
                 'learning_rate': 0.005,
                 'boosting': "gbdt",
                 'feature_fraction': 0.8,
                 'feature_fraction_seed': 9,
                 'max_bin': 1000,  # key previously carried a trailing space and was not applied
                 "bagging_freq": 5,
                 "bagging_fraction": 0.8,
                 "bagging_seed": 9,
                 'metric': 'rmse',
                 'lambda_l1': 0.1,
                 'verbosity': -1,
                 'min_child_weight': 5.34,
                 'reg_alpha': 1.130,
                 'reg_lambda': 0.360,
                 'subsample': 0.8,
                 }

    # Sized from the inputs rather than from notebook-level row counts.
    oof = np.zeros(X_train.shape[0])
    prediction = np.zeros(X_test.shape[0])
    scores = []
    importance_frames = []
    n_folds_run = 0

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train,Y_train)):
        print('Fold', fold_n+1, 'started at', time.ctime())
        x_train, x_valid = X_train[train_index], X_train[valid_index]
        y_train, y_valid = Y_train[train_index], Y_train[valid_index]

        if model_type == 'linear':
            model = LinearRegression()
            model.fit(x_train, y_train)
            y_pred_valid = model.predict(x_valid)
            y_pred = model.predict(X_test)

        elif model_type == 'rf':
            model = RandomForestRegressor(min_weight_fraction_leaf=0.05,n_jobs=-2,random_state=0,
                                          max_depth=4,
                                          n_estimators=100)
            model.fit(x_train, y_train)
            y_pred_valid = model.predict(x_valid)
            y_pred = model.predict(X_test)

        else:  # 'lgb'
            model = lgb.LGBMRegressor(**lgb_params, n_estimators = 100000, n_jobs = -1)
            # NOTE(review): the `verbose` and `early_stopping_rounds` keyword
            # arguments of fit() belong to the older LightGBM sklearn API;
            # confirm they are accepted by the installed version.
            model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_valid, y_valid)], eval_metric='rmse',verbose=10000, early_stopping_rounds=1000)

            y_pred_valid = model.predict(x_valid)
            y_pred_valid = np.clip(y_pred_valid, a_min=0, a_max=1)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
            y_pred = np.clip(y_pred, a_min=0, a_max=1)

            # Per-fold feature importance, concatenated after the loop.
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = feature_names
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            importance_frames.append(fold_importance)

        oof[valid_index] = y_pred_valid.reshape(-1,)
        scores.append(mean_squared_error(y_valid, y_pred_valid) ** 0.5)
        prediction += y_pred
        n_folds_run += 1

    if model_type == 'lgb' and plot_feature_importance and importance_frames:
        feature_importance = pd.concat(importance_frames, axis=0)

        cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
            by="importance", ascending=False)[:50].index

        best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

        plt.figure(figsize=(16, 12))
        sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
        plt.title('LGB Features (avg over folds)')

    # Average over the folds actually executed, not over a notebook-level constant.
    prediction /= n_folds_run
    print('CV mean score: {0:.5f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    return oof, prediction

In [None]:
# Fit the LightGBM model across the folds defined earlier; `oof` receives the
# out-of-fold training predictions and `prediction` the fold-averaged test predictions.
oof, prediction = train_model(X_train=x_train, X_test=x_test, Y_train=y_train, folds=folds, model_type='lgb', plot_feature_importance=True)

## Create Submission File

In [None]:
# Assemble the submission table: identifiers come from the sample file and the
# predicted values are bounded to the [0, 1] interval before being written out.
sample_submission = pd.read_csv('../input/duth-dbirlab2-1/sample_submission.csv')
sub_df = pd.DataFrame({
    "obs_id": sample_submission["obs_id"].values,
    "Overall Probability": prediction,
})
sub_df["Overall Probability"] = sub_df["Overall Probability"].clip(lower=0, upper=1)
sub_df.to_csv("submission.csv", index=False)