<a href="https://colab.research.google.com/github/antonysama/Projects_w._time_series/blob/master/Fisheries_feature_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

References:
https://github.com/solegalli/feature-selection-for-machine-learning/blob/master/06-Filter-other-metrics/06.2-Method-used-in-a-KDD-competition.ipynb

In [None]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
# Mount Google Drive at /content/drive so files under MyDrive can be read by this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the dataset, reading only the columns used in this analysis.
fields = [
    'SPECIES', 'STORAGE_TYPE', 'GEAR_NME', 'MGMT_AREA', 'LICENCE_AREA', 'DATE',
    'Vessel ID Number', 'GEAR_CDE', 'SUB_AREA', 'SA_ORIGIN', 'Nominal_Value',
    'KG_PIECE', 'Nominal_PRICE_PIECE',
]
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines='skip'` is its replacement and likewise skips malformed rows.
# The absolute path matches the mount point used by drive.mount('/content/drive'),
# so the cell no longer depends on the current working directory.
data = pd.read_csv('/content/drive/MyDrive/q2.csv', on_bad_lines='skip', usecols=fields)
print(data.shape)
data.head(2)

(566739, 13)


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,SPECIES,STORAGE_TYPE,GEAR_NME,MGMT_AREA,LICENCE_AREA,DATE,Vessel ID Number,GEAR_CDE,SUB_AREA,SA_ORIGIN,KG_PIECE,Nominal_PRICE_PIECE,Nominal_Value
0,CHUM SALMON,UNKNOWN,SEINE,13,SALMON AREA B,10/17/2016,4856,20,28,LOG_BOOK,4.391591,8.540229,227298.1995
1,CHUM SALMON,UNKNOWN,SEINE,14,SALMON AREA B,11/6/2011,3980,20,4,LOG_BOOK,4.595482,8.61152,215287.9969


In [None]:
# Break the "month/day/year" DATE string into three separate columns,
# then remove the original DATE column from the frame.
date_parts = data["DATE"].str.split("/", expand=True)
data[["month", "day", "year"]] = date_parts
data.drop(columns=['DATE'], inplace=True)  # author's note: next time, don't drop 'day'
data.head(2)


Unnamed: 0,SPECIES,STORAGE_TYPE,GEAR_NME,MGMT_AREA,LICENCE_AREA,Vessel ID Number,GEAR_CDE,SUB_AREA,SA_ORIGIN,KG_PIECE,Nominal_PRICE_PIECE,Nominal_Value,month,day,year
0,CHUM SALMON,UNKNOWN,SEINE,13,SALMON AREA B,4856,20,28,LOG_BOOK,4.391591,8.540229,227298.1995,10,17,2016
1,CHUM SALMON,UNKNOWN,SEINE,14,SALMON AREA B,3980,20,4,LOG_BOOK,4.595482,8.61152,215287.9969,11,6,2011


In [None]:
# Fraction of missing values in each column (0.0 means no missing values).
data.isnull().mean()

SPECIES                0.0
STORAGE_TYPE           0.0
GEAR_NME               0.0
MGMT_AREA              0.0
LICENCE_AREA           0.0
Vessel ID Number       0.0
GEAR_CDE               0.0
SUB_AREA               0.0
SA_ORIGIN              0.0
KG_PIECE               0.0
Nominal_PRICE_PIECE    0.0
Nominal_Value          0.0
month                  0.0
day                    0.0
year                   0.0
dtype: float64

In [None]:
# After inspecting the inferred dtypes, adjust them where needed:
# the area / gear codes are labels rather than quantities, and the
# date parts produced by the string split must become integers.
dtype_by_column = {
    'MGMT_AREA': object,
    'GEAR_CDE': object,
    'SUB_AREA': object,
    'month': int,
    'day': int,
    'year': int,
}
for col, new_dtype in dtype_by_column.items():
    data[col] = data[col].astype(new_dtype)
data.dtypes

SPECIES                 object
STORAGE_TYPE            object
GEAR_NME                object
MGMT_AREA               object
LICENCE_AREA            object
Vessel ID Number        object
GEAR_CDE                object
SUB_AREA                object
SA_ORIGIN               object
KG_PIECE               float64
Nominal_PRICE_PIECE    float64
Nominal_Value          float64
month                    int64
day                      int64
year                     int64
dtype: object

In [None]:
# Categorical variables are the columns stored with the object dtype.
categorical = [name for name, dtype in data.dtypes.items() if dtype == 'O']
print('There are {} categorical variables'.format(len(categorical)))
categorical

There are 9 categorical variables


['SPECIES',
 'STORAGE_TYPE',
 'GEAR_NME',
 'MGMT_AREA',
 'LICENCE_AREA',
 'Vessel ID Number',
 'GEAR_CDE',
 'SUB_AREA',
 'SA_ORIGIN']

In [None]:
# Numerical variables are all the columns that are not object-typed.
numerical = [name for name, dtype in data.dtypes.items() if dtype != 'O']
numerical

['KG_PIECE', 'Nominal_PRICE_PIECE', 'Nominal_Value', 'month', 'day', 'year']

### Feature Selection on Categorical Variables

In [None]:
# Train/test split for the categorical features.
# The target column is deliberately kept inside X (as well as being y),
# so that per-category target means can be computed from the feature frame.
categorical_with_target = data[categorical + ['Nominal_Value']]
target_values = data['Nominal_Value']

X_train, X_test, y_train, y_test = train_test_split(
    categorical_with_target, target_values, test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((396717, 10), (170022, 10))

### Replace the categories of each variable with the mean of the target variable

In [None]:
# function that determines the target mean per category

def mean_encoding(df_train, df_test, categorical, target='Nominal_Value'):
    """Replace each category with the training-set mean of the target.

    For every column in ``categorical`` the mean of ``target`` per category is
    computed on ``df_train`` only, and that mapping is applied to both frames.
    The input frames are not modified.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training data; must contain the ``categorical`` columns and ``target``.
    df_test : pandas.DataFrame
        Test data; must contain the ``categorical`` columns. The ``target``
        column is dropped from the result if present.
    categorical : list of str
        Names of the columns to encode.
    target : str, default 'Nominal_Value'
        Name of the target column used to compute the category means.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded copies of the training and test frames, without the target
        column. Categories that never occur in ``df_train`` become NaN.
    """
    # work on copies so the caller's frames stay untouched
    df_train_temp = df_train.copy()
    df_test_temp = df_test.copy()

    for col in categorical:
        # category -> mean target value, learned from the training data only
        target_mean_dict = df_train.groupby(col)[target].mean().to_dict()

        # replace the categories by the mean of the target
        df_train_temp[col] = df_train[col].map(target_mean_dict)
        df_test_temp[col] = df_test[col].map(target_mean_dict)

    # drop the target from the datasets (the test frame may legitimately lack it)
    df_train_temp = df_train_temp.drop(columns=[target])
    df_test_temp = df_test_temp.drop(columns=[target], errors='ignore')

    # return remapped datasets
    return df_train_temp, df_test_temp

In [None]:
# Encode the categorical columns of both splits; the result no longer contains the target column.
X_train_enc, X_test_enc = mean_encoding(X_train, X_test, categorical)

X_train_enc.head(2)

Unnamed: 0,SPECIES,STORAGE_TYPE,GEAR_NME,MGMT_AREA,LICENCE_AREA,Vessel ID Number,GEAR_CDE,SUB_AREA,SA_ORIGIN
206832,329.345449,255.3737,251.201689,155.40082,247.201798,236.121252,255.3737,346.719967,107.980496
317348,180.609684,255.3737,251.201689,180.97203,224.081796,133.210774,255.3737,336.894549,107.980496


In [None]:
# Preview of the encoded test set.
X_test_enc.head(2)

Unnamed: 0,SPECIES,STORAGE_TYPE,GEAR_NME,MGMT_AREA,LICENCE_AREA,Vessel ID Number,GEAR_CDE,SUB_AREA,SA_ORIGIN
253449,1336.932097,247.574219,251.201689,1230.927365,778.415119,197.940963,247.574219,2242.523098,107.980496
22329,1336.932097,799.097647,386.870942,1140.309955,637.259081,649.823505,386.870942,1138.786652,857.256263


#### The strings were replaced by the target mean.

### Determine the R2 using the encoded variable values as predictions

In [None]:
# Missing values in the encoded test set come from categories that are present
# in the test set but absent from the train set, so they have no target mean.
X_test_enc.isnull().mean()

SPECIES             0.000000
STORAGE_TYPE        0.000000
GEAR_NME            0.000000
MGMT_AREA           0.000000
LICENCE_AREA        0.000000
Vessel ID Number    0.000012
GEAR_CDE            0.000000
SUB_AREA            0.000012
SA_ORIGIN           0.000000
dtype: float64

In [None]:
# For expediency, unseen categories (NaN after encoding) are given the value 0.
X_test_enc.fillna(value=0, inplace=True)

In [None]:
# Score each categorical feature on its own: its mean-encoded test values are
# used directly as predictions of the target and compared with y_test via R2.
r2 = [r2_score(y_test, X_test_enc[feature]) for feature in categorical]

In [None]:
# Put the scores in a Series labelled by feature and show the best features first.
m1 = pd.Series(r2, index=categorical)
m1.sort_values(ascending=False)

LICENCE_AREA        0.133390
Vessel ID Number    0.127335
GEAR_CDE            0.106127
GEAR_NME            0.106124
SPECIES             0.033354
MGMT_AREA           0.029242
SA_ORIGIN           0.024671
STORAGE_TYPE        0.014195
SUB_AREA            0.013440
dtype: float64

### Feature Selection on Numerical Variables
##### The procedure is exactly the same, but it requires one additional first step: dividing each continuous variable into bins.

In [None]:
# Train/test split for the numerical features.
# As before, the target column is kept inside X as well as being y.
numerical_with_target = data[
    ['KG_PIECE', 'Nominal_PRICE_PIECE', 'month', 'day', 'year', 'Nominal_Value']
]
target_values = data['Nominal_Value']

X_train, X_test, y_train, y_test = train_test_split(
    numerical_with_target, target_values, test_size=0.3, random_state=0)

X_train.shape, X_test.shape

((396717, 6), (170022, 6))

In [None]:
# Quick look at the first target values of the test split.
y_test.head(2)

253449      80.525105
22329     2259.977324
Name: Nominal_Value, dtype: float64

### Bin each variable

In [None]:
# Discretise every numerical variable into (at most) five equal-frequency bins.
# Bin edges are learned on the training set and then applied to the test set.
numerical_vars = ['KG_PIECE', 'Nominal_PRICE_PIECE', 'month', 'day', 'year']

# Work on explicit copies so that adding columns never writes into a slice of `data`.
X_train = X_train.copy()
X_test = X_test.copy()

for var in numerical_vars:
    X_train[var + '_binned'], intervals = pd.qcut(
        X_train[var],
        q=5,
        labels=False,
        retbins=True,
        precision=3,
        duplicates='drop',
    )
    # pd.cut uses right-closed intervals and excludes the lowest edge by default,
    # so a test value equal to the training minimum (e.g. day == 1) would become
    # NaN, as would anything outside the training range. Opening the outer edges
    # to +/- infinity sends such values to the first / last bin instead.
    edges = np.array(intervals, dtype=float)
    edges[0], edges[-1] = -np.inf, np.inf
    X_test[var + '_binned'] = pd.cut(x=X_test[var], bins=edges, labels=False)


In [None]:
# How many distinct bins did each variable end up with in the training set?
vars = ['KG_PIECE_binned', 'Nominal_PRICE_PIECE_binned', 'month_binned', 'day_binned', 'year_binned']
train_bin_counts = X_train[vars].nunique()
for var in vars:
    print(var, train_bin_counts[var])

KG_PIECE_binned 5
Nominal_PRICE_PIECE_binned 5
month_binned 3
day_binned 5
year_binned 5


In [None]:
# How many distinct bins does each variable have in the test set?
vars = ['KG_PIECE_binned', 'Nominal_PRICE_PIECE_binned', 'month_binned', 'day_binned', 'year_binned']
test_bin_counts = X_test[vars].nunique()
for var in vars:
    print(var, test_bin_counts[var])

KG_PIECE_binned 5
Nominal_PRICE_PIECE_binned 5
month_binned 3
day_binned 5
year_binned 5


In [None]:
# Check the fraction of NAs per column in both the train and the test set.
X_train.isnull().mean(), X_test.isnull().mean()

(KG_PIECE                      0.0
 Nominal_PRICE_PIECE           0.0
 month                         0.0
 day                           0.0
 year                          0.0
 KG_PIECE_binned               0.0
 Nominal_PRICE_PIECE_binned    0.0
 month_binned                  0.0
 day_binned                    0.0
 year_binned                   0.0
 dtype: float64, KG_PIECE                      0.000000
 Nominal_PRICE_PIECE           0.000000
 month                         0.000000
 day                           0.000000
 year                          0.000000
 KG_PIECE_binned               0.000000
 Nominal_PRICE_PIECE_binned    0.000006
 month_binned                  0.000829
 day_binned                    0.031843
 year_binned                   0.095464
 dtype: float64)

In [None]:
# Any missing values in the binned test columns mark original values that fell
# outside the intervals determined on the train set.
# To keep this notebook quick, they are simply replaced by 0, i.e. assigned to the lowest bin.
# NOTE(review): that is the wrong bin for values above the highest train edge.

X_test = X_test.fillna(0)

### Replace bins with target means for training and test sets

In [None]:
# Reuse mean_encoding: every bin is replaced by the mean target value that the
# bin has in the training set.

vars = ['KG_PIECE_binned', 'Nominal_PRICE_PIECE_binned', 'month_binned', 'day_binned', 'year_binned']
binned_with_target = vars + ['Nominal_Value']

X_train_enc, X_test_enc = mean_encoding(
    X_train[binned_with_target],
    X_test[binned_with_target],
    vars,
)

X_train_enc.head()

Unnamed: 0,KG_PIECE_binned,Nominal_PRICE_PIECE_binned,month_binned,day_binned,year_binned
206832,968.5276,691.520291,531.977186,482.046211,592.716712
317348,269.708702,287.180069,362.227775,456.607001,130.309432
21721,317.638338,179.922214,362.227775,463.586073,399.853761
431286,317.638338,179.922214,531.977186,486.78706,592.716712
386530,269.708702,287.180069,362.227775,456.607001,399.853761


### Determine R2 using encoded values

In [None]:
# Score each binned numerical feature on its own: the encoded test values act
# as predictions of the target and are compared with y_test via R2.
vars = ['KG_PIECE_binned', 'Nominal_PRICE_PIECE_binned', 'month_binned', 'day_binned', 'year_binned']

r2a = [r2_score(y_test, X_test_enc[feature]) for feature in vars]

In [None]:
# Sanity check: the target and the encoded test features must have the same number of rows.
y_test.shape, X_test_enc.shape

((170022,), (170022, 5))

In [None]:
# First three target values of the test split.
# NOTE(review): the output saved with this cell is a multi-column table, which a
# Series cannot produce; it is presumably stale from an earlier run. Re-run to refresh.
y_test.head(3)

Unnamed: 0,KG_PIECE,Nominal_PRICE_PIECE,Nominal_Value,month,day,year
253449,3.098337,17.676243,80.525105,8,26,2011
22329,2.645488,14.580499,2259.977324,8,16,2018
441577,4.191724,14.962759,7.481379,10,18,2018


In [None]:
# Put the scores in a Series labelled by feature and show the best features first.
m1 = pd.Series(r2a, index=vars)
m1.sort_values(ascending=False)

Nominal_PRICE_PIECE_binned    0.016895
KG_PIECE_binned               0.012437
year_binned                   0.004489
month_binned                  0.004005
day_binned                    0.000407
dtype: float64

In [None]:
# Preview of the mean-encoded binned test features.
X_test_enc.head(3)

Unnamed: 0,KG_PIECE_binned,Nominal_PRICE_PIECE_binned,month_binned,day_binned,year_binned
253449,269.708702,691.520291,531.977186,456.607001,592.716712
22329,968.5276,691.520291,531.977186,598.77663,399.853761
441577,269.708702,691.520291,767.681567,598.77663,399.853761
