In [176]:
# https://www.kaggle.com/lavanyashukla01/how-i-made-top-0-3-on-a-kaggle-competition
# Essentials
import pandas as pd
import numpy as np
import time
import sys
import math

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

# Models
from sklearn.preprocessing import LabelEncoder
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import train_test_split

# Ignorar ciertos warnings
import warnings
warnings.filterwarnings(action="ignore")
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000


from xgboost import XGBClassifier
from sklearn.svm import SVC

from sklearn import decomposition, datasets
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

import math
from scipy.spatial import distance
from scipy.stats import ks_2samp
from scipy import spatial

In [177]:
import sklearn
sklearn.__version__

'0.22.1'

# Tarea de Minería

## Importing Data

In [178]:
# Data para entrenar el algoritmo de regresión
dataPesca = pd.read_csv("calaPesca.csv", sep=";")

dataPesca = dataPesca[["LONGI","LATIT","Salinidad","TC","Clorofila (mg/m3)","TSM (ºC)","Captura"]] #filtra solo lo que es igual al data frame "df"

dataPesca.columns = ["LONGI","LATIT","Salinidad","TC", "Clorofila","TSM","Captura" ]
dataPesca.head(3)

Unnamed: 0,LONGI,LATIT,Salinidad,TC,Clorofila,TSM,Captura
0,-78.12,-11.17,35.06,63,0.569,19.1,100
1,-78.18,-11.23,35.06,64,0.495,19.2,60
2,-78.38,-10.7,35.07,65,16.218,19.4,80


In [179]:
# Bucketización de la variable continua de pesca
dataPesca["CapturaBucketized"] = 999
dataPesca.loc[dataPesca.Captura <= 100, "CapturaBucketized"] = 1
dataPesca.loc[dataPesca.Captura > 100, "CapturaBucketized"] = 0

In [180]:
dataPesca.CapturaBucketized.value_counts()

1    6899
0    2630
Name: CapturaBucketized, dtype: int64

In [181]:
# Elimina outliers
dataPesca.drop(dataPesca[dataPesca['LONGI']>-40].index, inplace=True)
dataPesca.drop(dataPesca[dataPesca['TC']>85].index, inplace=True)
dataPesca.dropna(inplace=True)

## Modelamiento

In [182]:
#dataPesca = dataPesca.drop("CapturaBucketized", axis=1)

# Sacamos independientes y dependiente
Y = dataPesca["CapturaBucketized"]
X = dataPesca.drop("CapturaBucketized", axis=1)
X = X[['Salinidad', "TC", "Clorofila", "TSM"]]

# Dividimos en train, test en un 70 - 30
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state=42)

In [183]:
X.describe()

Unnamed: 0,Salinidad,TC,Clorofila,TSM
count,9527.0,9527.0,9527.0,9527.0
mean,33.823038,57.97145,3.125849,17.854298
std,6.477817,15.811708,5.040272,3.676209
min,0.0,0.0,0.0,0.0
25%,35.01,51.0,0.275,17.3
50%,35.06,61.0,1.396,18.4
75%,35.11,69.0,3.802,19.5
max,35.38,75.0,56.234,23.9


In [184]:
def cv_acc(model, X=X_train, kf = 3):
    acc = np.sqrt(-cross_val_score(model, X_train, y_train_enc, scoring="accuracy", cv=kf))
    return (acc)

In [185]:
kf = KFold(n_splits=3, shuffle=True)

In [186]:
std_slc = StandardScaler()
pca = decomposition.PCA()
dec_tree = tree.DecisionTreeClassifier()

In [187]:
dec_tree

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

### Tuneo de hiperparámetros

In [188]:
pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('dec_tree', dec_tree)])

#n_components = list(range(1,X_train.shape[1]+1,1))

In [189]:
params = {}#dict(pca__n_components=n_components)

In [190]:
params['dec_tree__criterion'] = ['gini', 'entropy']
params['dec_tree__splitter'] = ['best','random']
params['dec_tree__min_samples_split'] = [0.01, 0.05, 0.1, 0.15, 0.16, 0.18, 0.2]
params["dec_tree__max_features"] = ["auto", "sqrt", "log2"] 
params['dec_tree__max_depth'] = [2,4,6,8,9,10,11,12,20,30,40,50]

In [191]:
params

{'dec_tree__criterion': ['gini', 'entropy'],
 'dec_tree__splitter': ['best', 'random'],
 'dec_tree__min_samples_split': [0.01, 0.05, 0.1, 0.15, 0.16, 0.18, 0.2],
 'dec_tree__max_features': ['auto', 'sqrt', 'log2'],
 'dec_tree__max_depth': [2, 4, 6, 8, 9, 10, 11, 12, 20, 30, 40, 50]}

In [192]:
decisionTree = GridSearchCV(pipe, param_grid = params, cv = kf, scoring='accuracy') 
decisionTree.fit(X_train, Y_train)
decisionTree.best_params_

{'dec_tree__criterion': 'entropy',
 'dec_tree__max_depth': 20,
 'dec_tree__max_features': 'auto',
 'dec_tree__min_samples_split': 0.05,
 'dec_tree__splitter': 'random'}

In [193]:
decisionTree.best_score_

0.7215090604876951

### Set up the models

In [194]:
std_slc = StandardScaler()
dec_tree = tree.DecisionTreeClassifier(criterion='gini',max_depth=11, max_features = 'auto', min_samples_split = 0.01, splitter = 'random')

In [195]:
pipe = Pipeline(steps=[('std_slc', std_slc),
                       ('dec_tree', dec_tree)])

In [196]:
pipe.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('std_slc',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('dec_tree',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=11,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1,
                                        min_samples_split=0.01,
                                        min_weight_fraction_leaf=0.0,
                                        presort='deprecated', random_state=None,
                                        splitter='random'))],
         verbose=False)

### Verifica los resultados del CV

In [197]:
from sklearn.metrics import accuracy_score
prd = pipe.predict(X_train)
accuracy_score(Y_train, prd)

0.7315536892621476

In [198]:
prd_test = pipe.predict(X_test)
accuracy_score(Y_test, prd_test)

0.7282266526757608

1    6898
0    2629

In [199]:
X_standarized = std_slc.transform(X)

In [200]:
Y.value_counts()

1    6898
0    2629
Name: CapturaBucketized, dtype: int64

In [201]:
one_distribution = X_standarized[Y[Y == 1].index]
one_distribution

array([[ 0.19101261,  0.31821189, -0.50654666,  0.33907689],
       [ 0.19101261,  0.3816374 , -0.52113881,  0.36625096],
       [ 0.192556  ,  0.44506291,  2.57929858,  0.42059912],
       ...,
       [ 0.19718618,  0.50848841,  1.8924816 ,  0.06733612],
       [ 0.18483903,  0.88904145, -0.48071461,  0.04016205],
       [ 0.19718618,  0.50848841,  1.8924816 ,  0.06733612]])

In [202]:
zero_distribution = X_standarized[Y[Y == 0].index[:-1]]
print(zero_distribution)


[[ 0.19409939  0.3816374   0.30351483  0.36625096]
 [ 0.19564279  0.3816374   0.9840217   0.39342504]
 [ 0.19564279  0.3816374   0.9840217   0.39342504]
 ...
 [ 0.20027297  0.95246696  1.8924816   0.71951396]
 [ 0.20181637  0.95246696  2.69169757  0.6651658 ]
 [ 0.20644655  1.01589247 -0.55860514  0.93690657]]


In [203]:
dfo = pd.DataFrame(one_distribution)
dfo.columns = ["Salinidad","TC", "Clorofila","TSM"]
dfo.describe()

Unnamed: 0,Salinidad,TC,Clorofila,TSM
count,6898.0,6898.0,6898.0,6898.0
mean,-0.040169,-0.096785,-0.014185,-0.043937
std,1.095034,1.052992,0.980216,1.081239
min,-5.220127,-3.677595,-0.618748,-4.851172
25%,0.181752,-0.50632,-0.618748,-0.177231
50%,0.191013,0.191361,-0.361808,0.09451
75%,0.19873,0.571914,0.130973,0.447773
max,0.238858,1.079318,10.093664,1.643433


In [204]:
dfz = pd.DataFrame(zero_distribution)
dfz.columns = ["Salinidad","TC", "Clorofila","TSM"]
dfz.describe()

Unnamed: 0,Salinidad,TC,Clorofila,TSM
count,2628.0,2628.0,2628.0,2628.0
mean,0.105732,0.251601,0.028919,0.117393
std,0.678991,0.80494,1.028593,0.728411
min,-5.220127,-3.677595,-0.618748,-4.851172
25%,0.183296,0.06451,-0.480715,-0.122882
50%,0.191013,0.254786,-0.302651,0.176032
75%,0.200273,0.968323,0.130973,0.502121
max,0.240401,1.079318,10.470102,1.616258


## Normal

In [286]:
import numpy as np

df_results = pd.DataFrame(columns=['Params','IL_EucDistance','IL_Cosimilitud','DR_JS','DR_KS'])
df_count = pd.DataFrame(columns=['One','Zero'])
j = 0

while (j < 10):
    
    #Salinidad', "TC", "Clorofila", "TSM
    Distillation_Salinidad = []
    Distillation_TC = []
    Distillation_Clorofila = []
    Distillation_TSM = []
    
    print ("j:",j)
    #class one
    Distillation_Salinidad.extend(np.random.normal(-0.040169 , 1.095034, 6898)) #Salinidad
    Distillation_TC.extend(np.random.normal(-0.096785 , 1.052992, 6898)) #TC
    Distillation_Clorofila.extend(np.random.normal(-0.014185 , 0.980216, 6898)) #Clorofila
    Distillation_TSM.extend(np.random.normal(-0.043937 , 1.081239, 6898)) #TSM

    #class zero
    Distillation_Salinidad.extend(np.random.normal( 0.105732, 0.678991, 2629)) #Salinidad
    Distillation_TC.extend(np.random.normal( 0.251601 , 0.804940, 2629)) #TC
    Distillation_Clorofila.extend(np.random.normal(0.028919  , 1.028593, 2629)) #Clorofila
    Distillation_TSM.extend(np.random.normal( 0.117393 , 0.728411 , 2629)) #TSM

    df_new = pd.DataFrame()
    df_new["Distillation_Salinidad"] = Distillation_Salinidad
    df_new["Distillation_TC"] = Distillation_TC
    df_new["Distillation_Clorofila"] = Distillation_Clorofila
    df_new["Distillation_TSM"] = Distillation_TSM

    df_new["PescaBucketizedNoisy"] = dec_tree.predict(df_new)

    #print (df_new.PescaBucketizedNoisy.value_counts())

    df_count = df_count.append({'One':int(df_new.PescaBucketizedNoisy.value_counts()[1]), 'Zero':int(df_new.PescaBucketizedNoisy.value_counts()[0])}, ignore_index=True)

    df_new["SALI"] = dataPesca["Salinidad"]
    df_new["TCL"] = dataPesca["TC"]
    df_new["CLO"] = dataPesca["Clorofila"]
    df_new["TSM"] = dataPesca["TSM"]

    import math

    # Get the Euclidean distance between vectors of real feature vs private vectors
    df_new["SquaredDifference"] = (df_new.Distillation_Salinidad - df_new.SALI)**2 + (df_new.Distillation_TC - df_new.TCL)**2 + (df_new.Distillation_Clorofila - df_new.CLO)**2 + (df_new.Distillation_TSM - df_new.TSM)**2
    df_new['EuclideanDistance'] = np.sqrt(df_new[['SquaredDifference']].sum(axis=1))

    # Cosimilitud
    r = []
    aux = 0
    for i in range(df_new.shape[0]):
        aux = spatial.distance.cosine(df_new.loc[i,["SALI","TCL","CLO","TSM"]], df_new.loc[i,["Distillation_Salinidad","Distillation_TC","Distillation_Clorofila","Distillation_TSM"]])
        if (math.isnan(aux)):
            aux = spatial.distance.cosine(df_new.loc[i,["SALI","TCL","CLO","TSM"]]+0.00000001, df_new.loc[i,["Distillation_Salinidad","Distillation_TC","Distillation_Clorofila","Distillation_TSM"]])
            if (math.isnan(aux)):
                aux = spatial.distance.cosine([0.00000001,0.00000001,0.00000001,0.00000001], df_new.loc[i,["Distillation_Salinidad","Distillation_TC","Distillation_Clorofila","Distillation_TSM"]])
                #print (df_new.loc[i,["SALI","TCL","CLO","TSM"]]+0.00000001)
                #print (df_new.loc[i,["Distillation_Salinidad","Distillation_TC","Distillation_Clorofila","Distillation_TSM"]])
                #print (aux)
        r.append(aux)

    # IL_EucDistance:
    IL_EucDistance = sum(df_new.EuclideanDistance)

    # IL_Cosimilitud:
    IL_Cosimilitud = sum(r)

    # DR Jensen Shannon: (1 - sum(abs(P_{verdadera}-V_{calculada})))/n 
    DR_JS = (1 - distance.jensenshannon(Y, df_new.PescaBucketizedNoisy))

    # DR Kolmogorov Smirnov
    # DR1: (1 - sum(P_{verdadera}-V_{calculada}))/n 
    DR_KS = (1 - ks_2samp(Y, df_new.PescaBucketizedNoisy)[0])

    # Resultados
    d = {'Params':["Distillation"],'IL_EucDistance': [IL_EucDistance], 'IL_Cosimilitud': [IL_Cosimilitud], 'DR_JS': [DR_JS], 'DR_KS':[DR_KS]}
    d = pd.DataFrame(data=d)

    df_results = df_results.append(d, ignore_index=True, sort=False)
    j += 1

j: 0
j: 1
j: 2
j: 3
j: 4
j: 5
j: 6
j: 7
j: 8
j: 9


In [289]:
df_count.describe()

Unnamed: 0,One,Zero
count,10,10
unique,10,10
top,8751,815
freq,1,1


In [288]:
df_results.describe()

Unnamed: 0,IL_EucDistance,IL_Cosimilitud,DR_JS,DR_KS
count,10.0,10.0,10.0,10.0
mean,667415.666317,9424.0539,0.638612,0.809793
std,72.009082,39.998243,0.002314,0.002796
min,667299.731514,9346.079306,0.636307,0.8055
25%,667384.272277,9401.868565,0.636791,0.808623
50%,667412.207468,9421.905254,0.637964,0.809541
75%,667473.311148,9462.677674,0.640357,0.811168
max,667509.741515,9467.555903,0.642955,0.814212


In [291]:
df_results.to_csv(r'normal_gini.csv', index = False)
df_count.to_csv(r'normal_gini_count.csv', index = False)


## Uniform

In [292]:
import numpy as np

df_results = pd.DataFrame(columns=['Params','IL_EucDistance','IL_Cosimilitud','DR_JS','DR_KS'])
df_count = pd.DataFrame(columns=['One','Zero'])
j = 0

while (j < 10):
    
    #Salinidad', "TC", "Clorofila", "TSM
    Distillation_Salinidad = []
    Distillation_TC = []
    Distillation_Clorofila = []
    Distillation_TSM = []
    
    print ("j:",j)
    #class one
    Distillation_Salinidad.extend(np.random.uniform(-5.220127 , 0.238858, 6898)) #Salinidad
    Distillation_TC.extend(np.random.uniform(-3.677595 , 1.079318, 6898)) #TC
    Distillation_Clorofila.extend(np.random.uniform(-0.618748 , 10.093664, 6898)) #Clorofila
    Distillation_TSM.extend(np.random.uniform(-4.851172 , 1.643433, 6898)) #TSM

    #class zero
    Distillation_Salinidad.extend(np.random.uniform(-5.220127 , 0.240401, 2629)) #Salinidad
    Distillation_TC.extend(np.random.uniform(-3.677595 , 1.079318, 2629)) #TC
    Distillation_Clorofila.extend(np.random.uniform(-0.618748 , 10.470102, 2629)) #Clorofila
    Distillation_TSM.extend(np.random.uniform(-4.851172 , 1.616258, 2629)) #TSM

    df_new = pd.DataFrame()
    df_new["Distillation_Salinidad"] = Distillation_Salinidad
    df_new["Distillation_TC"] = Distillation_TC
    df_new["Distillation_Clorofila"] = Distillation_Clorofila
    df_new["Distillation_TSM"] = Distillation_TSM

    df_new["PescaBucketizedNoisy"] = dec_tree.predict(df_new)

    #print (df_new.PescaBucketizedNoisy.value_counts())

    df_count = df_count.append({'One':int(df_new.PescaBucketizedNoisy.value_counts()[1]), 'Zero':int(df_new.PescaBucketizedNoisy.value_counts()[0])}, ignore_index=True)

    df_new["SALI"] = dataPesca["Salinidad"]
    df_new["TCL"] = dataPesca["TC"]
    df_new["CLO"] = dataPesca["Clorofila"]
    df_new["TSM"] = dataPesca["TSM"]

    import math

    # Get the Euclidean distance between vectors of real feature vs private vectors
    df_new["SquaredDifference"] = (df_new.Distillation_Salinidad - df_new.SALI)**2 + (df_new.Distillation_TC - df_new.TCL)**2 + (df_new.Distillation_Clorofila - df_new.CLO)**2 + (df_new.Distillation_TSM - df_new.TSM)**2
    df_new['EuclideanDistance'] = np.sqrt(df_new[['SquaredDifference']].sum(axis=1))

    # Cosimilitud
    r = []
    aux = 0
    for i in range(df_new.shape[0]):
        aux = spatial.distance.cosine(df_new.loc[i,["SALI","TCL","CLO","TSM"]], df_new.loc[i,["Distillation_Salinidad","Distillation_TC","Distillation_Clorofila","Distillation_TSM"]])
        if (math.isnan(aux)):
            aux = spatial.distance.cosine(df_new.loc[i,["SALI","TCL","CLO","TSM"]]+0.00000001, df_new.loc[i,["Distillation_Salinidad","Distillation_TC","Distillation_Clorofila","Distillation_TSM"]])
            if (math.isnan(aux)):
                aux = spatial.distance.cosine([0.00000001,0.00000001,0.00000001,0.00000001], df_new.loc[i,["Distillation_Salinidad","Distillation_TC","Distillation_Clorofila","Distillation_TSM"]])
                #print (df_new.loc[i,["SALI","TCL","CLO","TSM"]]+0.00000001)
                #print (df_new.loc[i,["Distillation_Salinidad","Distillation_TC","Distillation_Clorofila","Distillation_TSM"]])
                #print (aux)
        r.append(aux)

    # IL_EucDistance:
    IL_EucDistance = sum(df_new.EuclideanDistance)

    # IL_Cosimilitud:
    IL_Cosimilitud = sum(r)

    # DR Jensen Shannon: (1 - sum(abs(P_{verdadera}-V_{calculada})))/n 
    DR_JS = (1 - distance.jensenshannon(Y, df_new.PescaBucketizedNoisy))

    # DR Kolmogorov Smirnov
    # DR1: (1 - sum(P_{verdadera}-V_{calculada}))/n 
    DR_KS = (1 - ks_2samp(Y, df_new.PescaBucketizedNoisy)[0])

    # Resultados
    d = {'Params':["Distillation"],'IL_EucDistance': [IL_EucDistance], 'IL_Cosimilitud': [IL_Cosimilitud], 'DR_JS': [DR_JS], 'DR_KS':[DR_KS]}
    d = pd.DataFrame(data=d)

    df_results = df_results.append(d, ignore_index=True, sort=False)
    j += 1

j: 0
j: 1
j: 2
j: 3
j: 4
j: 5
j: 6
j: 7
j: 8
j: 9


In [293]:
df_results.to_csv(r'uniform_gini.csv', index = False)
df_count.to_csv(r'uniform_gini_count.csv', index = False)


In [294]:
df_results

Unnamed: 0,Params,IL_EucDistance,IL_Cosimilitud,DR_JS,DR_KS
0,Distillation,694749.704119,13291.121611,0.666995,0.738743
1,Distillation,694400.696853,13240.463828,0.666931,0.741052
2,Distillation,694614.329946,13266.947582,0.666247,0.740422
3,Distillation,694276.786481,13249.673618,0.66658,0.738952
4,Distillation,694659.808371,13233.086162,0.667646,0.741472
5,Distillation,694359.829226,13225.866749,0.665043,0.740317
6,Distillation,694357.944009,13256.639353,0.666732,0.740632
7,Distillation,694506.358942,13221.305247,0.66689,0.740422
8,Distillation,694746.83914,13319.539684,0.667065,0.739162
9,Distillation,694367.0754,13256.385535,0.666171,0.741891
