# Empresa Pesquera

## Librerías

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import itertools

from scipy.optimize import minimize
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection  import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import lightgbm as lgb

## Importar data

In [None]:
# Load the training data.
import warnings

# Suppress all warnings from this point on.
warnings.filterwarnings(action='ignore')

# The file is semicolon-delimited and read with the ISO-8859-1 encoding.
dataset = pd.read_csv(
    'TRAIN_MEDIAN.csv',
    sep=";",
    encoding="ISO-8859-1",
)
# Display the (rows, columns) shape of the loaded frame.
dataset.shape

In [None]:
# Preview the first rows of the loaded data.
dataset.head()

In [None]:
# Distinct values present in the 'planta' column.
dataset["planta"].unique()

In [None]:
# List the column labels of the loaded data.
dataset.columns

In [None]:
# Number of distinct entries in the 'ruma' column (a missing value, if any,
# counts once because drop_duplicates keeps its first occurrence).
dataset["ruma"].drop_duplicates().shape[0]

In [None]:
# Row-wise summary statistics over the measurement columns.
#
# The measurement frame is captured once, before any statistic is written
# back, so every aggregate is computed from the same original columns.
# Re-deriving it from `dataset` for each statistic would fold the aggregates
# already added ("suma", then "mediana", then "media") into the later ones.
# The second drop also removes aggregates left over from a previous run of
# this cell, so re-executing it gives the same result.
_measurements = (
    dataset
    .drop(['planta', 'ruma', 'proteina', 'tvn'], axis=1)
    .drop(['suma', 'mediana', 'media', 'std'], axis=1, errors='ignore')
)
dataset["suma"] = _measurements.sum(axis=1)
dataset["mediana"] = _measurements.median(axis=1)
dataset["media"] = _measurements.mean(axis=1)
dataset["std"] = _measurements.std(axis=1)
dataset.head()

## Preprocessing

In [None]:
# Select the training variables: every column except the target and the
# explicitly excluded fields.
target = 'proteina'
exclude = ['tvn','ruma']
cols = [name for name in dataset.columns if name != target and name not in exclude]

# Columns held with the object dtype are the ones treated as categorical.
cols_cat = list(dataset[cols].select_dtypes(include=['object']).columns)

# Positions of those categorical columns within `cols`.
index_categorical = [pos for pos, name in enumerate(cols) if name in cols_cat]
print(index_categorical,cols_cat)

In [None]:
# Label-encode every categorical column in place. Missing values are skipped
# by the encoder and stay missing in the result.
for i in cols_cat:
    present = dataset[i].notna()
    le = preprocessing.LabelEncoder()
    dataset.loc[present, i] = le.fit_transform(dataset.loc[present, i])
    # Writing through .loc can leave the column with the object dtype even
    # though it now holds integer codes. Convert explicitly so the column is
    # numeric (integers, or floats when missing values remain) for the
    # numeric-only consumers further down.
    dataset[i] = pd.to_numeric(dataset[i])

dataset[cols].head()

## Dataset X and Y

In [None]:
# Both targets use the same predictors: every column except the two targets.
# Each target is kept as a single-column DataFrame.
X_p = dataset.drop(columns=['proteina', 'tvn'])
y_p = dataset[['proteina']]
X_t = dataset.drop(columns=['proteina', 'tvn'])
y_t = dataset[['tvn']]

In [None]:
# Preview the first rows of the predictors built for the 'proteina' target.
X_p.head()

## Análisis de Componentes Principales (PCA)

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
%matplotlib inline

In [None]:
# Take the training columns without 'planta', scale them, and fit a
# 20-component PCA on the scaled values.
comp = X_p[cols].drop(columns=['planta'])
X = scale(comp)

pca = PCA(n_components=20).fit(X)
# Display the fitted estimator.
pca

In [None]:
# Share of variance explained by each component, and its running total
# expressed in percent (ratios rounded to 4 decimals before accumulation).
var = pca.explained_variance_ratio_
var1 = np.cumsum(np.round(var, decimals=4) * 100)
print(var1)

In [None]:
# Plot the cumulative explained-variance percentages held in var1.
plt.plot(var1)

In [None]:
# Training columns with 'planta' left out; this is the PCA input.
comp = X_p[cols].drop(columns=['planta'])

In [None]:
# Show the first row of the PCA input frame.
comp.head(1)

In [None]:
# Scale the predictors and project them onto their principal components.
# NOTE(review): scaling and PCA are fit on all rows, before the train/test
# split performed later, so held-out rows influence the projection.
X = scale(comp)

# PCA cannot produce more components than min(n_samples, n_features), so the
# requested 250 is capped at that bound instead of raising on smaller data.
pca = PCA(n_components=min(250, *X.shape))
# fit_transform fits the model and projects X in a single call; a separate
# fit() beforehand would only repeat the decomposition.
X1 = pca.fit_transform(X)

X1.shape

In [None]:
# Wrap the component scores in a DataFrame. Reusing comp's index keeps each
# score row labelled like its source row, so index-aligned operations against
# the original data (such as a column-wise concat) pair the right rows even
# when that index is not the default 0..n-1 range.
componentes = pd.DataFrame(data=X1, index=comp.index)
componentes.head()

In [None]:
# Show the first row of the full dataset.
dataset.head(1)

In [None]:
# Place the 'planta', 'ruma' and 'proteina' columns beside the PCA scores;
# rows are matched by index label.
dataset_p = pd.concat(
    objs=[componentes, dataset[['planta', 'ruma', 'proteina']]],
    axis="columns",
)
dataset_p

## Para proteina

In [None]:
#X_p.drop(['ruma'], axis = 1)
#y_p

### NO usamos PCA

In [None]:
# 80/20 train/test split on the original predictors (without 'ruma'),
# with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_p.drop(columns=['ruma']),
    y_p,
    test_size=0.20,
    random_state=1416,
)

### Sí usamos PCA

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split on the PCA frame: predictors are every column except
# 'ruma' and the target; the target is the 'proteina' column. Same fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_p.drop(columns=['ruma', 'proteina']),
    dataset_p['proteina'],
    test_size=0.20,
    random_state=1416,
)



In [None]:
# Show the first row of the training predictors.
X_train.head(1)

In [None]:
# Display the (rows, columns) shape of the training predictors.
X_train.shape

In [None]:
# Display the list of categorical column names.
cols_cat

### Entrenamiento de un modelo

In [None]:

# Wrap the train/test splits as LightGBM datasets. The evaluation set
# references the training set so both are built consistently.
lgb_train = lgb.Dataset(X_train, y_train.values.ravel())
lgb_eval = lgb.Dataset(X_test, y_test.values.ravel(), reference=lgb_train)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'mae'},
    'learning_rate': 0.01,
    'max_depth': 3,
    'reg_sqrt': True,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.8,
    # LightGBM only performs bagging when bagging_freq is non-zero; without
    # it, bagging_fraction and bagging_seed are silently ignored.
    'bagging_freq': 1,
    'bagging_seed': 5,
    'verbose': 10,
}
n_estimators = 1000

# Train for up to n_estimators rounds, logging every 10 rounds and stopping
# once the validation metric has not improved for 50 rounds.
# NOTE(review): early_stopping_rounds and verbose_eval are keyword arguments
# of lgb.train only in LightGBM releases before 4.0; newer releases expect
# the equivalent callbacks instead. Confirm the installed version.
lgbm3 = lgb.train(
    params,
    lgb_train,
    n_estimators,
    early_stopping_rounds=50,
    valid_sets=[lgb_train, lgb_eval],
    categorical_feature=cols_cat,
    verbose_eval=10,
)
