In [1]:
import pandas as pd
import numpy as np
import kaggle
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from tqdm import tqdm

import warnings
warnings.simplefilter('ignore')



# 1. Identificación del Problema

Predecir la radiación solar basandose en la velocidad y dirección del viento, humedad y temperatura con  4 meses de datos.

# 2. Búsqueda de datos

Los datos se entregan en las bases de la competencia y mediante el API que será el medio utilizado

https://www.kaggle.com/c/datamex0120

API: kaggle competitions download -c datamex0120

In [2]:
kaggle.api.competitions_data_list_files('datamex0120')

[{'ref': 'solar_test.csv',
  'description': None,
  'name': 'solar_test.csv',
  'totalBytes': 726642,
  'url': 'https://www.kaggle.com/',
  'creationDate': '2020-02-11T22:28:05.6358912Z'},
 {'ref': 'submission_example..csv',
  'description': None,
  'name': 'submission_example..csv',
  'totalBytes': 187689,
  'url': 'https://www.kaggle.com/',
  'creationDate': '2020-02-11T22:28:05.6358912Z'},
 {'ref': 'solar_train.csv',
  'description': None,
  'name': 'solar_train.csv',
  'totalBytes': 2201144,
  'url': 'https://www.kaggle.com/',
  'creationDate': '2020-02-11T22:28:05.6358912Z'}]

In [3]:
#Descarga de Archivos
kaggle.api.competition_download_files('datamex0120')

In [4]:
#Carga de Dataframes
zf=zipfile.ZipFile('datamex0120.zip')

solar_train=pd.read_csv(zf.open('solar_train.csv'))
solar_test=pd.read_csv(zf.open('solar_test.csv'))
sub_ex=pd.read_csv(zf.open('submission_example..csv'))


In [5]:
#Carga de Dataframes
zf=zipfile.ZipFile('datamex0120.zip')

solar_train=pd.read_csv(zf.open('solar_train.csv'))
solar_test=pd.read_csv(zf.open('solar_test.csv'))
sub_ex=pd.read_csv(zf.open('submission_example..csv'))

# 3. Limpieza Dataset

## 3.1. Exploración de datos

In [6]:
solar_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24514 entries, 0 to 24513
Data columns (total 11 columns):
UNIXTime                  24514 non-null int64
Data                      24514 non-null object
Time                      24514 non-null object
Radiation                 24514 non-null float64
Temperature               24514 non-null int64
Pressure                  24514 non-null float64
Humidity                  24514 non-null int64
WindDirection(Degrees)    24514 non-null float64
Speed                     24514 non-null float64
TimeSunRise               24514 non-null object
TimeSunSet                24514 non-null object
dtypes: float64(4), int64(3), object(4)
memory usage: 2.1+ MB


In [7]:
solar_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8172 entries, 0 to 8171
Data columns (total 11 columns):
id                        8172 non-null int64
UNIXTime                  8172 non-null int64
Data                      8172 non-null object
Time                      8172 non-null object
Temperature               8172 non-null int64
Pressure                  8172 non-null float64
Humidity                  8172 non-null int64
WindDirection(Degrees)    8172 non-null float64
Speed                     8172 non-null float64
TimeSunRise               8172 non-null object
TimeSunSet                8172 non-null object
dtypes: float64(3), int64(4), object(4)
memory usage: 702.4+ KB


In [8]:
sub_ex.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8172 entries, 0 to 8171
Data columns (total 2 columns):
id           8172 non-null int64
Radiation    8172 non-null float64
dtypes: float64(1), int64(1)
memory usage: 127.8 KB


In [9]:
#Orden por tiempo, eliminacion de Fecha, hora, salida y puesta del sol, ya que no son relevantes o repetitivas
#Lo anterior es basado en el "conocimiento" del problema, ya que la fecha y hora esta dada proporcionalmente
#en la variable UNIXTime, y la puesta y salida del sol es una medida en relación a la observacion desde la tierra
#lo que se considera despreciable en cuanto a lo que pueda influir en este tema.

solar_train=solar_train.sort_values(by='UNIXTime')
solar_train.drop(columns=['Data','Time','TimeSunRise','TimeSunSet'], inplace=True)
solar_train.reset_index(inplace=True)
solar_train.head()

Unnamed: 0,index,UNIXTime,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed
0,1673,1472724008,2.58,51,30.43,103,77.27,11.25
1,19219,1472724310,2.83,51,30.43,103,153.44,9.0
2,5516,1472725505,2.21,51,30.43,103,144.12,18.0
3,12019,1472725809,2.25,51,30.43,103,67.42,11.25
4,7888,1472726704,2.15,51,30.43,103,67.85,4.5


In [10]:
#Hologación de columnas con respecto al conjunto de datos "train". Se mantiene el ID para identificación
solar_test.drop(columns=['Data','Time','TimeSunRise','TimeSunSet'], inplace=True)
solar_train.head()

Unnamed: 0,index,UNIXTime,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed
0,1673,1472724008,2.58,51,30.43,103,77.27,11.25
1,19219,1472724310,2.83,51,30.43,103,153.44,9.0
2,5516,1472725505,2.21,51,30.43,103,144.12,18.0
3,12019,1472725809,2.25,51,30.43,103,67.42,11.25
4,7888,1472726704,2.15,51,30.43,103,67.85,4.5


In [11]:
#Normalizacion del tiempo
#El tiempo se normaliza usando el màximo y el mìnimo y restando el mínimo para tener un tiempo relativo de origen
tmin_train=solar_train.UNIXTime.min()
tmin_test=solar_test.UNIXTime.min()
tmin=min(tmin_train,tmin_test)
print(tmin)

1472724008


In [12]:
solar_train.UNIXTime = solar_train.UNIXTime - tmin
solar_test.UNIXTime = solar_test.UNIXTime - tmin

In [13]:
solar_test.head()

Unnamed: 0,id,UNIXTime,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed
0,0,5996099,59,30.47,44,312.67,3.37
1,1,1339495,59,30.48,83,38.01,6.75
2,2,3385213,47,30.39,78,213.62,5.62
3,3,8751048,45,30.4,98,176.63,4.5
4,4,4769109,45,30.4,34,175.89,6.75


In [14]:
solar_train.head()

Unnamed: 0,index,UNIXTime,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed
0,1673,0,2.58,51,30.43,103,77.27,11.25
1,19219,302,2.83,51,30.43,103,153.44,9.0
2,5516,1497,2.21,51,30.43,103,144.12,18.0
3,12019,1801,2.25,51,30.43,103,67.42,11.25
4,7888,2696,2.15,51,30.43,103,67.85,4.5


In [15]:
solar_train.describe()

Unnamed: 0,index,UNIXTime,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed
count,24514.0,24514.0,24514.0,24514.0,24514.0,24514.0,24514.0,24514.0
mean,12256.5,5324515.0,208.062687,51.105287,30.422713,75.002284,143.129305,6.248435
std,7076.726586,3006914.0,316.133238,6.21428,0.054754,25.999994,82.939584,3.482597
min,0.0,0.0,1.13,34.0,30.19,11.0,0.09,0.0
25%,6128.25,2808461.0,1.23,46.0,30.4,55.0,81.8725,3.37
50%,12256.5,5323360.0,2.72,50.0,30.43,85.0,147.585,5.62
75%,18384.75,7757619.0,359.9275,55.0,30.46,97.0,179.2175,7.87
max,24513.0,10540490.0,1601.26,71.0,30.56,103.0,359.95,39.37


In [16]:
#Eliminacion de Outliers

IQR_Speed=solar_train.Speed.quantile(.75)-solar_train.Speed.quantile(.25)
solar_train=solar_train[(solar_train.Speed <solar_train.Speed.mean()+
                         1.5*IQR_Speed)&(solar_train.Speed >solar_train.Speed.mean()-1.5*IQR_Speed)]
IQR_Pressure=solar_train.Pressure.quantile(.75)-solar_train.Pressure.quantile(.25)
solar_train=solar_train[(solar_train.Pressure <solar_train.Pressure.mean()+
                         1.5*IQR_Pressure)&(solar_train.Pressure >solar_train.Pressure.mean()-1.5*IQR_Pressure)]
IQR_Temperature=solar_train.Temperature.quantile(.75)-solar_train.Temperature.quantile(.25)
solar_train=solar_train[(solar_train.Temperature <solar_train.Temperature.mean()+
                         1.5*IQR_Temperature)&(solar_train.Temperature >solar_train.Temperature.mean()-1.5*IQR_Temperature)]

solar_train

Unnamed: 0,index,UNIXTime,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed
0,1673,0,2.58,51,30.43,103,77.27,11.25
1,19219,302,2.83,51,30.43,103,153.44,9.00
3,12019,1801,2.25,51,30.43,103,67.42,11.25
4,7888,2696,2.15,51,30.43,103,67.85,4.50
5,11281,2998,2.07,51,30.43,103,156.58,1.12
...,...,...,...,...,...,...,...,...
24509,1041,10539294,1.22,41,30.34,83,238.94,6.75
24510,5169,10539593,1.21,41,30.34,82,236.79,5.62
24511,8494,10539896,1.21,42,30.34,81,218.28,7.87
24512,13576,10540195,1.19,41,30.34,80,215.23,7.87


In [17]:
#Optimizando tipo de datos
solar_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21389 entries, 0 to 24513
Data columns (total 8 columns):
index                     21389 non-null int64
UNIXTime                  21389 non-null int64
Radiation                 21389 non-null float64
Temperature               21389 non-null int64
Pressure                  21389 non-null float64
Humidity                  21389 non-null int64
WindDirection(Degrees)    21389 non-null float64
Speed                     21389 non-null float64
dtypes: float64(4), int64(4)
memory usage: 1.5 MB


In [18]:
for e in solar_train.select_dtypes('integer').columns:
    solar_train[e]=pd.to_numeric(solar_train[e], downcast='integer')
for e in solar_train.select_dtypes('float').columns:
    solar_train[e]=pd.to_numeric(solar_train[e], downcast='float')
solar_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21389 entries, 0 to 24513
Data columns (total 8 columns):
index                     21389 non-null int16
UNIXTime                  21389 non-null int32
Radiation                 21389 non-null float32
Temperature               21389 non-null int8
Pressure                  21389 non-null float32
Humidity                  21389 non-null int8
WindDirection(Degrees)    21389 non-null float32
Speed                     21389 non-null float32
dtypes: float32(4), int16(1), int32(1), int8(2)
memory usage: 668.4 KB


In [19]:
#Preparando conjuntos para entrenamiento y prueba
X=solar_train.drop(columns='Radiation', axis=1)
y=solar_train.Radiation

#Conjuntos para prueba
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2)


# 4. Definición de modelo

## 4.1. Ejecución de modelos

In [21]:
#Modelo de Regresión Lineal
from sklearn.linear_model import LinearRegression as Linreg

linreg = Linreg()
linreg.fit(X_train, y_train)
y_pred=linreg.predict(X_test)
#Test
train_score=linreg.score(X_train, y_train)
test_score=linreg.score(X_test, y_test)
print (train_score, test_score)

0.5977528570658116 0.5855599720258595


In [22]:
#Modelo de Lasso
from sklearn.linear_model import Lasso

lasso=Lasso()
lasso.fit(X_train, y_train)

#Test
train_score=lasso.score(X_train, y_train)
test_score=lasso.score(X_test, y_test)
print (train_score, test_score)

0.5967235441604666 0.5834323244079197


In [26]:
#Modelo de Ridge
from sklearn.linear_model import Ridge

ridge=Ridge()
ridge.fit(X_train, y_train)
train_score=ridge.score(X_train, y_train)
test_score=ridge.score(X_test, y_test)
print (train_score, test_score)

0.5977505583158635 0.5855030501454649


In [27]:
#Modelo de ElasticNet
from sklearn.linear_model import ElasticNet

ridge=Ridge()
elastic=ElasticNet()
elastic.fit(X_train, y_train)
train_score=elastic.score(X_train, y_train)
test_score=elastic.score(X_test, y_test)
print (train_score, test_score)

0.5964752642192956 0.5835579532861668


In [28]:
#Model SVR
from sklearn.svm import SVR

svr=SVR()
svr.fit(X_train, y_train)
train_score=svr.score(X_train, y_train)
test_score=svr.score(X_test, y_test)
print (train_score, test_score)

-0.42070725439706314 -0.4018956649454939


In [29]:
#Modelo Random Forest
from sklearn.ensemble import RandomForestRegressor as RFR

rf=RFR()
rf.fit(X_train, y_train)
train_score=rf.score(X_train, y_train)
test_score=rf.score(X_test, y_test)
print (train_score, test_score)

0.9828099714437195 0.8861968033480797


In [30]:
from sklearn.model_selection import GridSearchCV
def grid(modelo, param, cv=5):
    grid=GridSearchCV(modelo, param, cv=cv, 
                      iid=True, return_train_score=True, n_jobs=-1)
    
    grid.fit(X_train, y_train)
    
    print('Acierto test: {:.2f}'.format(grid.score(X_test, y_test)))
    print('Acierto train: {:.2f}'.format(grid.score(X_train, y_train)))
    print('Mejores parametros: {}'.format(grid.best_params_))
    print('Mejor acierto cv: {:.2f}'.format(grid.best_score_))
    
    return grid.best_estimator_.fit(X_train, y_train)

In [31]:
param={'max_leaf_nodes':[5,10,20], 'n_estimators':[10,50,100,500]}
grid(rf, param)

Acierto test: 0.69
Acierto train: 0.71
Mejores parametros: {'max_leaf_nodes': 20, 'n_estimators': 100}
Mejor acierto cv: 0.70


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=20,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [32]:
#Modelo SGDR
from sklearn.linear_model import SGDRegressor as SGDR

sgdr=SGDR(max_iter=100000)
sgdr.fit(X_train, y_train)
train_score=sgdr.score(X_train, y_train)
test_score=sgdr.score(X_test, y_test)
print (train_score, test_score)

-8.73058550732199e+37 -8.791153413166697e+37


In [33]:
# GBR
from sklearn.ensemble import GradientBoostingRegressor as GBR

gbr=GBR()
gbr.fit(X_train, y_train)

train_score=gbr.score(X_train, y_train)
test_score=gbr.score(X_test, y_test)

print (train_score, test_score)

0.7778786644190365 0.7627177170709625


In [34]:
# XGBR
from xgboost import XGBRegressor as XGBR

xgbr=XGBR()
xgbr.fit(X_train, y_train)

train_score=xgbr.score(X_train, y_train)
test_score=xgbr.score(X_test, y_test)

print (train_score, test_score)

0.7775393213377497 0.7653262412596309


In [35]:
# CTR
from catboost import CatBoostRegressor as CTR

ctr=CTR(verbose=False)
ctr.fit(X_train, y_train)

train_score=ctr.score(X_train, y_train)
test_score=ctr.score(X_test, y_test)

print (train_score, test_score)

0.9127001007360173 0.8747842127784995


In [36]:
#LightGBM
from lightgbm import LGBMRegressor as LGBMR

lgbmr=LGBMR()
lgbmr.fit(X_train, y_train)

train_score=lgbmr.score(X_train, y_train)
test_score=lgbmr.score(X_test, y_test)

print (train_score, test_score)

0.8952493243115632 0.8682740780763015


## 4.2. Selección de combinación de modelos

In [37]:
#Stacking de los mejores metodos:
#Combinacion de todos los métodos con muestras pequeñas. Regresa la mejor combinación de métodos

from mlxtend.regressor import StackingRegressor

sample_solar_train = solar_train.sample(n=1000, random_state=1)

Xs=sample_solar_train.drop(columns='Radiation', axis=1)
ys=sample_solar_train.Radiation

#Conjuntos para prueba
Xs_train, Xs_test, ys_train, ys_test = train_test_split(Xs, ys, test_size=0.2)

# 5. Revisión de Funcionalidad

In [38]:
#Preparando conjuntos para entrenamiento y prueba

X=solar_train.drop(columns='Radiation', axis=1)
y=solar_train.Radiation

#Reentrenar con todos los datos
X_train = X
y_train = y

#Modelo Random Forest
rf=RFR()
rf.fit(X_train, y_train)
train_score=rf.score(X_train, y_train)
print (train_score)

0.9852836001826769


In [39]:
X_predict=solar_test

In [40]:
y_predict=rf.predict(X_predict)

In [41]:
y_predict=pd.DataFrame(y_predict)
y_predict['id']=y_predict.index

In [42]:
y_predict.columns = ['Radiation','id']

In [43]:
y_predict=y_predict[['id','Radiation']]
y_predict

Unnamed: 0,id,Radiation
0,0,700.253414
1,1,860.100998
2,2,4.201000
3,3,10.780200
4,4,1.357200
...,...,...
8167,8167,358.438701
8168,8168,1.237900
8169,8169,8.196900
8170,8170,2.797000


## Parámetros del modelo para Summition en Kaggel

In [44]:
y_predict.to_csv('submission.csv', index=False)

In [45]:
#kaggle.api.authenticate()

In [46]:
#kaggle.api.competition_submit('submission.csv','First try  - Alberto Ibarra','data0120')

#kaggle competitions submit -c datamex0120 -f submission.csv -m "Message"