In [1]:
# https://www.lfd.uci.edu/~gohlke/pythonlibs/
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor as RFR

# foucus on PM2.5

fs = pd.read_csv('E:/working/DataMining/fangshan.csv')
fs = fs.drop(['weather', 'station_id', 'longitude', 'latitude', 'PM10', 'NO2', 'CO', 'O3', 'SO2'], axis=1)

def feature_vector(df, feature, N):
    rows = df.shape[0]
    column_n = [None] * N + [df[feature][i - N] for i in range(N, rows)]
    column_name = "{}_{}".format(feature, N)
    df[column_name] = column_n

fs = fs.drop(['utc_time',], axis=1)

for feature in ['PM2.5', 'temperature', 'pressure', 'humidity', 'wind_direction', 'wind_speed']:
    for N in range(1, 4):
        feature_vector(fs, feature, N)
        
fs = fs.drop(['temperature', 'pressure', 'humidity', 'wind_direction', 'wind_speed'], axis=1)
fs = fs[3:].reset_index(drop=True)

fs.head()

Unnamed: 0,PM2.5,PM2.5_1,PM2.5_2,PM2.5_3,temperature_1,temperature_2,temperature_3,pressure_1,pressure_2,pressure_3,humidity_1,humidity_2,humidity_3,wind_direction_1,wind_direction_2,wind_direction_3,wind_speed_1,wind_speed_2,wind_speed_3
0,118.0,117.0,116.0,114.0,-2.8,-2.7,-2.6,1025.6,1025.8,1026.1,17.0,15.0,14.0,228.0,217.0,215.0,3.2,3.5,3.3
1,118.0,118.0,117.0,116.0,-2.8,-2.8,-2.7,1025.0,1025.6,1025.8,18.0,17.0,15.0,228.0,228.0,217.0,1.8,3.2,3.5
2,106.0,118.0,118.0,117.0,-3.1,-2.8,-2.8,1024.1,1025.0,1025.6,17.0,18.0,17.0,209.0,228.0,228.0,2.4,1.8,3.2
3,110.0,106.0,118.0,118.0,-3.9,-3.1,-2.8,1023.9,1024.1,1025.0,19.0,17.0,18.0,216.0,209.0,228.0,1.7,2.4,1.8
4,122.0,110.0,106.0,118.0,-4.7,-3.9,-3.1,1023.2,1023.9,1024.1,23.0,19.0,17.0,233.0,216.0,209.0,0.4,1.7,2.4


In [3]:
def score(estimator, X, y):
    y_prediction = estimator.predict(X)
    return np.sum(np.abs(y_prediction - y) / (np.abs(y_prediction) + np.abs(y))) / y.shape[0] 

from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV

features = ['PM2.5_1', 'PM2.5_2', 'PM2.5_3',
              'temperature_1', 'temperature_2', 'temperature_3',
              'pressure_1', 'pressure_2', 'pressure_3',
              'humidity_1','humidity_2', 'humidity_3',
              'wind_direction_1', 'wind_direction_2', 'wind_direction_3',
              'wind_speed_1', 'wind_speed_2', 'wind_speed_3']

y = np.array(fs['PM2.5'])
X = np.array(fs[features])

RF_param={'n_estimators':[15, 20, 25], 'max_features':['auto', 'sqrt', 'log2', None],'min_samples_split':[2,4,6]}
grid = GridSearchCV( estimator = RFR(n_jobs=4), param_grid = RF_param, scoring = score, cv=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
grid.fit(X_train, y_train)

print(grid.best_score_)
print(score(grid, X_test, y_test))

0.118269739414
0.123622977488


In [5]:
features = ['PM2.5_1', 'PM2.5_2', 'PM2.5_3']
X = np.array(fs[features])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
grid.fit(X_train, y_train)

print(grid.best_score_)
print(score(grid, X_test, y_test))

0.125701378078
0.13527708128


In [6]:
from xgboost import XGBRegressor as XGBR
XGBR_param={'max_depth':[2,3,4], 'learning_rate':[0.05, 0.1, 0.15], 'n_estimators':[100, 150, 200], 
            'reg_alpha':[0, 2, 4], 'reg_lambda':[1, 3, 5]}
grid = GridSearchCV( estimator = XGBR(n_jobs=8, objective='reg:gamma'), param_grid = XGBR_param, scoring = score, cv=5)
grid.fit(X_train, y_train)

print(grid.best_score_)
print(score(grid, X_test, y_test))

0.26840021065
0.271548520875
