# Bagged K-fold training of a small Keras MLP over 3 classes;
# averaged out-of-fold and test predictions are written to ../keras_L2.csv.
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.metrics import log_loss

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, PReLU, BatchNormalization
from keras.callbacks import ModelCheckpoint, EarlyStopping

nfold = 5   # folds per bag
nbag = 10   # bagging rounds, each with a differently seeded KFold


def nn_model4(input_dim):
    """Two hidden layers with PReLU, batch norm, and dropout; softmax over 3 classes."""
    model = Sequential()
    model.add(Dense(100, input_dim=input_dim, kernel_initializer='uniform'))  # 500
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.3))  # .2
    model.add(Dense(100, kernel_initializer='uniform'))  # 400
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.3))  # .2
    model.add(Dense(3, kernel_initializer='zeros'))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model


test_df2 = pd.read_csv("../data.csv").values    # NumPy array, ~40K rows x 75 columns
train_df2 = pd.read_csv("../data2.csv").values  # NumPy array, ~70K rows x 75 columns
train_y = pd.read_csv("../data.csv").values     # NumPy array, one column of class labels 0, 1, 2

cv_scores = []
oob_valpred = np.zeros((train_df2.shape[0], 3))  # out-of-fold predictions on the train set
oob_tstpred = np.zeros((test_df2.shape[0], 3))   # accumulated predictions on the test set
train_y2 = to_categorical(train_y, 3)            # one-hot encode the labels

for x in np.arange(nbag):
    kf = model_selection.KFold(n_splits=nfold, shuffle=True, random_state=12345 * x)
    for dev_index, val_index in kf.split(range(train_y.shape[0])):
        train_X2, val_X2 = train_df2[dev_index, :], train_df2[val_index, :]
        dev_y, val_y = train_y2[dev_index], train_y2[val_index]
        test_X2 = test_df2.copy()
        print(train_X2.shape)

        model = nn_model4(train_X2.shape[1])
        # Train until val_loss stops improving; checkpoint the best epoch to disk.
        early_stopping = EarlyStopping(monitor='val_loss', patience=50, verbose=1, mode='auto')
        checkpointer = ModelCheckpoint(filepath="./weights2XXLK.hdf5", verbose=1, save_best_only=True)
        model.fit(train_X2, dev_y, epochs=10000,
                  validation_data=(val_X2, val_y), verbose=1,
                  callbacks=[early_stopping, checkpointer])

        print("loading weights")
        model.load_weights("./weights2XXLK.hdf5")  # restore the best checkpoint
        print("predicting..")
        preds = model.predict(val_X2)
        oob_valpred[val_index, :] += preds
        cv_scores.append(log_loss(val_y, preds))
        print(cv_scores)
        print(np.mean(cv_scores))
        print(np.std(cv_scores))
        predtst = model.predict(test_X2)
        oob_tstpred += predtst

# Each train row is predicted once per bag; each test row nfold times per bag.
oob_valpred /= nbag
oob_tstpred /= (nfold * nbag)

out_df = pd.DataFrame(oob_tstpred)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df_listing_id  # listing ids for the test rows; must be loaded beforehand
out_df.to_csv("../keras_L2.csv", index=False)