In [14]:
from __future__ import print_function
import keras
import pandas as pd
import numpy as np
from keras import layers
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import matplotlib.pylab as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedShuffleSplit
from matplotlib.colors import LogNorm
from sklearn.decomposition import PCA
from keras.layers.advanced_activations import PReLU, ELU
from keras.optimizers import RMSprop
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Sequential
from keras.utils import np_utils
from copy import deepcopy
from keras import metrics
from keras.models import load_model
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder 
from keras.utils import multi_gpu_model
from keras.layers import LSTM
from sklearn.utils import shuffle as reset
############################################################
def train_test_split_DataFrame(data, test_size=0.2, shuffle=True, random_state=None):
    # Split DataFrame into random train and test subsets
        #     Parameters
        # ----------
        # data : pandas dataframe, need to split dataset.        
        # test_size : float
        #     If float, should be between 0.0 and 1.0 and represent the
        #     proportion of the dataset to include in the train split.            
        # random_state : int, RandomState instance or None, optional (default=None)
        #     If int, random_state is the seed used by the random number generator;
        #     If RandomState instance, random_state is the random number generator;
        #     If None, the random number generator is the RandomState instance used
        #     by `np.random`.            
        # shuffle : boolean, optional (default=None)
        #     Whether or not to shuffle the data before splitting. If shuffle=False
        #     then stratify must be None.
    if shuffle:
        data=reset(data, random_state=random_state)
    else:
        data=data.sort_values(by="Dates", ascending=True)
    train=data[int(len(data)*test_size):].reset_index(drop=True)
    test=data[:int(len(data)*test_size)].reset_index(drop=True)
    return train, test
#################################################################
def parse_time(x):
    #DD=datetime.strptime(x,"%Y/%m/%d %H:%M")#zj
    DD=datetime.strptime(x,"%Y-%m-%d %H:%M:%S")#jjs
    time=DD.hour#*60+DD.minute
    day=DD.day
    month=DD.month
    year=DD.year
    return time,day,month,year
def Dates2TDMY(x):
    #DD=datetime.strptime(x,"%Y/%m/%d %H:%M")#zj
    DD=datetime.strptime(x,"%Y-%m-%d %H:%M:%S")#jjs
    time=DD.hour#*60+DD.minute
    day=DD.day
    month=DD.month
    year=DD.year
    #T_D_M_Y=str(time)+str(day)+str(month)+str(year)
    T_D_M_Y=str(time)
    return T_D_M_Y
#################
def get_season(x):
    summer=0
    fall=0
    winter=0
    spring=0
    if (x in [5, 6, 7]):
        summer=1
    if (x in [8, 9, 10]):
        fall=1
    if (x in [11, 0, 1]):
        winter=1
    if (x in [2, 3, 4]):
        spring=1
    return summer, fall, winter, spring
#############################################
def field2Vec(trainDF,fieldStr):
    fields=sorted(trainDF[fieldStr].unique())
    categories=sorted(trainDF["Category"].unique())
    C_counts=trainDF.groupby(["Category"]).size()
    F_C_counts=trainDF.groupby([fieldStr,"Category"]).size()
    F_counts=trainDF.groupby([fieldStr]).size()
    logodds={}
    logoddsPF={}
    MIN_CAT_COUNTS=2
    default_logodds=np.log(C_counts/len(trainDF))-np.log(1.0-C_counts/float(len(trainDF)))
    for f in fields:
        PA=F_counts[f]/float(len(trainDF))
        logoddsPF[f]=np.log(PA)-np.log(1.-PA)
        logodds[f]=deepcopy(default_logodds)
        for cat in F_C_counts[f].keys():
            if (F_C_counts[f][cat]>MIN_CAT_COUNTS) and F_C_counts[f][cat]<F_counts[f]:
                PA=F_C_counts[f][cat]/float(F_counts[f])
                logodds[f][categories.index(cat)]=np.log(PA)-np.log(1.0-PA)
        logodds[f]=pd.Series(logodds[f])
        logodds[f].index=range(len(categories))
    return logodds,logoddsPF
####################################################
def parse_data(df,logodds_A,logoddsPF_A,logodds_T,logoddsPF_T):
    feature_list=df.columns.tolist()
    if "Descript" in feature_list:
        feature_list.remove("Descript")
    if "Resolution" in feature_list:
        feature_list.remove("Resolution")
    if "Category" in feature_list:
        feature_list.remove("Category")
    if "Id" in feature_list:
        feature_list.remove("Id")

    cleanData=df[feature_list]
    cleanData.index=range(len(df))
    print("Creating address features")###Creating address features###
    address_features=cleanData["Address"].apply(lambda x: logodds_A[x])
    address_features.columns=["logodds_A"+str(x) for x in range(len(address_features.columns))]
    print("Creating time T_D_M_Y features")###Creating time T_D_M_Y features###
    T_D_M_Y_features=cleanData["T_D_M_Y"].apply(lambda xx: logodds_T[xx])
    T_D_M_Y_features.columns=["logodds_T"+str(xx) for xx in range(len(T_D_M_Y_features.columns))]

    print("Parsing dates")            ###Creating address features###
    cleanData["Time"], cleanData["Day"], cleanData["Month"], cleanData["Year"]=zip(*cleanData["Dates"].apply(parse_time))
    #     dummy_ranks_DAY = pd.get_dummies(cleanData['DayOfWeek'], prefix='DAY')
    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    #     cleanData["DayOfWeek"]=cleanData["DayOfWeek"].apply(lambda x: days.index(x)/float(len(days)))
    print("Creating one-hot variables")
    dummy_ranks_PD = pd.get_dummies(cleanData['PdDistrict'], prefix='PD')
    dummy_ranks_DAY = pd.get_dummies(cleanData["DayOfWeek"], prefix='DAY')
    cleanData["IsInterection"]=cleanData["Address"].apply(lambda x: 1 if "/" in x else 0)
    cleanData["logoddsPF_A"]=cleanData["Address"].apply(lambda x: logoddsPF_A[x])
    cleanData["logoddsPF_T"]=cleanData["T_D_M_Y"].apply(lambda x: logoddsPF_T[x])
    print("droping processed columns")
    cleanData=cleanData.drop("PdDistrict",axis=1)
    cleanData=cleanData.drop("DayOfWeek",axis=1)
    cleanData=cleanData.drop("Address",axis=1)
    cleanData=cleanData.drop("T_D_M_Y",axis=1)
    cleanData=cleanData.drop("Dates",axis=1)
    feature_list=cleanData.columns.tolist()
    print("joining one-hot features")
    features = cleanData[feature_list].join(dummy_ranks_PD.ix[:,:]).join(dummy_ranks_DAY.ix[:,:]).join(address_features.ix[:,:]).join(T_D_M_Y_features.ix[:,:])
    print("creating new features")
    features["IsDup"]=pd.Series(features.duplicated()|features.duplicated(keep='last')).apply(int)
    features["Awake"]=features["Time"].apply(lambda x: 1 if (x==0 or (x>=8 and x<=23)) else 0)
    features["Summer"], features["Fall"], features["Winter"], features["Spring"]=zip(*features["Month"].apply(get_season))
    if "Category" in df.columns:
        labels = df["Category"].astype('category')
    else:
        labels=None
    return features,labels
###################################################
def generator(X, Y, lookback, delay, min_index, max_index,
              shuffle=False, batch_size=128, step=6):
    if max_index is None:
        max_index = len(X) - delay - 1
    i = min_index + lookback
    while 1:
        if shuffle:
            rows = np.random.randint(
                min_index + lookback, max_index, size=batch_size)
        else:
            if i + batch_size >= max_index:
                i = min_index + lookback
            rows = np.arange(i, min(i + batch_size, max_index))
            i += len(rows)

        samples = np.zeros((len(rows),
                           lookback // step,
                           X.shape[-1]))
        targets = np.zeros((len(rows),Y.shape[1]))
        for j, row in enumerate(rows):
            indices = range(rows[j] - lookback, rows[j], step)
            samples[j] = X[indices]
            targets[j] = Y[rows[j]+delay]
        return samples, targets
    #Now here is the data generator that we will use. It yields a tuple (samples, targets) where samples is one batch of input data and targets is the corresponding array of target temperatures. It takes the following arguments:
        # •data: The original array of floating point data, which we just normalized in the code snippet above.
        # •lookback: How many timesteps back should our input data go.
        # •delay: How many timesteps in the future should our target be.
        # •min_index and max_index: Indices in the data array that delimit which timesteps to draw from. This is useful for keeping a segment of the data for validation and another one for testing.
        # •shuffle: Whether to shuffle our samples or draw them in chronological order.
        # •batch_size: The number of samples per batch.
        # •step: The period, in timesteps, at which we sample data. We will set it 6 in order to draw one data point every hour.
############################################################################
#Import data
NotConsiderTime=True##trainDF和testDF分割时是否考虑时间问题，即是非需要随机打乱
allDF=pd.read_csv("./train.csv")
print(allDF.shape)
print('Address_counts_allDF: ')
print(len(allDF["Address"].unique()))
trainDF,testDF=train_test_split_DataFrame(allDF, test_size=0.1, shuffle=NotConsiderTime, random_state=120)
print(trainDF.shape)
print('Address_counts_trainDF: ')
print(len(trainDF["Address"].unique()))
print(testDF.shape)
print('Address_counts_testDF: ' )
print(len(testDF["Address"].unique()))

trainDF_addrs = trainDF['Address'].tolist()
testDF_addrs = testDF['Address'].tolist()
differentKey = [x for x in testDF_addrs if x not in trainDF_addrs]

#################Now proceed as before#################
xy_scaler=preprocessing.StandardScaler()
xy_scaler.fit(trainDF[["X","Y"]])
trainDF[["X","Y"]]=xy_scaler.transform(trainDF[["X","Y"]])

trainDF["T_D_M_Y"]=trainDF["Dates"].apply(Dates2TDMY)
trainDF["T_D_M_Y"]=trainDF["T_D_M_Y"]+trainDF["DayOfWeek"]
print('-----------LOGOODS: T_D_M_Y-------------')
logodds_T,logoddsPF_T=field2Vec(trainDF,"T_D_M_Y")
print('-----------LOGOODS: Address-------------')
logodds_A,logoddsPF_A=field2Vec(trainDF,"Address")
print('-----------LOGOODS: parse_data-------------')
features, labels=parse_data(trainDF,logodds_A,logoddsPF_A,logodds_T,logoddsPF_T)    
print(features.columns.tolist())
print(len(features.columns))

collist=features.columns.tolist()
scaler = preprocessing.StandardScaler()
scaler.fit(features)
features[collist]=scaler.transform(features)
######################################################
N_EPOCHS=21
N_HN=256
N_HN_1=512
N_LAYERS=2
N_BATCH=64
DP=0.5
N_CLASS=len(labels.unique())
#############################先进行过采样，然后再根据时间来排序##################################
print('------------RandomOverSampler--------------')
ros = RandomOverSampler(random_state=0)
featuresArray, labelsArray = ros.fit_resample(features.values,labels.values)#####过采样#####
    #####按照年（第6列）月（第5列）日（第4列）时（第3列）排序
print('------------Sort--------------')
time_temp=featuresArray[:,2]+np.dot(featuresArray[:,3],100)+np.dot(featuresArray[:,4],10000)+np.dot(featuresArray[:,5],1000000)
features_label_time=np.column_stack((featuresArray,labelsArray))
features_label_time=np.column_stack((features_label_time,time_temp))
features_label_time =features_label_time[np.argsort(features_label_time[:,-1])]
labelsArray=features_label_time[:,-2]
featuresArray=features_label_time[:,0:featuresArray.shape[1]]
del features_label_time
#############################先进行过采样，然后再根据时间来排序----结束############################
print('------------Preparing test datas--------------')
###########和训练集使用同样的时间和地点Logoodds值#####
# logodds_A,logoddsPF_A=field2Vec(testDF,"Address")
# trainDF["T_D_M_Y"]=trainDF["Dates"].apply(Dates2TDMY)
# trainDF["T_D_M_Y"]=trainDF["T_D_M_Y"]+trainDF["DayOfWeek"]
# logodds_T,logoddsPF_T=field2Vec(testDF,"T_D_M_Y")
testDF[["X","Y"]]=xy_scaler.transform(testDF[["X","Y"]])
testDF["T_D_M_Y"]=testDF["Dates"].apply(Dates2TDMY)
testDF["T_D_M_Y"]=testDF["T_D_M_Y"]+testDF["DayOfWeek"]
features_test, labels_test=parse_data(testDF,logodds_A,logoddsPF_A,logodds_T,logoddsPF_T)
# collist=features.columns.tolist()
# scaler = preprocessing.StandardScaler()
# scaler.fit(features_test)
features_test[collist]=scaler.transform(features_test)
x_test=features_test.values
y_test=labels_test.values
y_test = keras.utils.to_categorical(LabelEncoder().fit_transform(np.array(y_test)), num_classes=N_CLASS)
    
OnlyLSTM=True#False
if OnlyLSTM:
    SORTbyTime=False #是否需要根据时间顺序，留出后Test_size个样本用于测试
    TestRate=0.2 #当SORTbyTime=False 时，该值才起作用
    Test_size=200000#当SORTbyTime=True 时，该值才起作用
    split_count=1#当SORTbyTime=True 时，该值才起作用
    split_size=int(Test_size/split_count)#当SORTbyTime=True 时，该值才起作用
    N_hight=featuresArray.shape[0]
    print('------------train_val_split--------------')
    for t_i in range(split_count):
        if SORTbyTime:
            # t_i=t0_i+1
            print('--------NNN_spllit_NNN_spllit_NNN_spllit_NNN_spllit_---------')
            print(t_i)
            x_train=featuresArray[0:N_hight-Test_size+t_i*split_size,:]
            y_train=labelsArray[0:N_hight-Test_size+t_i*split_size]

            x_val=featuresArray[N_hight-Test_size+t_i*split_size:N_hight-Test_size+(t_i+1)*split_size,:]
            y_val=labelsArray[N_hight-Test_size+t_i*split_size:N_hight-Test_size+(t_i+1)*split_size]
        else:
            print('------------train_val_split_Shuffle--------------')
            x_train,x_val,y_train,y_val = train_test_split(featuresArray,labelsArray,test_size=TestRate,shuffle=True)
        print('------------to_categorical--------------')
        y_train = keras.utils.to_categorical(LabelEncoder().fit_transform(np.array(y_train)), num_classes=N_CLASS)
        y_val = keras.utils.to_categorical(LabelEncoder().fit_transform(np.array(y_val)), num_classes=N_CLASS)


        ##########################################################################
        print('------------Building model--------------')
        input_dim=x_train.shape[1]
        output_dim=N_CLASS
        model = Sequential()
        model.add(Dense(N_HN_1,input_dim=input_dim,init='glorot_uniform'))
        model.add(BatchNormalization())
        model.add(PReLU())
        # model.add(Dropout(dp))
        for i in range(N_LAYERS):
            model.add(Dense(N_HN, init='glorot_uniform'))
            model.add(BatchNormalization())    
            model.add(PReLU())    
        #   model.add(Dropout(dp))
        model.add(BatchNormalization())
        model.add(Dense(output_dim, init='glorot_uniform'))
        model.add(Activation('softmax'))
        # model = multi_gpu_model(model, 2)
        model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy', metrics.top_k_categorical_accuracy])
        if NotOnlyLSTM:
            print('------------Go! Go! Go!!!!-----------')
            fitting=model.fit(x_train, y_train, epochs=N_EPOCHS, batch_size=N_BATCH,verbose=1,validation_data=(x_val,y_val))
            # acc_test, test_score,fitting, model = build_and_fit_model(features_train.values,labels_train,x_val=features_test.values,y_test=labels_test,hn=N_HN,layers=N_LAYERS,epochs=N_EPOCHS,verbose=2,dp=DP)
            # model.save('jjs_model_0112.h5')
            print('-----------Evaluate-------------------')
            acc_test = model.evaluate(x_test,y_test, batch_size=N_BATCH)
            print(acc_test)
            if SORTbyTime:
                del model
else:
    print('------------Begin LSTM--------------')
    featuresLSTM=features.values
    labelsLSTM = labels.values
        #####按照年（第6列）月（第5列）日（第4列）时（第3列）排序
    time_temp=featuresLSTM[:,2]+np.dot(featuresLSTM[:,3],100)+np.dot(featuresLSTM[:,4],10000)+np.dot(featuresLSTM[:,5],1000000)
    features_label_time=np.column_stack((featuresLSTM,labelsLSTM))
    features_label_time=np.column_stack((features_label_time,time_temp))
    features_label_time =features_label_time[np.argsort(features_label_time[:,-1])]
    featuresLSTM=features_label_time[:,0:featuresLSTM.shape[1]]
    labelsLSTM=features_label_time[:,-2]
    labelsLSTM = keras.utils.to_categorical(LabelEncoder().fit_transform(np.array(labelsLSTM)), num_classes=N_CLASS)
    del features_label_time

    N_EPOCHS=40
    N_HN=256
    N_LAYERS=1
    N_BATCH=64
    lookback=10240
    size_Train=x_train.shape[0]
    input_dim=x_train.shape[1]
    output_dim=N_CLASS
    print('--------------------------generator Train_set and Val_set for LSTM---------------------------------')
    train_X, train_Y=generator(featuresLSTM, labelsLSTM, lookback=lookback, delay=1, min_index=0, max_index=size_Train, shuffle=True, batch_size=N_BATCH, step=1)
    val_X, val_Y=generator(featuresLSTM, labelsLSTM, lookback=lookback, delay=1, min_index=size_Train+1, max_index=None, shuffle=True, batch_size=N_BATCH, step=1)

    LSTMmodel = Sequential()
    LSTMmodel.add(layers.Flatten(input_shape=(lookback, input_dim)))
    LSTMmodel.add(layers.Dense(N_HN, activation='relu'))
    LSTMmodel.add(layers.Dense(output_dim))
    LSTMmodel.compile(optimizer=RMSprop(), loss='categorical_crossentropy',metrics=['accuracy', metrics.top_k_categorical_accuracy])
    # LSTMmodel.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy', metrics.top_k_categorical_accuracy])
    print('---------------------------------------LSTM GO GO GO!!!!---------------------------------------------')
    history = LSTMmodel.fit(train_X,train_Y, epochs=N_EPOCHS,verbose=2,validation_data=(val_X, val_Y))

    print('--------------------------LSTM ReTraining On trainSet and valSet!!!!---------------------------------')
    train_X_ALL, train_Y_ALL=generator(featuresLSTM, labelsLSTM, lookback=lookback, delay=1, min_index=0, max_index=None, shuffle=True, batch_size=N_BATCH, step=1)
    historyAll = LSTMmodel.fit(train_X_ALL, train_Y_ALL, epochs=N_EPOCHS,verbose=2,validation_data=(val_X, val_Y))

    print('-----------LSTM Evaluate ALL-------------------')
    ####

    #需要将train_X_ALL和test_X拼接在一起，先计算一下train_X_ALL的行数，然后，根据lookback来确定test_X从哪一行开始

    ####
    test_X, test_Y=generator(x_test, y_test, lookback=lookback, delay=1, min_index=0, max_index=None, shuffle=True, batch_size=N_BATCH, step=1)
    acc_test = LSTMmodel.evaluate(test_X,test_Y, batch_size=N_BATCH)
    print(acc_test)

(878049, 9)
Address_counts_allDF: 
23228
(790245, 9)
Address_counts_trainDF: 
23003
(87804, 9)
Address_counts_testDF: 
14995
-----------LOGOODS: T_D_M_Y-------------
-----------LOGOODS: Address-------------
-----------LOGOODS: parse_data-------------
Creating address features
Creating time T_D_M_Y features
Parsing dates


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Creating one-hot variables
droping processed columns
joining one-hot features


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated


creating new features
['X', 'Y', 'Time', 'Day', 'Month', 'Year', 'IsInterection', 'logoddsPF_A', 'logoddsPF_T', 'PD_BAYVIEW', 'PD_CENTRAL', 'PD_INGLESIDE', 'PD_MISSION', 'PD_NORTHERN', 'PD_PARK', 'PD_RICHMOND', 'PD_SOUTHERN', 'PD_TARAVAL', 'PD_TENDERLOIN', 'DAY_Friday', 'DAY_Monday', 'DAY_Saturday', 'DAY_Sunday', 'DAY_Thursday', 'DAY_Tuesday', 'DAY_Wednesday', 'logodds_A0', 'logodds_A1', 'logodds_A2', 'logodds_A3', 'logodds_A4', 'logodds_A5', 'logodds_A6', 'logodds_A7', 'logodds_A8', 'logodds_A9', 'logodds_A10', 'logodds_A11', 'logodds_A12', 'logodds_A13', 'logodds_A14', 'logodds_A15', 'logodds_A16', 'logodds_A17', 'logodds_A18', 'logodds_A19', 'logodds_A20', 'logodds_A21', 'logodds_A22', 'logodds_A23', 'logodds_A24', 'logodds_A25', 'logodds_A26', 'logodds_A27', 'logodds_A28', 'logodds_A29', 'logodds_A30', 'logodds_A31', 'logodds_A32', 'logodds_A33', 'logodds_A34', 'logodds_A35', 'logodds_A36', 'logodds_A37', 'logodds_A38', 'logodds_T0', 'logodds_T1', 'logodds_T2', 'logodds_T3', 'logod

KeyError: 'HELEN ST / CALIFORNIA ST'