In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import KFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import  OneHotEncoder
from sklearn.preprocessing import StandardScaler
from scipy import stats
from keras.models import Sequential
from keras.layers import Dense
seed=7
np.random.seed(seed)

Using Theano backend.


In [2]:
def imputeDFColsUsingMedian(dataFrame,cols):
    for col in cols:
        medianOfCol=np.nanmedian(dataFrame[col])
        dataFrame[col].fillna(medianOfCol,inplace=True)
def imputeDFColsUsingMean(dataFrame,cols):
    for col in cols:
        meanOfCol=np.nanmean(dataFrame[col])
        dataFrame[col].fillna(meanOfCol,inplace=True)
def scaleFeature(dataFrame,col):
    maxVal=np.max(dataFrame[col])
    minVal=np.min(dataFrame[col])
    scaledDenom=maxVal-minVal
    dataFrame[col]=(dataFrame[col]-minVal)/scaledDenom
def labelEncodeFeats(dataFrame,listOfFeats):
    for feat in listOfFeats:
        labelEncoder=LabelEncoder()
        encodedFeatValues=labelEncoder.fit_transform(dataFrame[feat])
        dataFrame[feat]=encodedFeatValues
def OneHotEncodeFeats(dataFrame,listOfFeats,ctgrcl_ftrs_msk):
    labelEncodeFeats(dataFrame,listOfFeats)
    oneHotEncoder=OneHotEncoder(categorical_features=ctgrcl_ftrs_msk,sparse=False)
    oneHotEncodedFeats=oneHotEncoder.fit_transform(dataFrame)
    return oneHotEncodedFeats

In [3]:
app_events = pd.read_csv('../Data/app_events.csv')
app_labels = pd.read_csv('../Data/app_labels.csv')
events = pd.read_csv('../Data/events.csv')
events.timestamp=events.timestamp.map(lambda x:pd.Timestamp(x).value)
eventsGrpdByDeviceId=events.groupby('device_id')
gender_age_train = pd.read_csv('../Data/gender_age_train.csv')
gender_age_test = pd.read_csv('../Data/gender_age_test.csv')
label_categories = pd.read_csv('../Data/label_categories.csv')
phone_brand_device_model = pd.read_csv('../Data/phone_brand_device_model.csv',encoding='utf-8')
phone_brand_device_model = phone_brand_device_model.drop_duplicates('device_id',keep='first')

In [4]:
def joinBrandDeviceModel(deviceIDFrame,brandDeviceModelFrame):
    mergedDF=deviceIDFrame.merge(brandDeviceModelFrame[['device_id','phone_brand','device_model']], 
                                 how='left',on='device_id')
    mergedDF['phone_brand'].fillna('',inplace=True)
    mergedDF['device_model'].fillna('',inplace=True)
    return mergedDF
device_brand_model_trainDF=joinBrandDeviceModel(gender_age_train,phone_brand_device_model)
device_brand_model_testDF=joinBrandDeviceModel(gender_age_test,phone_brand_device_model)

In [5]:
def avgEventDuration(listOfTimeStamps):
    if(len(listOfTimeStamps)<1):
        return 0
    return (np.max(listOfTimeStamps)-np.min(listOfTimeStamps))/len(listOfTimeStamps)
def stdEventDuration(listOfTimeStamps):
    if(len(set(listOfTimeStamps))<=1):
        return 0
    return np.std(listOfTimeStamps)
def avglongChangeFreq(listOfLongs):
    if(len(listOfLongs)<1):
        return 0
    return len(set(listOfLongs))/float(len(listOfLongs))
def avgSqrdlongChangeAmt(listOfLongs):
    if(len(listOfLongs)<1):
        return 0
    return np.sum(np.diff(listOfLongs)**2)/float(len(listOfLongs))
def computeEventBasedFeatures(eventsGrpdByDeviceId):
    eventBasedAggregates=eventsGrpdByDeviceId.aggregate({'timestamp':[np.count_nonzero,avgEventDuration,
                            stdEventDuration],'longitude':[avglongChangeFreq,avgSqrdlongChangeAmt]})
    eventBasedAggregatesFeats=['num_of_evnts','avg_evnt_drtn','std_evnt_drtn','avgLongtdChgFrq','avgSqrdLongtdChgAmt']
    eventBasedAggregates.columns=eventBasedAggregatesFeats
    eventBasedAggregates['device_id']=eventBasedAggregates.index
    return (eventBasedAggregates,eventBasedAggregatesFeats) 
def segregateFeaturesDataFrame(dataFrameToSegregate,colToSegOn,valueToSegOn,segOnNan=False):
    if segOnNan:
        exclFlag=np.isnan(dataFrameToSegregate[colToSegOn])
    else:
        exclFlag=dataFrameToSegregate[colToSegOn]==valueToSegOn
    inclFlag=exclFlag==False
    exclDF=dataFrameToSegregate[exclFlag]
    incDF=dataFrameToSegregate[inclFlag]
    return (incDF,exclDF,inclFlag,exclFlag)

In [6]:
(eventBasedFeaturesDF,eventBasedFeats)=computeEventBasedFeatures(eventsGrpdByDeviceId)

In [7]:
def combineDeviceEventsBrandsFeatures(deviceDF,deviceEventsDF,deviceEventsFeats,deviceBrandsDF,shouldImpute=True):
    device_events_brands =deviceDF.merge(deviceEventsDF, how='left',on='device_id')
    if shouldImpute==True:
        imputeDFColsUsingMean(device_events_brands,deviceEventsFeats)
    device_events_brands=device_events_brands.merge(deviceBrandsDF[['phone_brand','device_model','device_id']], 
                                                  how='left',on='device_id')
    return device_events_brands
deviceEvntsBrnds_trainDF=combineDeviceEventsBrandsFeatures(gender_age_train,eventBasedFeaturesDF,
                                                   eventBasedFeats,device_brand_model_trainDF,False)
deviceEvntsBrnds_testDF=combineDeviceEventsBrandsFeatures(gender_age_test,eventBasedFeaturesDF,
                                                   eventBasedFeats,device_brand_model_testDF,False)
def getZeroIndexedTargetSeries(dataFrame,indexesToIncl,colName):
    dataFrame=dataFrame[indexesToIncl]
    targetSeries=dataFrame[colName].copy(deep=True)
    n=targetSeries.shape[0]
    targetSeries.index=np.arange(n)
    return targetSeries

In [11]:
(deviceEvntsBrndsIncl_trainDF,deviceEvntsBrndsExcl_trainDF,inclTrainFlag,exclTrainFlag)=segregateFeaturesDataFrame(
    deviceEvntsBrnds_trainDF,'num_of_evnts',0,True)
(deviceEvntsBrndsIncl_testDF,deviceEvntsBrndsExcl_testDF,inclTestFlag,exclTestFlag)=segregateFeaturesDataFrame(
    deviceEvntsBrnds_testDF,'num_of_evnts',0,True)
device_brand_model_incl_trainTarget=getZeroIndexedTargetSeries(device_brand_model_trainDF,inclTrainFlag,'group')
device_brand_model_excl_trainTarget=getZeroIndexedTargetSeries(device_brand_model_trainDF,exclTrainFlag,'group')

In [12]:
len(exclTrainFlag[exclTrainFlag])

51336

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
def combineTextFeatures(dataFrame,textFeatures,newFeatureName):
    dataFrame[newFeatureName]=dataFrame[textFeatures[0]]
    n=len(textFeatures)
    for i in np.arange(1,n):
        dataFrame[newFeatureName]=dataFrame[newFeatureName]+' '+dataFrame[textFeatures[i]]
def prepareTextFeature(trainDF,testDF,textFeatures,newFeatureName):
    combineTextFeatures(trainDF,textFeatures,newFeatureName)
    combineTextFeatures(testDF,textFeatures,newFeatureName)
    vectorizer = CountVectorizer(ngram_range=(1,2),min_df=0.0)
    vect_matrix = vectorizer.fit_transform(trainDF[newFeatureName])
    test_vect_matrix = vectorizer.transform(testDF[newFeatureName])
    return (vect_matrix,test_vect_matrix)
(textData_model_train,textData_model_test)=prepareTextFeature(device_brand_model_trainDF[exclTrainFlag],
                   device_brand_model_testDF[exclTestFlag],['phone_brand','device_model'],'brand_model')
#device_brand_model_trainDF['brand_model'] = device_brand_model_trainDF['phone_brand'] + ' ' +
#device_brand_model_trainDF['device_model']
#device_brand_model_testDF['brand_model'] = device_brand_model_testDF['phone_brand'] + ' ' + 
#device_brand_model_testDF['device_model']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
def processModelFeatures(contFeats,catFeats,modelDF,categorical_features_mask):
    contFeats=list(set(contFeats))
    catFeats=list(set(catFeats))
    modelFeatures=contFeats.copy()
    modelFeatures.extend(catFeats)
    model_subsetDF=modelDF[modelFeatures]
    for contFeat in contFeats:
        scaleFeature(model_subsetDF,contFeat)
    processedModelFeatures=OneHotEncodeFeats(model_subsetDF,catFeats,categorical_features_mask)
    return processedModelFeatures
catFeats=['phone_brand','device_model']
categorical_features_mask=[False,False,False,False,False,True,True]
modelFeatsIncl_trainDF=processModelFeatures(eventBasedFeats,catFeats,deviceEvntsBrndsIncl_trainDF,categorical_features_mask)
modelFeatsIncl_testDF=processModelFeatures(eventBasedFeats,catFeats,deviceEvntsBrndsIncl_testDF,categorical_features_mask)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [15]:
print(modelFeatsIncl_trainDF.shape,modelFeatsIncl_testDF.shape)#See, Here Not All Models Captured During Training Phase
#So Probably I can capture All Models Captured During Training-- 
#         If the new model not inside=> only use Model based out of Phone Model

(23309, 1020) (35194, 1157)


In [16]:
print(textData_model_train.shape)
print(device_brand_model_trainDF.shape,len(inclTrainFlag[inclTrainFlag]),len(exclTrainFlag[exclTrainFlag]))
print(device_brand_model_testDF.shape,len(inclTestFlag[inclTestFlag]),len(exclTestFlag[exclTestFlag]))

(51336, 2799)
(74645, 6) 23309 51336
(112071, 3) 35194 76877


In [17]:
from sklearn.cross_validation import train_test_split
def obtainProbs(X,y,model):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)
    clf=model.fit(X_train,y_train)
    y_pred=clf.predict_proba(X_test)
    return (y_pred,y_test)
def validateModel(X,y, model):
    kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=0)
    for itrain, itest in kf:
        if type(X)==type(pd.DataFrame()):
            train=X.ix[itrain]
            test=X.ix[itest]
        else:
            train = X[itrain,:]
            test = X[itest,:]
        ytrain, ytest = y[itrain], y[itest]
        clf = model.fit(train,ytrain)
        ypred = clf.predict_proba(test)
        print(ypred.shape)
        print(log_loss(ytest, ypred))
        
def getModelOutput(X,y,X2, model):
    kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=0)
    for itrain, itest in kf:
        if type(X)==type(pd.DataFrame()):
            train=X.ix[itrain]
            test=X.ix[itest]
        else:
            train = X[itrain,:]
            test = X[itest,:]
        ytrain, ytest = y[itrain], y[itest]
        clf = model.fit(train,ytrain)
        ypred = clf.predict_proba(X2)
        return ypred

In [18]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm  import LinearSVC
#class CalibModel(object):
class CalibModel(object):
    def __init__(self,clf):
        #clf = MultinomialNB()
        print("Obtained Classifier Instance ",clf)
        self.clf = CalibratedClassifierCV(clf, cv=2, method='sigmoid')
    
    def fit(self, X, y):
        self.clf.fit(X,y)
        return self
    
    def predict(self, X):
        return self.clf.predict(X)
    
    def predict_proba(self, X):
        return self.clf.predict_proba(X)

In [19]:
#(y_pred,y_test)=obtainProbs(vect_matrix, device_brand_model_trainDF['group'], CalibModel(MultinomialNB()))
#print(y_pred.shape,y_test.shape
#validateModel(vect_matrix, phone_brand_master['group'], CalibModel(LinearSVC()))
#validateModel(modelFeats_trainDF, device_brand_model_trainDF['group'], CalibModel(GaussianNB()))
from sklearn.ensemble import RandomForestClassifier
(y_pred,y_test)=obtainProbs(modelFeatsIncl_trainDF, device_brand_model_incl_trainTarget, 
              CalibModel(RandomForestClassifier(min_samples_split=1600,n_jobs=6,criterion='entropy')))
#y_pred.shape,y_test.shape
print(log_loss(y_test,y_pred))#2.3725889
type(y_test),type(y_pred)

Obtained Classifier Instance  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=1600,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=6,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
2.3750257931


(pandas.core.series.Series, numpy.ndarray)

In [20]:
(y_pred_text,y_test_text)=obtainProbs(textData_model_train, device_brand_model_excl_trainTarget, 
                                      CalibModel(MultinomialNB()))
print(log_loss(y_test_text,y_pred_text))

Obtained Classifier Instance  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
2.40455445142


In [21]:
y_test_ensembled=y_test.append(y_test_text)
y_pred_ensembled=np.concatenate((y_pred,y_pred_text))
log_loss(y_test_ensembled,y_pred_ensembled)

2.3953335035056385

In [29]:
#Deep Learning : Keras Modelling
# create model
model = Sequential()
model.add(Dense(1600, input_dim=1020, init='uniform', activation='relu'))
model.add(Dense(1563, init='uniform', activation='relu'))
model.add(Dense(1, init='uniform', activation='sigmoid'))

In [30]:
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [31]:
targetLabelEncoder=LabelEncoder()
encodedTarget_train=targetLabelEncoder.fit_transform(device_brand_model_incl_trainTarget)
encodedTarget_train

array([10,  4, 10, ...,  5, 10,  7])

In [38]:
X_train_dl,X_test_dl,y_train_dl,y_test_dl=train_test_split(modelFeatsIncl_trainDF,encodedTarget_train,test_size=0.3)
#clf_dl=model.fit(X_train_dl,y_train_dl)
clf_dl=model.fit(X_train_dl,y_train_dl,nb_epoch=2,batch_size=100)
y_pred_dl=clf_dl.model.predict_proba(X_test_dl)
log_loss(y_pred_dl,y_test_dl)

Epoch 1/2
Epoch 2/2

32.568849434219324

In [39]:
X_train_dl,X_test_dl,y_train_dl,y_test_dl=train_test_split(modelFeatsIncl_trainDF,encodedTarget_train,test_size=0.3)
#clf_dl=model.fit(X_train_dl,y_train_dl)
clf_dl=model.fit(X_train_dl,y_train_dl,nb_epoch=2,batch_size=1000)
y_pred_dl=clf_dl.model.predict_proba(X_test_dl)
log_loss(y_pred_dl,y_test_dl)

Epoch 1/2
Epoch 2/2


32.445370326567598

In [40]:
X_train_dl,X_test_dl,y_train_dl,y_test_dl=train_test_split(modelFeatsIncl_trainDF,encodedTarget_train,test_size=0.3)
#clf_dl=model.fit(X_train_dl,y_train_dl)
clf_dl=model.fit(X_train_dl,y_train_dl,nb_epoch=2,batch_size=10000)
y_pred_dl=clf_dl.model.predict_proba(X_test_dl)
log_loss(y_pred_dl,y_test_dl)

Epoch 1/2
Epoch 2/2


32.514518626852563

In [41]:
X_train_dl,X_test_dl,y_train_dl,y_test_dl=train_test_split(modelFeatsIncl_trainDF,encodedTarget_train,test_size=0.3)
#clf_dl=model.fit(X_train_dl,y_train_dl)
clf_dl=model.fit(X_train_dl,y_train_dl,nb_epoch=20,batch_size=10000)
y_pred_dl=clf_dl.model.predict_proba(X_test_dl)
log_loss(y_pred_dl,y_test_dl)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


32.475005312404008

In [42]:
X_train_dl,X_test_dl,y_train_dl,y_test_dl=train_test_split(modelFeatsIncl_trainDF,encodedTarget_train,test_size=0.3)
#clf_dl=model.fit(X_train_dl,y_train_dl)
clf_dl=model.fit(X_train_dl,y_train_dl,nb_epoch=20,batch_size=5000)
y_pred_dl=clf_dl.model.predict_proba(X_test_dl)
log_loss(y_pred_dl,y_test_dl)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


32.583666927137536