In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import KFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import  OneHotEncoder
from sklearn.preprocessing import StandardScaler
from scipy import stats
from keras.models import Sequential
from keras.layers import Dense
seed=7
np.random.seed(seed)

Using Theano backend.


In [2]:
def imputeDFColsUsingMedian(dataFrame,cols):
    """Replace missing values in the given columns with each column's median.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    dataFrame : pandas.DataFrame whose columns are imputed.
    cols : iterable of column labels to impute.
    """
    for col in cols:
        # nanmedian skips the NaN entries that are about to be filled.
        medianOfCol=np.nanmedian(dataFrame[col])
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on dataFrame[col]: that chained form may act on a temporary object
        # and leave the frame unchanged (always so under copy-on-write).
        dataFrame[col]=dataFrame[col].fillna(medianOfCol)
def imputeDFColsUsingMean(dataFrame,cols):
    """Replace missing values in the given columns with each column's mean.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    dataFrame : pandas.DataFrame whose columns are imputed.
    cols : iterable of column labels to impute.
    """
    for col in cols:
        # nanmean skips the NaN entries that are about to be filled.
        meanOfCol=np.nanmean(dataFrame[col])
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on dataFrame[col]: that chained form may act on a temporary object
        # and leave the frame unchanged (always so under copy-on-write).
        dataFrame[col]=dataFrame[col].fillna(meanOfCol)
def scaleFeature(dataFrame,col):
    """Min-max scale one column of ``dataFrame`` in place.

    Values are mapped to ``(x - min) / (max - min)``. A column whose max
    equals its min has no spread to scale; it is mapped to 0.0 instead of
    being turned into NaN by a 0/0 division.

    Parameters
    ----------
    dataFrame : pandas.DataFrame holding the column.
    col : label of the numeric column to rescale.
    """
    maxVal=np.max(dataFrame[col])
    minVal=np.min(dataFrame[col])
    scaledDenom=maxVal-minVal
    if scaledDenom==0:
        # Constant column: subtracting the minimum yields zeros (existing NaN
        # entries stay NaN); cast to float to match the normal path's dtype.
        dataFrame[col]=(dataFrame[col]-minVal).astype(float)
        return
    dataFrame[col]=(dataFrame[col]-minVal)/scaledDenom
def labelEncodeFeats(dataFrame,listOfFeats):
    """Overwrite each listed column with its LabelEncoder integer codes.

    A fresh encoder is fitted per column, so codes are independent between
    columns. The frame is modified in place; nothing is returned.
    """
    for featName in listOfFeats:
        dataFrame[featName]=LabelEncoder().fit_transform(dataFrame[featName])
def OneHotEncodeFeats(dataFrame,listOfFeats,ctgrcl_ftrs_msk):
    """Label-encode ``listOfFeats`` in place, then one-hot encode the frame.

    ``ctgrcl_ftrs_msk`` is handed to OneHotEncoder as ``categorical_features``
    to mark which columns of ``dataFrame`` are expanded. Returns the dense
    (``sparse=False``) result of ``fit_transform`` on the whole frame.
    """
    # OneHotEncoder is fed the integer codes produced by the label encoding.
    labelEncodeFeats(dataFrame,listOfFeats)
    encoder=OneHotEncoder(categorical_features=ctgrcl_ftrs_msk,sparse=False)
    return encoder.fit_transform(dataFrame)

In [3]:
# Load the raw CSV tables from ../Data (paths are relative to the notebook).
app_events = pd.read_csv('../Data/app_events.csv')
app_labels = pd.read_csv('../Data/app_labels.csv')
events = pd.read_csv('../Data/events.csv')
# Replace each timestamp with the integer `.value` of its pandas Timestamp
# (epoch nanoseconds in pandas) so it can be aggregated numerically.
events.timestamp=events.timestamp.map(lambda x:pd.Timestamp(x).value)
# Group once by device; the per-device event features are computed from this.
eventsGrpdByDeviceId=events.groupby('device_id')
gender_age_train = pd.read_csv('../Data/gender_age_train.csv')
gender_age_test = pd.read_csv('../Data/gender_age_test.csv')
label_categories = pd.read_csv('../Data/label_categories.csv')
phone_brand_device_model = pd.read_csv('../Data/phone_brand_device_model.csv',encoding='utf-8')
# Keep only the first row per device_id so the later left joins cannot
# multiply rows.
phone_brand_device_model = phone_brand_device_model.drop_duplicates('device_id',keep='first')

In [4]:
def joinBrandDeviceModel(deviceIDFrame,brandDeviceModelFrame):
    """Left-join phone brand and device model onto a device-id frame.

    Parameters
    ----------
    deviceIDFrame : DataFrame with a ``device_id`` column; every row is kept.
    brandDeviceModelFrame : DataFrame with ``device_id``, ``phone_brand`` and
        ``device_model`` columns.

    Returns
    -------
    A new DataFrame with ``phone_brand`` and ``device_model`` appended; devices
    without a match get empty strings in both columns. Neither input is
    modified.
    """
    mergedDF=deviceIDFrame.merge(brandDeviceModelFrame[['device_id','phone_brand','device_model']],
                                 how='left',on='device_id')
    # Assign the filled columns back: a chained fillna(inplace=True) on
    # mergedDF[col] may update a temporary object and leave NaN in the result.
    for brandCol in ('phone_brand','device_model'):
        mergedDF[brandCol]=mergedDF[brandCol].fillna('')
    return mergedDF
# Attach brand/model to the train and test device lists (one row per input row,
# since the brand table was de-duplicated on device_id when it was loaded).
device_brand_model_trainDF=joinBrandDeviceModel(gender_age_train,phone_brand_device_model)
device_brand_model_testDF=joinBrandDeviceModel(gender_age_test,phone_brand_device_model)

In [6]:
def avgEventDuration(listOfTimeStamps):
    """Return the timestamp span (max - min) divided by the number of timestamps.

    An empty input yields 0.
    """
    numStamps=len(listOfTimeStamps)
    if numStamps<1:
        return 0
    timeSpan=np.max(listOfTimeStamps)-np.min(listOfTimeStamps)
    return timeSpan/numStamps
def stdEventDuration(listOfTimeStamps):
    """Return the population standard deviation (``np.std``) of the timestamps.

    Inputs with at most one distinct value (including the empty input)
    yield 0 without calling ``np.std``.
    """
    distinctStamps=set(listOfTimeStamps)
    hasSpread=len(distinctStamps)>1
    return np.std(listOfTimeStamps) if hasSpread else 0
def avglongChangeFreq(listOfLongs):
    """Return the ratio of distinct values to total values in ``listOfLongs``.

    An empty input yields 0.
    """
    totalCount=len(listOfLongs)
    if totalCount<1:
        return 0
    distinctCount=len(set(listOfLongs))
    return distinctCount/float(totalCount)
def avgSqrdlongChangeAmt(listOfLongs):
    """Return the sum of squared consecutive differences divided by the length.

    The divisor is the number of values, not the number of differences.
    An empty input yields 0.
    """
    totalCount=len(listOfLongs)
    if totalCount<1:
        return 0
    steps=np.diff(listOfLongs)
    return np.sum(np.square(steps))/float(totalCount)
def computeEventBasedFeatures(eventsGrpdByDeviceId):
    """Aggregate per-device event statistics from a groupby on ``device_id``.

    Parameters
    ----------
    eventsGrpdByDeviceId : pandas groupby object with ``timestamp`` and
        ``longitude`` columns.

    Returns
    -------
    (features, featureNames) : a DataFrame with one row per group, the five
    feature columns plus a ``device_id`` column, and the list of the five
    feature column names.
    """
    # The flat names assigned below rely on the aggregate output following the
    # order of this mapping: timestamp aggregates first, then longitude.
    eventBasedAggregates=eventsGrpdByDeviceId.aggregate({'timestamp':[np.count_nonzero,avgEventDuration,
                            stdEventDuration],'longitude':[avglongChangeFreq,avgSqrdlongChangeAmt]})
    eventBasedAggregatesFeats=['num_of_evnts','avg_evnt_drtn','std_evnt_drtn','avgLongtdChgFrq','avgSqrdLongtdChgAmt']
    eventBasedAggregates.columns=eventBasedAggregatesFeats
    eventBasedAggregates['device_id']=eventBasedAggregates.index
    # The group key is now an ordinary column. Clear the index name so that
    # 'device_id' is not both an index level and a column label, which makes
    # a later merge(on='device_id') ambiguous (ValueError on newer pandas).
    eventBasedAggregates.index.name=None
    return (eventBasedAggregates,eventBasedAggregatesFeats)
def segregateFeaturesDataFrame(dataFrameToSegregate,colToSegOn,valueToSegOn,segOnNan=False):
    """Split a frame into rows to keep and rows to exclude, based on one column.

    Rows are excluded where ``colToSegOn`` is NaN (when ``segOnNan`` is truthy;
    ``valueToSegOn`` is then ignored) or where it equals ``valueToSegOn``.

    Returns
    -------
    (includedRows, excludedRows, inclFlag, exclFlag) where the flags are the
    boolean masks used to select the two frames.
    """
    segColumn=dataFrameToSegregate[colToSegOn]
    exclFlag=np.isnan(segColumn) if segOnNan else segColumn==valueToSegOn
    inclFlag=~exclFlag
    return (dataFrameToSegregate[inclFlag],dataFrameToSegregate[exclFlag],inclFlag,exclFlag)

In [7]:
# Build the per-device event feature table and keep the list of its feature names.
(eventBasedFeaturesDF,eventBasedFeats)=computeEventBasedFeatures(eventsGrpdByDeviceId)

In [8]:
def combineDeviceEventsBrandsFeatures(deviceDF,deviceEventsDF,deviceEventsFeats,deviceBrandsDF,shouldImpute=True):
    """Left-join event features and brand/model columns onto a device frame.

    Parameters
    ----------
    deviceDF : DataFrame with a ``device_id`` column; every row is kept.
    deviceEventsDF : DataFrame of event features keyed by ``device_id``.
    deviceEventsFeats : names of the event feature columns (only used for
        imputation).
    deviceBrandsDF : DataFrame providing ``phone_brand``, ``device_model`` and
        ``device_id``.
    shouldImpute : when equal to True, missing event features are mean-imputed
        before the brand join.

    Returns
    -------
    The merged DataFrame.
    """
    withEvents=deviceDF.merge(deviceEventsDF,on='device_id',how='left')
    if shouldImpute==True:
        imputeDFColsUsingMean(withEvents,deviceEventsFeats)
    brandColumns=deviceBrandsDF[['phone_brand','device_model','device_id']]
    return withEvents.merge(brandColumns,on='device_id',how='left')
# Combine event features and brand/model for train and test. Imputation is
# switched off (last argument False), so devices without events keep NaN in
# the event feature columns.
deviceEvntsBrnds_trainDF=combineDeviceEventsBrandsFeatures(gender_age_train,eventBasedFeaturesDF,
                                                   eventBasedFeats,device_brand_model_trainDF,False)
deviceEvntsBrnds_testDF=combineDeviceEventsBrandsFeatures(gender_age_test,eventBasedFeaturesDF,
                                                   eventBasedFeats,device_brand_model_testDF,False)
def getZeroIndexedTargetSeries(dataFrame,indexesToIncl,colName):
    """Return a deep copy of ``colName`` for the selected rows, re-indexed 0..n-1.

    ``indexesToIncl`` is applied as ``dataFrame[indexesToIncl]`` (e.g. a
    boolean row mask). The input frame is left untouched.
    """
    selectedRows=dataFrame[indexesToIncl]
    targetSeries=selectedRows[colName].copy(deep=True)
    # Positional 0-based labels, independent of the original row labels.
    targetSeries.index=np.arange(len(targetSeries))
    return targetSeries

In [9]:
# Split train and test into devices with event data ("Incl") and devices whose
# 'num_of_evnts' is NaN ("Excl"); segOnNan=True, so the value argument 0 is ignored.
(deviceEvntsBrndsIncl_trainDF,deviceEvntsBrndsExcl_trainDF,inclTrainFlag,exclTrainFlag)=segregateFeaturesDataFrame(
    deviceEvntsBrnds_trainDF,'num_of_evnts',0,True)
(deviceEvntsBrndsIncl_testDF,deviceEvntsBrndsExcl_testDF,inclTestFlag,exclTestFlag)=segregateFeaturesDataFrame(
    deviceEvntsBrnds_testDF,'num_of_evnts',0,True)
# Targets for the "Incl" training rows. The mask comes from
# deviceEvntsBrnds_trainDF but is applied to device_brand_model_trainDF.
# NOTE(review): this assumes both frames share the same row index — confirm.
device_brand_model_incl_trainTarget=getZeroIndexedTargetSeries(device_brand_model_trainDF,inclTrainFlag,'group')
#device_brand_model_incl_testTarget=getZeroIndexedTargetSeries(device_brand_model_testDF,inclTestFlag,'group')

In [10]:
def _uniqueInOrder(names):
    """Return ``names`` without duplicates, keeping first-seen order."""
    seen=set()
    ordered=[]
    for name in names:
        if name not in seen:
            seen.add(name)
            ordered.append(name)
    return ordered

def processModelFeatures(contFeats,catFeats,modelDF,categorical_features_mask):
    """Build the model matrix: scaled continuous columns plus one-hot categoricals.

    Parameters
    ----------
    contFeats : names of continuous columns; each is min-max scaled.
    catFeats : names of categorical columns; each is label- then one-hot encoded.
    modelDF : source DataFrame; it is not modified.
    categorical_features_mask : boolean mask over the selected columns
        (continuous columns first, then categorical ones) marking which
        columns the one-hot encoder expands.

    Returns
    -------
    The array returned by ``OneHotEncodeFeats``.
    """
    # De-duplicate while preserving order: list(set(...)) reorders the columns
    # arbitrarily between runs, which makes the output layout irreproducible.
    contFeats=_uniqueInOrder(contFeats)
    catFeats=_uniqueInOrder(catFeats)
    modelFeatures=contFeats+catFeats
    # Work on an explicit copy: scaling/encoding assign columns in place, and
    # doing that on a slice of modelDF triggers SettingWithCopyWarning and may
    # or may not write through to the caller's frame.
    model_subsetDF=modelDF[modelFeatures].copy()
    for contFeat in contFeats:
        scaleFeature(model_subsetDF,contFeat)
    processedModelFeatures=OneHotEncodeFeats(model_subsetDF,catFeats,categorical_features_mask)
    return processedModelFeatures
# Categorical columns; the mask marks the last two of the seven selected
# columns (five event features, then these two) as categorical.
catFeats=['phone_brand','device_model']
categorical_features_mask=[False,False,False,False,False,True,True]
# NOTE(review): train and test are encoded by separate calls, so each fits its
# own encoders — the resulting matrices need not have matching columns.
modelFeatsIncl_trainDF=processModelFeatures(eventBasedFeats,catFeats,deviceEvntsBrndsIncl_trainDF,categorical_features_mask)
modelFeatsIncl_testDF=processModelFeatures(eventBasedFeats,catFeats,deviceEvntsBrndsIncl_testDF,categorical_features_mask)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
# The train and test matrices end up with different numbers of columns: the
# test set contains brand/model values that were not captured during training.
# Idea: fix the set of device models to those seen in training; for a device
# whose model is not in that set, fall back to a model built only from the
# phone-level information (original note: "only use Model based out of Phone Model").
print(modelFeatsIncl_trainDF.shape,modelFeatsIncl_testDF.shape)

(23309, 1020) (35194, 1157)


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# Count unigrams and bigrams over the "<brand> <model>" text of each device.
vectorizer = CountVectorizer(ngram_range=(1,2),min_df=0.0)
# Adds a 'brand_model' column to both brand frames (mutates them in place).
device_brand_model_trainDF['brand_model'] = device_brand_model_trainDF['phone_brand'] + ' ' + device_brand_model_trainDF['device_model']
device_brand_model_testDF['brand_model'] = device_brand_model_testDF['phone_brand'] + ' ' + device_brand_model_testDF['device_model']
# The vocabulary is fitted on train only; test is transformed with that same vocabulary.
vect_matrix = vectorizer.fit_transform(device_brand_model_trainDF['brand_model'])
test_vect_matrix = vectorizer.transform(device_brand_model_testDF['brand_model'])

In [13]:
def validateModel(X,y, model):
    """Run 5-fold cross-validation and print each fold's log loss.

    Parameters
    ----------
    X : feature matrix; a pandas DataFrame or an array-like supporting
        ``X[rows, :]`` indexing.
    y : targets aligned row-for-row with ``X`` (pandas Series/DataFrame or array).
    model : object whose ``fit(X, y)`` returns a fitted estimator exposing
        ``predict_proba``.

    For every fold the shape of the predicted probabilities and the log loss
    on the held-out rows are printed. Nothing is returned.
    """
    kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=0)
    xIsFrame = isinstance(X, pd.DataFrame)
    yIsPandas = isinstance(y, (pd.Series, pd.DataFrame))
    for itrain, itest in kf:
        # KFold yields row positions, so pandas objects must be indexed with
        # .iloc; .ix / plain [] are label-based on integer indexes and pick the
        # wrong rows (or fail) unless the index happens to be 0..n-1.
        if xIsFrame:
            train=X.iloc[itrain]
            test=X.iloc[itest]
        else:
            train = X[itrain,:]
            test = X[itest,:]
        if yIsPandas:
            ytrain, ytest = y.iloc[itrain], y.iloc[itest]
        else:
            ytrain, ytest = y[itrain], y[itest]
        clf = model.fit(train,ytrain)
        ypred = clf.predict_proba(test)
        print(ypred.shape)
        print(log_loss(ytest, ypred))
        
def getModelOutput(X,y,X2, model):
    """Fit ``model`` on the first KFold training split and predict ``X2``.

    Parameters
    ----------
    X : training features; a pandas DataFrame or an array-like supporting
        ``X[rows, :]`` indexing.
    y : targets aligned row-for-row with ``X`` (pandas Series/DataFrame or array).
    X2 : features to predict on.
    model : object whose ``fit(X, y)`` returns a fitted estimator exposing
        ``predict_proba``.

    Returns
    -------
    ``predict_proba(X2)`` from the estimator fitted on the training rows of the
    first fold (4/5 of ``X``). The function returns inside the loop, so the
    remaining folds are never used.
    NOTE(review): if averaging over all folds was intended, this needs changing.
    """
    kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=0)
    xIsFrame = isinstance(X, pd.DataFrame)
    yIsPandas = isinstance(y, (pd.Series, pd.DataFrame))
    for itrain, _ in kf:
        # KFold yields row positions, so pandas objects must be indexed with
        # .iloc; .ix / plain [] are label-based on integer indexes.
        train = X.iloc[itrain] if xIsFrame else X[itrain,:]
        ytrain = y.iloc[itrain] if yIsPandas else y[itrain]
        clf = model.fit(train,ytrain)
        ypred = clf.predict_proba(X2)
        return ypred

In [14]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm  import LinearSVC
#class CalibModel(object):
class CalibModel(object):
    """Wrap a classifier in ``CalibratedClassifierCV`` (cv=2, sigmoid method).

    The calibrated estimator is stored on ``self.clf``; ``fit`` returns the
    wrapper itself so calls can be chained with ``predict``/``predict_proba``.
    """

    def __init__(self,clf):
        """Announce the received classifier and build the calibrated wrapper."""
        print("Obtained Classifier Instance ",clf)
        calibrated = CalibratedClassifierCV(clf, cv=2, method='sigmoid')
        self.clf = calibrated

    def fit(self, X, y):
        """Fit the calibrated classifier on (X, y) and return this wrapper."""
        self.clf.fit(X,y)
        return self

    def predict(self, X):
        """Return the calibrated classifier's predictions for X."""
        predictions = self.clf.predict(X)
        return predictions

    def predict_proba(self, X):
        """Return the calibrated classifier's probability estimates for X."""
        probabilities = self.clf.predict_proba(X)
        return probabilities

In [19]:
# Experiment log: only the RandomForest run below is active; the commented
# calls are earlier/alternative experiments kept for reference (some refer to
# names that are not defined in the cells shown here).
#validateModel(vect_matrix, device_brand_model_trainDF['group'], CalibModel(MultinomialNB()))
#validateModel(vect_matrix, phone_brand_master['group'], CalibModel(LinearSVC()))
#validateModel(modelFeats_trainDF, device_brand_model_trainDF['group'], CalibModel(GaussianNB()))
from sklearn.ensemble import RandomForestClassifier
# Cross-validate a calibrated random forest on the devices that have event data.
validateModel(modelFeatsIncl_trainDF, device_brand_model_incl_trainTarget, 
              CalibModel(RandomForestClassifier(min_samples_split=1600,n_jobs=6,criterion='entropy')))
#validateModel(eventsData_train_final,eventsData_train.group,CalibModel(GaussianNB()))
#from sklearn.ensemble import RandomForestClassifier
#validateModel(modelFeats_trainDF, device_brand_model_trainDF['group'], CalibModel(RandomForestClassifier()))
#validateModel(modelFeats_trainDF, device_brand_model_trainDF['group'], CalibModel(DecisionTreeClassifier()))
#from sklearn.ensemble import AdaBoostClassifier
#validateModel(modelFeats_trainDF, device_brand_model_trainDF['group'], CalibModel(AdaBoostClassifier()))

Obtained Classifier Instance  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=1600,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=6,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
(4662, 12)
2.37830472243
(4662, 12)
2.38019372487
(4662, 12)
2.3716000584
(4662, 12)
2.36886065791
(4661, 12)
2.37154242039


In [22]:
from sklearn.linear_model import LogisticRegression
# Cross-validate a calibrated logistic regression with C=0.2 on the same
# features and targets as the random-forest run.
validateModel(modelFeatsIncl_trainDF, device_brand_model_incl_trainTarget, 
              CalibModel(LogisticRegression(C=.2)))

Obtained Classifier Instance  LogisticRegression(C=0.2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
(4662, 12)
2.37281693665
(4662, 12)
2.37542059291
(4662, 12)
2.36903857821
(4662, 12)
2.3689221964
(4661, 12)
2.3697267725


In [23]:
from sklearn.linear_model import LogisticRegression
# Same experiment as the C=0.2 run, with a smaller C (0.1).
validateModel(modelFeatsIncl_trainDF, device_brand_model_incl_trainTarget, 
              CalibModel(LogisticRegression(C=.1)))

Obtained Classifier Instance  LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
(4662, 12)
2.37354226546
(4662, 12)
2.3760013225
(4662, 12)
2.36916845846
(4662, 12)
2.36911745916
(4661, 12)
2.36952981101


In [29]:
from sklearn.ensemble import AdaBoostClassifier
# Cross-validate calibrated AdaBoost over decision trees that only split nodes
# holding at least 1600 samples.
validateModel(modelFeatsIncl_trainDF, device_brand_model_incl_trainTarget, 
              CalibModel(AdaBoostClassifier(base_estimator=DecisionTreeClassifier(min_samples_split=1600))))

Obtained Classifier Instance  AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=1600, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=None)
(4662, 12)
2.40105301908
(4662, 12)
2.40750961304
(4662, 12)
2.39790409408
(4662, 12)
2.40107712408
(4661, 12)
2.39904907009


In [24]:
#Deep Learning : Keras Modelling
# create model
model = Sequential()
model.add(Dense(1600, input_dim=1563, init='uniform', activation='relu'))
model.add(Dense(1563, init='uniform', activation='relu'))
model.add(Dense(1, init='uniform', activation='sigmoid'))

In [25]:
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [29]:
# Encode the 'group' labels of every training device as integer class ids.
targetLabelEncoder=LabelEncoder()
encodedTarget_train=targetLabelEncoder.fit_transform(device_brand_model_trainDF['group'])
# Display the encoded array as the cell output.
encodedTarget_train

array([10, 10, 10, ...,  6, 10,  7])

In [None]:
# Fit the model
# NOTE(review): modelFeats_trainDF is not defined in any cell shown here; it
# presumably comes from an earlier kernel state — confirm before re-running.
# The targets are the integer-encoded group labels.
model.fit(modelFeats_trainDF, encodedTarget_train, nb_epoch=150, batch_size=10)

Epoch 1/150
 8290/74645 [==>...........................] - ETA: 640s - loss: -86.3842 - acc: 0.0560

In [23]:
# Check the feature matrix dimensions; the column count is what the network's
# input_dim has to match.
modelFeats_trainDF.shape

(74645, 1563)