In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import KFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import  OneHotEncoder

In [2]:
app_events = pd.read_csv('../Data/app_events.csv')
app_labels = pd.read_csv('../Data/app_labels.csv')
events = pd.read_csv('../Data/events.csv')
gender_age_train = pd.read_csv('../Data/gender_age_train.csv')
gender_age_test = pd.read_csv('../Data/gender_age_test.csv')
label_categories = pd.read_csv('../Data/label_categories.csv')
phone_brand_device_model = pd.read_csv('../Data/phone_brand_device_model.csv',encoding='utf-8')
phone_brand_device_model = phone_brand_device_model.drop_duplicates('device_id',keep='first')

In [3]:
#Categorical Features:Device Brand(Cardinality - 131) and Device Model (Cardinality - 1599)
phone_brand_master = gender_age_train.merge(phone_brand_device_model[['device_id','phone_brand','device_model']], how='left',on='device_id')
phone_brand_master['brand_model'] = phone_brand_master['phone_brand'] + ' ' + phone_brand_master['device_model']
phone_brand_master.brand_model.fillna('',inplace=True)
phone_brand_master_test = gender_age_test.merge(phone_brand_device_model[['device_id','phone_brand','device_model']], how='left',on='device_id')
phone_brand_master_test['brand_model'] = phone_brand_master_test['phone_brand'] + ' ' + phone_brand_master_test['device_model']
phone_brand_master_test.brand_model.fillna('',inplace=True)

In [16]:
from sklearn.preprocessing import StandardScaler
from scipy import stats
def imputeDFColsUsingMedian(dataFrame,cols):
    for col in cols:
        medianOfCol=np.nanmedian(dataFrame[col])
        dataFrame[col].fillna(medianOfCol,inplace=True)
def imputeDFColsUsingMean(dataFrame,cols):
    for col in cols:
        meanOfCol=np.nanmean(dataFrame[col])
        dataFrame[col].fillna(meanOfCol,inplace=True)
def scaleFeature(dataFrame,col):
    maxVal=np.max(dataFrame[col])
    minVal=np.min(dataFrame[col])
    scaledDenom=maxVal-minVal
    dataFrame[col]=(dataFrame[col]-minVal)/scaledDenom
def labelEncodeFeats(dataFrame,listOfFeats):
    for feat in listOfFeats:
        #print("Encoding On ",feat," Shape ",dataFrame[feat].shape)
        labelEncoder=LabelEncoder()
        encodedFeatValues=labelEncoder.fit_transform(dataFrame[feat])
        dataFrame[feat]=encodedFeatValues
def OneHotEncodeFeats(dataFrame,listOfFeats,ctgrcl_ftrs_msk):
    #print("List of Feats to Encode ",listOfFeats)
    labelEncodeFeats(dataFrame,listOfFeats)
    #print("Label Encoding Done")
    oneHotEncoder=OneHotEncoder(categorical_features=ctgrcl_ftrs_msk,sparse=False)
    #print("Got Object of OneHot")
    print("Total Cols ",dataFrame.columns)
    oneHotEncodedFeats=oneHotEncoder.fit_transform(dataFrame)
    print("Shape ",oneHotEncodedFeats.shape)
    return oneHotEncodedFeats

In [None]:
events.timestamp=events.timestamp.map(lambda x:pd.Timestamp(x).value)
deviceEvents=events.groupby('device_id')

In [8]:
def avgEventDuration(listOfTimeStamps):
    if(len(listOfTimeStamps)<1):
        return 0
    return (np.max(listOfTimeStamps)-np.min(listOfTimeStamps))/len(listOfTimeStamps)
def stdEventDuration(listOfTimeStamps):
    if(len(set(listOfTimeStamps))<=1):
        return 0
    return np.std(listOfTimeStamps)
def avglongChangeFreq(listOfLongs):
    if(len(listOfLongs)<1):
        return 0
    return len(set(listOfLongs))/float(len(listOfLongs))
def avgSqrdlongChangeAmt(listOfLongs):
    if(len(listOfLongs)<1):
        return 0
    return np.sum(np.diff(listOfLongs)**2)/float(len(listOfLongs))
eventBasedAggregates=deviceEvents.aggregate({'timestamp':[np.count_nonzero,avgEventDuration,stdEventDuration],
                                            'longitude':[avglongChangeFreq,avgSqrdlongChangeAmt]})
eventBasedAggregatesFeats=['num_of_evnts','avg_evnt_drtn','std_evnt_drtn','avgLongtdChgFrq','avgSqrdLongtdChgAmt']
modelFeatures=eventBasedAggregatesFeats.copy()
eventBasedAggregates.columns=eventBasedAggregatesFeats
eventBasedAggregates['device_id']=eventBasedAggregates.index

In [9]:
eventBasedAggregates[0:3]

Unnamed: 0_level_0,num_of_evnts,avg_evnt_drtn,std_evnt_drtn,avgLongtdChgFrq,avgSqrdLongtdChgAmt,device_id
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
-9222956879900151005,0.030769,3156.504025,65,1758338461538,37541752542466,-9222956879900151005
-9222661944218806987,0.125,0.0,8,72986000000000,193413022978676,-9222661944218806987
-9222399302879214035,0.1,0.0,10,37490800000000,101030283478568,-9222399302879214035


In [10]:
eventsData_train =gender_age_train.merge(eventBasedAggregates, how='left',on='device_id')
imputeDFColsUsingMean(eventsData_train,eventBasedAggregatesFeats)
eventsData_train =eventsData_train.merge(phone_brand_master[['phone_brand','device_id']], how='left',on='device_id')
eventsData_train =eventsData_train.merge(phone_brand_master[['device_model','device_id']], how='left',on='device_id')
eventsData_train[0:3]

Unnamed: 0,device_id,gender,age,group,num_of_evnts,avg_evnt_drtn,std_evnt_drtn,avgLongtdChgFrq,avgSqrdLongtdChgAmt,phone_brand,device_model
0,-8076087639492063270,M,35,M32-38,0.236296,1081.402878,52.151315,24124370000000.0,111669700000000.0,小米,MI 2
1,-2897161552818060146,M,35,M32-38,0.236296,1081.402878,52.151315,24124370000000.0,111669700000000.0,小米,MI 2
2,-8260683887967679142,M,35,M32-38,1.0,0.0,1.0,0.0,0.0,小米,MI 2


In [17]:
modelFeatures.extend(['phone_brand','device_model'])
modelFeatures=list(set(modelFeatures))
eventsData_train_final=eventsData_train[modelFeatures]
eventBasedAggregatesFeats=list(set(eventBasedAggregatesFeats))
for feat in eventBasedAggregatesFeats:
    scaleFeature(eventsData_train_final,feat)
categorical_features_mask=[False,False,False,False,False,True,True]
featured_model_train=OneHotEncodeFeats(eventsData_train_final,['phone_brand','device_model'],categorical_features_mask)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,avgSqrdLongtdChgAmt,std_evnt_drtn,avgLongtdChgFrq,avg_evnt_drtn,num_of_evnts,device_model,phone_brand
0,0.37801,0.012329,0.081663,0.028765,0.235807,MI 2,小米
1,0.37801,0.012329,0.081663,0.028765,0.235807,MI 2,小米
2,0.0,0.0,0.0,0.0,1.0,MI 2,小米


In [22]:
eventsData_test =gender_age_test.merge(eventBasedAggregates, how='left',on='device_id')
imputeDFColsUsingMean(eventsData_test,eventBasedAggregatesFeats)
eventsData_test =eventsData_test.merge(phone_brand_master[['phone_brand','device_id']], how='left',on='device_id')
eventsData_test =eventsData_test.merge(phone_brand_master[['device_model','device_id']], how='left',on='device_id')
eventsData_test_final=eventsData_test[modelFeatures]
for feat in eventBasedAggregatesFeats:
    scaleFeature(eventsData_test_final,feat)
featured_model_test=OneHotEncodeFeats(eventsData_test_final,['phone_brand','device_model'],categorical_features_mask)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Total Cols  Index(['avgSqrdLongtdChgAmt', 'std_evnt_drtn', 'avgLongtdChgFrq',
       'avg_evnt_drtn', 'num_of_evnts', 'device_model', 'phone_brand'],
      dtype='object')
Shape  (112071, 7)


In [24]:
featured_model_test.shape,featured_model_train.shape

((112071, 7), (74645, 1563))

(74645, 1563) (74645, 7)
[ 0.          0.          0.          0.          0.          0.          0.
  0.          0.01232859  0.08166277  0.37801029  0.23580696  0.02876541]


Unnamed: 0,num_of_evnts,avg_evnt_drtn,std_evnt_drtn,avgLongtdChgFrq,avgSqrdLongtdChgAmt,phone_brand,device_model
0,0.012329,0.081663,0.37801,0.235807,0.028765,47,677


In [14]:
help(enCodeFeats)

Help on coo_matrix in module scipy.sparse.coo object:

class coo_matrix(scipy.sparse.data._data_matrix, scipy.sparse.data._minmax_mixin)
 |  A sparse matrix in COOrdinate format.
 |  
 |  Also known as the 'ijv' or 'triplet' format.
 |  
 |  This can be instantiated in several ways:
 |      coo_matrix(D)
 |          with a dense matrix D
 |  
 |      coo_matrix(S)
 |          with another sparse matrix S (equivalent to S.tocoo())
 |  
 |      coo_matrix((M, N), [dtype])
 |          to construct an empty matrix with shape (M, N)
 |          dtype is optional, defaulting to dtype='d'.
 |  
 |      coo_matrix((data, (i, j)), [shape=(M, N)])
 |          to construct from three arrays:
 |              1. data[:]   the entries of the matrix, in any order
 |              2. i[:]      the row indices of the matrix entries
 |              3. j[:]      the column indices of the matrix entries
 |  
 |          Where ``A[i[k], j[k]] = data[k]``.  When shape is not
 |          specified, it is infe

In [29]:
eventsData_test =gender_age_test.merge(eventBasedAggregates, how='left',on='device_id')
imputeDFColsUsingMean(eventsData_test,eventBasedAggregatesFeats)
eventsData_test_final=eventsData_test[eventBasedAggregatesFeats]
eventsData_test_final=scaleFeatures(eventsData_test_final,eventBasedAggregatesFeats)

In [35]:
eventsData_train =eventsData_train.merge(phone_brand_master[['phone_brand']], how='left',on='device_id')
#eventsData_train =eventsData_train.merge(phone_brand_master[['device_model']], how='left',on='device_id')
#phone_brand_master[['device_model']]

KeyError: 'device_id'

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1,2),min_df=0.0)
vect_matrix = vectorizer.fit_transform(phone_brand_master['brand_model'])
test_vect_matrix = vectorizer.transform(phone_brand_master_test['brand_model'])

In [13]:
def validateModel(X,y, model):
    kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=0)
    for itrain, itest in kf:
        if type(X)==type(pd.DataFrame()):
            train=X.ix[itrain]
            test=X.ix[itest]
        else:
            train = X[itrain,:]
            test = X[itest,:]
        ytrain, ytest = y[itrain], y[itest]
        clf = model.fit(train,ytrain)
        ypred = clf.predict_proba(test)
        print(ypred.shape)
        print(log_loss(ytest, ypred))
        
def getModelOutput(X,y,X2, model):
    kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=0)
    for itrain, itest in kf:
        if type(X)==type(pd.DataFrame()):
            train=X.ix[itrain]
            test=X.ix[itest]
        else:
            train = X[itrain,:]
            test = X[itest,:]
        ytrain, ytest = y[itrain], y[itest]
        clf = model.fit(train,ytrain)
        ypred = clf.predict_proba(X2)
        return ypred

In [14]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm  import LinearSVC
#class CalibModel(object):
class CalibModel(object):
    def __init__(self,clf):
        #clf = MultinomialNB()
        print("Obtained Classifier Instance ",clf)
        self.clf = CalibratedClassifierCV(clf, cv=2, method='sigmoid')
    
    def fit(self, X, y):
        self.clf.fit(X,y)
        return self
    
    def predict(self, X):
        return self.clf.predict(X)
    
    def predict_proba(self, X):
        return self.clf.predict_proba(X)

In [15]:
validateModel(vect_matrix, phone_brand_master['group'], CalibModel(MultinomialNB()))
#validateModel(vect_matrix, phone_brand_master['group'], CalibModel(LinearSVC()))
validateModel(eventsData_train_final,eventsData_train.group,CalibModel(GaussianNB()))

Obtained Classifier Instance  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
(14929, 12)
2.40430254546
(14929, 12)
2.40689362128
(14929, 12)
2.3995671297
(14929, 12)
2.40416462513
(14929, 12)
2.4056362231
Obtained Classifier Instance  GaussianNB()
(14929, 12)
2.42672411397
(14929, 12)
2.43003467875
(14929, 12)
2.42232043542
(14929, 12)
2.42918764186
(14929, 12)
2.42587252809


In [35]:
validateModel(vect_matrix, phone_brand_master['group'], CalibModel(Dec))

In [10]:
targetencoder = LabelEncoder().fit(phone_brand_master.group)
y = targetencoder.transform(phone_brand_master.group)
pred = pd.DataFrame(getModelOutput(vect_matrix, phone_brand_master['group'], test_vect_matrix, CalibModel()), index = phone_brand_master_test.device_id, columns=targetencoder.classes_)
pred.to_csv('submit_1.csv',index=True)