In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import KFold
from sklearn.metrics import log_loss

In [2]:
# Raw input tables: one DataFrame per CSV file under ../Data.
gender_age_train = pd.read_csv('../Data/gender_age_train.csv')
gender_age_test = pd.read_csv('../Data/gender_age_test.csv')
events = pd.read_csv('../Data/events.csv')
app_events = pd.read_csv('../Data/app_events.csv')
app_labels = pd.read_csv('../Data/app_labels.csv')
label_categories = pd.read_csv('../Data/label_categories.csv')

# The device table is read as UTF-8 and reduced to the first row seen for
# each device_id, so every device_id appears at most once.
phone_brand_device_model = (
    pd.read_csv('../Data/phone_brand_device_model.csv', encoding='utf-8')
    .drop_duplicates('device_id', keep='first')
)

In [3]:
#Does brand name conversion matter?
#It seems not, other than for EDA.
#https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22270/do-brand-and-phone-names-really-matter/
#Prior work: https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22186/chinese-brands
#Planning to try the Google Translation API for the conversion [it comes free with a Google Cloud signup].
#Low priority as of now.

In [4]:
# Attach phone brand and device model to every training device. The left join
# keeps all rows of gender_age_train; devices with no match get NaN.
phone_brand_master = gender_age_train.merge(
    phone_brand_device_model[['device_id', 'phone_brand', 'device_model']],
    how='left', on='device_id')
# One text field "<brand> <model>" per device.
#Above brand and model can be considered separately[Uniq Brands:131,Uniq Models:1599]
# The filled result is assigned back to the column rather than calling
# fillna(inplace=True) on the attribute-accessed column: that chained in-place
# form only modifies a temporary under pandas Copy-on-Write and would leave
# NaN entries in the frame.
phone_brand_master['brand_model'] = (
    phone_brand_master['phone_brand'] + ' ' + phone_brand_master['device_model']
).fillna('')

In [5]:
# Attach phone brand and device model to every test device. The left join
# keeps all rows of gender_age_test; devices with no match get NaN.
phone_brand_master_test = gender_age_test.merge(
    phone_brand_device_model[['device_id', 'phone_brand', 'device_model']],
    how='left', on='device_id')
# One text field "<brand> <model>" per device. The filled result is assigned
# back to the column rather than calling fillna(inplace=True) on the
# attribute-accessed column: that chained in-place form only modifies a
# temporary under pandas Copy-on-Write and would leave NaN entries in the frame.
phone_brand_master_test['brand_model'] = (
    phone_brand_master_test['phone_brand'] + ' ' + phone_brand_master_test['device_model']
).fillna('')
#Till now, only 2 features have been used (Phone Brand, Device Model)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over the "brand model" strings using unigrams and bigrams.
# min_df=0.0 is a document-frequency proportion of zero, i.e. no term is
# discarded for being rare.
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(1, 2))
# The vocabulary is learned from the training devices only and then reused
# unchanged to encode the test devices.
vect_matrix = vectorizer.fit_transform(phone_brand_master.brand_model)
test_vect_matrix = vectorizer.transform(phone_brand_master_test.brand_model)

In [7]:
def validateModel(X,y, model):
    """Print the held-out log loss of `model` on each of 5 shuffled folds.

    Parameters
    ----------
    X : matrix supporting ``X[rows, :]`` indexing and ``X.shape[0]``.
    y : array-like of labels, one per row of X (a pandas Series is accepted).
    model : object whose ``fit(X, y)`` returns an estimator exposing
        ``predict_proba``; it is re-fitted on every fold.

    Returns
    -------
    None. For each fold, the shape of the predicted probability matrix and
    the fold's log loss are printed.
    """
    # Fold indices are positional, so the labels must be indexed positionally
    # too. Indexing a pandas Series with them is label-based and would select
    # the wrong rows (or raise) whenever the Series index is not 0..n-1.
    y = np.asarray(y)
    # Legacy sklearn.cross_validation.KFold: constructed from the row count and
    # iterated directly. The fixed random_state keeps the folds reproducible.
    kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=0)
    for itrain, itest in kf:
        clf = model.fit(X[itrain, :], y[itrain])
        ypred = clf.predict_proba(X[itest, :])
        print(ypred.shape)
        print(log_loss(y[itest], ypred))
        
def getModelOutput(X,y,X2, model):
    """Predict class probabilities for X2, averaged over 5 fold-trained models.

    The original body returned from inside the fold loop, so only the model
    fitted on the first fold's training split was ever used. Here `model` is
    fitted once per fold and the per-fold predictions for X2 are averaged.

    Parameters
    ----------
    X : training matrix supporting ``X[rows, :]`` indexing and ``X.shape[0]``.
    y : array-like of labels, one per row of X (a pandas Series is accepted).
    X2 : matrix to predict on; passed unchanged to ``predict_proba``.
    model : object whose ``fit(X, y)`` returns an estimator exposing
        ``predict_proba``; it is re-fitted on every fold.

    Returns
    -------
    numpy.ndarray with the same shape as a single ``predict_proba(X2)``
    result: the element-wise mean of the per-fold probability matrices.
    """
    # Fold indices are positional; convert so that a pandas Series is not
    # indexed by label with them.
    y = np.asarray(y)
    # Same legacy KFold configuration as validateModel (5 shuffled folds,
    # fixed seed), so the fold models match the validated ones.
    kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=0)
    fold_preds = []
    for itrain, _ in kf:
        clf = model.fit(X[itrain, :], y[itrain])
        fold_preds.append(clf.predict_proba(X2))
    # Every fold must yield the same set of classes for the matrices to be
    # stackable; np.mean raises on a shape mismatch rather than misaligning.
    return np.mean(fold_preds, axis=0)
        

In [8]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB

class CalibModel(object):
    """Classifier wrapped in CalibratedClassifierCV, with a minimal estimator API.

    With no arguments this builds a MultinomialNB calibrated with
    ``cv=2, method='sigmoid'``, exactly as before. The wrapped calibrated
    classifier is available as ``self.clf``.

    Parameters
    ----------
    base_estimator : classifier to calibrate; a fresh ``MultinomialNB()`` is
        used when None.
    cv : forwarded to ``CalibratedClassifierCV`` as ``cv``.
    method : forwarded to ``CalibratedClassifierCV`` as ``method``.
    """
    def __init__(self, base_estimator=None, cv=2, method='sigmoid'):
        if base_estimator is None:
            base_estimator = MultinomialNB()
        # The estimator is passed positionally, as in the original code.
        self.clf = CalibratedClassifierCV(base_estimator, cv=cv, method=method)

    def fit(self, X, y):
        """Fit the calibrated classifier on (X, y) and return this wrapper."""
        self.clf.fit(X,y)
        return self

    def predict(self, X):
        """Return ``self.clf.predict(X)``."""
        return self.clf.predict(X)

    def predict_proba(self, X):
        """Return ``self.clf.predict_proba(X)``."""
        return self.clf.predict_proba(X)

In [9]:
# Per-fold log loss of the calibrated model on the brand/model n-gram counts.
validateModel(X=vect_matrix, y=phone_brand_master.group, model=CalibModel())

(14929, 12)
2.40430254546
(14929, 12)
2.40689362128
(14929, 12)
2.3995671297
(14929, 12)
2.40416462513
(14929, 12)
2.4056362231


In [10]:
# Integer-encode the target groups; targetencoder.classes_[i] is the group
# name for code i.
targetencoder = LabelEncoder().fit(phone_brand_master.group)
y = targetencoder.transform(phone_brand_master.group)
# Train on the encoded labels (rather than the raw group strings) so that
# column i of the predicted probabilities is class code i, which makes the
# pairing with targetencoder.classes_ explicit instead of relying on the model
# and the encoder happening to sort the group names the same way.
pred = pd.DataFrame(
    getModelOutput(vect_matrix, y, test_vect_matrix, CalibModel()),
    index=phone_brand_master_test.device_id,
    columns=targetencoder.classes_)
# One row per test device_id, one probability column per group.
pred.to_csv('submit_1.csv', index=True)