<h1>Example 1: User360 Demographic Prediction Hackathon</h1>

<p>

In [1]:
import os
import csv
import math

import pandas as pd
import numpy as np
import random
import scipy as sp
import re
import pickle


from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
import sklearn.cross_validation as cv
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

In [4]:
train_s = pd.read_csv("hack360_train_summary_v2.csv")
cols = ['anon_user_id','NormalizedQuery','OSProductName', 'DeviceType','UserCEFSegment','IPCountry','IPPostalCode', 'IPCity','IPState']
train_f = pd.read_csv("hack360_train_full.csv",usecols=cols)
train_f['IPPostalCode']=train_f['IPPostalCode'].apply(lambda x:str(x))
train_s.fillna(0, inplace=True)
train_f.fillna('', inplace=True)
for c in ['IPCountry','IPCity']:
    train_f[c]=train_f[c].apply(lambda x:re.sub("\s","",x))
fts=['NormalizedQuery','OSProductName','UserCEFSegment','DeviceType','IPCountry','IPPostalCode','IPCity','IPState']
train_f['NormalizedQuery']=train_f[fts].apply(lambda x:re.sub("\n","",' '.join(x)), axis=1)
train_f = train_f.groupby('anon_user_id').agg(lambda x:' '.join(x))
train = pd.merge(train_s, train_f, left_on="anon_user_id", right_index=True)
train.to_csv("Hack360_train.csv", index=False)

f =  open("Hack360_Qcols.pk", 'wb')
qcols = []
for c in train_s.columns:
    if re.match('sum_Query.*',c)!=None:
        qcols.append(c)
pickle.dump(qcols,f)
f.close()
Qr = train[qcols].values.astype(float)


nr = Normalizer()
Qr = nr.fit_transform(Qr)
f =  open("Hack360_Normalize.pk", 'wb')
pickle.dump(nr,f)
f.close()

km = KMeans(n_clusters=32)
train['ct'] = km.fit_predict(Qr)
f =  open("Hack360_Kmeans.pk", 'wb')
pickle.dump(km, f)
f.close()

enc = OneHotEncoder()
enc.fit(train[['ct']].values)
f = open("Hack360_Encoder.pk", 'wb')
pickle.dump(enc, f)
f.close()

cv1 = TfidfVectorizer(min_df=0.0001, binary=True)
X_tr = sp.sparse.hstack((enc.transform(train[['ct']].values), cv1.fit_transform(train.NormalizedQuery)), format='csr')
f = open("Hack360_TfidfVectorizer.pk", 'wb')
pickle.dump(cv1, f)
f.close()

Y_tr = train['target_Gender'].values
ch2 = SelectKBest(chi2, k=30000)
X_tr = ch2.fit_transform(X_tr, Y_tr)
f = open("Hack360_Ch2.pk", 'wb')
pickle.dump(ch2, f)
f.close()

clf_y = LinearSVC(C=0.3,random_state=0, class_weight='auto')
clf_y.fit(X_tr, Y_tr)
f = open("Hack360_LibSVM.pk", 'wb')
pickle.dump(clf_y, f)
f.close()


In [5]:
test_s = pd.read_csv("hack360_test_summary_v2.csv")
cols = ['anon_user_id','NormalizedQuery','OSProductName', 'DeviceType','UserCEFSegment','IPCountry','IPPostalCode', 'IPCity','IPState']
test_f = pd.read_csv("hack360_test_full.csv",usecols=cols)
test_f['IPPostalCode']=test_f['IPPostalCode'].apply(lambda x:str(x))
test_s.fillna(0, inplace=True)
test_f.fillna('', inplace=True)
for c in ['IPCountry','IPCity']:
    test_f[c]=test_f[c].apply(lambda x:re.sub("\s","",x))
fts=['NormalizedQuery','OSProductName','UserCEFSegment','DeviceType','IPCountry','IPPostalCode','IPCity','IPState']
test_f['NormalizedQuery']=test_f[fts].apply(lambda x:' '.join(x), axis=1)
test_f = test_f.groupby('anon_user_id').agg(lambda x:' '.join(x))
test = pd.merge(test_s, test_f, left_on="anon_user_id", right_index=True)
test.to_csv("Hack360_test.csv", index=False)

qcols=[]
for c in train_s.columns:
    if re.match('sum_Query.*',c)!=None:
        qcols.append(c)
Qt = test[qcols].values.astype(float)

f = open("Hack360_Normalize.pk", 'rb')
nr_t = pickle.load(f)
f.close()
Qt = nr_t.transform(Qt)

f = open("Hack360_Kmeans.pk", 'rb')
km_t = pickle.load(f)
f.close()
test['ct'] = km_t.predict(Qt)

f = open("Hack360_Encoder.pk", 'rb')
enc_t = pickle.load(f)
f.close()

f = open("Hack360_TfidfVectorizer.pk", 'rb')
cv1_t = pickle.load(f)
f.close()

X_te = sp.sparse.hstack((enc_t.transform(test[['ct']].values), cv1_t.transform(test.NormalizedQuery)), format='csr')

f = open("Hack360_Ch2.pk", 'rb')
ch2_t = pickle.load(f)
f.close()
X_te = ch2_t.transform(X_te)

f = open("Hack360_LibSVM.pk", 'rb')
clf1_y_t = pickle.load(f)
f.close()
y_p = clf1_y_t.predict(X_te)


In [8]:
ground_t = pd.read_csv("hack360_ground_truth.csv")
df = pd.DataFrame(test.anon_user_id)
df = pd.merge(df, ground_t, on="anon_user_id")
y_t = df[df['is_train']==0]['target_Gender']
print accuracy_score(y_t, y_p)

y_actu = pd.Series(y_t, name='Actual')
y_pred = pd.Series(y_p, name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred, rownames=['Actual'], colnames=['Predicted'])
df_conf_norm = df_confusion / df_confusion.sum(axis=1)
df_conf_norm

0.722192936215


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.65808,0.279962
1,0.275176,0.774688
