In [110]:
%matplotlib inline

import numpy as np
from pandas import read_csv
import pandas as pd
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plot
import seaborn

In [2]:
path = "../data/uid_domains_full.csv"
data = read_csv(path, delimiter='\t')
data['domains'] = [','.join(json.loads(i)) for i in data['domains']]

In [3]:
data.head()

Unnamed: 0,uid,gender,age,domains
0,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,F,18-24,"news.yandex.ru,sotovik.ru,zebra-zoya.ru"
1,d502331d-621e-4721-ada2-5d30b2c3801f,M,25-34,"rsdn.ru,citieslist.ru,rutv.ru,interfax.ru,vand..."
2,d50237ea-747e-48a2-ba46-d08e71dddfdb,F,25-34,"ru.oriflame.com,adme.ru,cdn.etgdta.com,povar.r..."
3,d502f29f-d57a-46bf-8703-1cb5f8dcdf03,F,25-34,"1obl.ru,nadietah.ru,translate-tattoo.ru"
4,d503c3b2-a0c2-4f47-bb27-065058c73008,M,>=55,"fedpress.ru,eleks.ru,prommetizcomplect.ru,ramb..."


In [65]:
test_data = data[(data['gender'] == '-') & (data['age'] == '-')]
train_data = data[~((data['gender'] == '-') & (data['age'] == '-'))]
train_data = train_data.sort_values(by=['age'])
train_data['age'] = pd.factorize(train_data.age)[0]
train_data['gender'] = pd.factorize(train_data.gender)[0]

In [66]:
train_data.head(2)

Unnamed: 0,uid,gender,age,domains
0,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,0,0,"news.yandex.ru,sotovik.ru,zebra-zoya.ru"
4956,0a00e0d0-cf5f-458c-855c-9fbae366c063,0,0,"jv.ru,photogoroda.com,shougolos.ru,gsconto.com..."


In [67]:
text_all = train_data['domains'].values.tolist()

In [68]:
count_vec = CountVectorizer(tokenizer=lambda x: [i for i in x.split(',') if '.ru' in i], 
                            max_features=10000) 

In [69]:
matrix_count = count_vec.fit_transform(text_all)
matrix = matrix_count.toarray()

In [70]:
words = [x[0] for x in sorted(count_vec.vocabulary_.items(), key=lambda x: x[1])]

In [71]:
matrix_df = pd.DataFrame(matrix, columns=words)

In [72]:
matrix_df['uid'] = data['uid']
matrix_df = pd.merge(matrix_df, train_data, on='uid').drop(['domains'], axis=1)
# matrix_df = pd.get_dummies(matrix_df, columns=['gender'])

In [73]:
matrix_df.head(2)

Unnamed: 0,000a.ru,008.ru,03-ektb.ru,09irk.ru,0lik.ru,0ve.ru,0zd.ru,1-dream.ru,1-pp.ru,10-casino.ru,...,zvezdi.mirtesen.ru,zvezdi.ru,zvukobook.ru,zvukomaniya.ru,zxcc.ru,zzap.ru,zzombi.ru,uid,gender,age
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,d502331d-621e-4721-ada2-5d30b2c3801f,1,1


In [17]:
# matrix_df = matrix_df.drop(['gender_F'], axis=1)

In [18]:
# matrix_df.to_csv('../data/matrix_df.csv', sep='\t')

In [19]:
# test_matrix = matrix_df[matrix_df['uid'].isin(test_data['uid'].values.tolist())]

In [20]:
# train_matrix = matrix_df[~(matrix_df['uid'].isin(test_data['uid'].values.tolist()))]

In [74]:
# Y_target = train_matrix['gender_M']
# Y_target = matrix_df['gender_M']
Y_target = matrix_df['age']

In [75]:
# X_target = train_matrix.drop(['uid', 'age_-', 'age_18-24', 'age_25-34', 'age_35-44', 'age_45-54', 'age_>=55'], axis=1)
# X_target = matrix_df.drop(['uid', 'age_18-24', 'age_25-34', 'age_35-44', 'age_45-54', 'age_>=55'], axis=1)
X_target = matrix_df.drop(['uid'], axis=1)


In [148]:
def cross_valid(matrix_counts, target, test_size=0.3, random_state=0):
    return train_test_split(matrix_counts, target, test_size=test_size, random_state=random_state)

In [114]:
def roc_curve(title, Y_test, predicted, label='?', pos_label=0):
    fpr, tpr, th = metrics.roc_curve(Y_test, predicted, pos_label=pos_label)
    plot.plot(fpr, tpr, label=label);
    plot.xlabel('False Positive'); 
    plot.ylabel('True Positive'); 
    plot.title(title)
    plot.legend(bbox_to_anchor=(1, 1), loc=2); 
    plot.xlim([0.0, 1.0]); plot.ylim([0.0, 1]);
    print 'Logistic Regression (', label, ') AUC = ', metrics.auc(fpr, tpr)

In [78]:
X_train, X_test, Y_train, Y_test = cross_valid(X_target, Y_target, random_state=5)

In [126]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
import numpy

In [80]:
model1 = RandomForestClassifier(n_estimators=100, n_jobs=23)
model1.fit(X_train, Y_train)
# model2 = MultinomialNB(alpha=1)
# model2.fit(X_train, Y_train)
# model3 = LogisticRegression(C=1.0, penalty='l2')
# model3.fit(X_train, Y_train)
# model4 = SVC()
# model4.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=23, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [81]:
predict1 = model1.predict_proba(X_test)
# predict2 = model2.predict_proba(X_test)
# predict3 = model3.predict_proba(X_test)
# predict4 = model4.predict_proba(X_test)

In [204]:
predict_ = pd.DataFrame([sum([i * int(j == m) for i, j in enumerate(p)])
                         for p, m in ((p, max(p)) 
                                      for p in predict1)],
                        columns=['predicted_age']
                       )
predict_.index = Y_test.index
predict_['Y_test'] = Y_test.values

In [205]:
predict_['is_correct'] = predict_['Y_test'] == predict_['predicted_age']

In [213]:
len(predict_[predict_['is_correct'] == True]), len(predict_)

(10782, 10842)

In [121]:
for i in range(5):
    roc_curve('log reg, ROC', Y_test, predict_[:, i], label='RandomForestClassifier %s' % i, pos_label=i)
# roc_curve('log reg, ROC', Y_test, predict2, label='MultinomialNB')
# roc_curve('log reg, ROC', Y_test, predict3, label='LogisticRegression')
# roc_curve('log reg, ROC', Y_test, predict4, label='SVC')

TypeError: list indices must be integers, not tuple