In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation
%matplotlib inline
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
import xgboost

from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline




### будем использовать в качестве метрики RMSE

In [2]:
def rmse(x, y):
    """Return the root-mean-square error between ``x`` and ``y``.

    Parameters
    ----------
    x, y : array-like or scalar
        Values to compare. Both are converted to float NumPy arrays first, so
        plain Python lists work and narrow integer dtypes cannot wrap around
        when the difference is squared. Normal NumPy broadcasting applies, so
        one side may be a scalar (e.g. a constant baseline prediction).

    Returns
    -------
    numpy.float64
        ``sqrt(mean((x - y) ** 2))``.
    """
    diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    return np.sqrt(np.mean(diff ** 2))

### загрузка данных url_domain_train. Пока работаем только с ними

In [None]:
# Read the URL-domain training table: tab-separated, no header row.
urls_train_df = pd.read_csv('data/url_domain_train.csv', sep='\t', header=None)

In [None]:
# Name the three positional columns, then preview the first rows.
urls_train_df.columns = ['id', 'url', 'count']
urls_train_df.head()

### нужно учесть count

In [None]:
# Turn every row's `url` string into a list of space-separated tokens and
# repeat that list `count` times (list * int), so `count` is reflected in the
# resulting token bag.
tmp = urls_train_df  # alias, not a copy: the new column is added to urls_train_df as well
tmp['url'] = tmp['url'].astype('str')
tmp['array_of_url'] = tmp['url'].apply(lambda url: url.split(' ')) * tmp['count']
tmp.head()

In [None]:
# Spot-check the expanded token list stored under index label 1.
tmp.array_of_url[1]

In [None]:
# For every `id`, gather that id's per-row token lists into one list of lists.
URLS = tmp.groupby('id')['array_of_url'].apply(lambda rows: rows.tolist()).to_frame()

In [None]:
# Look at ten grouped rows by position.
URLS.iloc[70:80]

In [None]:
# Show the nested (list-of-lists) value stored for one example id.
URLS.array_of_url['000000014AA6339999A62EB2059F4601']

In [4]:
def f(x):
    """Flatten one level of nesting.

    Parameters
    ----------
    x : iterable of iterables
        For example a list of token lists.

    Returns
    -------
    list
        All inner items in their original order. An empty input gives ``[]``.

    Iterates directly instead of indexing with ``range(len(...))``, so any
    iterable (tuples, generators, ...) is accepted, not only indexable
    sequences.
    """
    return [item for inner in x for item in inner]

In [None]:
# Try the flattening helper on the same example id.
f(URLS.array_of_url['000000014AA6339999A62EB2059F4601'])

In [None]:
# Replace the nested per-id lists by one flat token list per id.
URLS['list'] = URLS['array_of_url'].apply(f)
URLS = URLS.drop('array_of_url', axis=1)

In [None]:
# Preview the flattened frame.
URLS.head()

In [None]:
# Move the id out of the index into a regular column, switch to a 0..n-1
# index, and give the two columns their final names.
URLS['id'] = URLS.index
URLS = URLS.reset_index(drop=True)
URLS.columns = ['urls', 'id']
URLS.head()

In [3]:
# Read the age labels (tab-separated, no header row) and name the columns.
age_train_df = pd.read_csv('data/age_profile_train.csv', sep='\t', header=None)
age_train_df.columns = ['id', 'age']
age_train_df.head()

Unnamed: 0,id,age
0,000000013CB5719C0000A2C90002C101,53
1,00000001442BE24000001B7D00F50801,48
2,00000001448580F800003F1B31FB0901,28
3,0000000145BDB2FF000157971645E901,44
4,000000014602771F0000DB9359714C01,48


In [None]:
# Attach the age label to every id that has URL data.
# NOTE(review): with a left join, an id missing from age_train_df would get
# a NaN age - confirm every id in URLS is labelled.
train_df = pd.merge(URLS, age_train_df, how='left', on='id')
train_df.head()

In [None]:
# (rows, columns) of the merged training frame.
print(train_df.shape)

### начнём делать кросс-валидацию и обработку признаков

In [None]:
# Inputs: one token list per id. Targets: the matching ages.
X = train_df['urls'].values
y = train_df['age'].values

In [None]:
# Join each token list into a single space-separated string, the form the
# text vectorizer consumes.
X = [' '.join(tokens) for tokens in X]

In [None]:
# Width of the hashed feature space passed to HashingVectorizer.
N_FEATURES_HW = 1800
# Number of NMF components kept after the decomposition.
N_COMPONENTS = 140

In [None]:
# Hash the joined token strings into a matrix with N_FEATURES_HW columns and
# densify it for the decomposition.
hw = HashingVectorizer(n_features=N_FEATURES_HW, non_negative=True).fit(X)
X_to_den = hw.transform(X).todense()
# Pin random_state so any randomness in the NMF initialisation/solver is
# repeatable and X_nmf is the same on every run of the notebook.
transformer = sklearn.decomposition.NMF(n_components=N_COMPONENTS, random_state=0)
X_nmf = transformer.fit_transform(X_to_den)

In [None]:
score = []
n_estim = [500, 600, 700, 800]
for n in n_estim:
    mean = []
    for i in range(5):

    
        train_data, test_data, y_train, y_test = cross_validation.train_test_split(X_nmf, y, \
                                                                     test_size = 0.3)
    
        xgb_rg = xgboost.XGBRegressor(n_estimators=n, learning_rate=0.03)
        xgb_rg.fit(train_data, y_train)
        print 'xgboost_rmse = ', rmse(xgb_rg.predict(test_data), y_test)
        mean.append(rmse(xgb_rg.predict(test_data), y_test))
    score.append(np.mean(mean))
    

In [None]:
# Mean hold-out RMSE per n_estimators candidate, in the order of n_estim.
score

In [None]:
# Same `score` list displayed again (identical to the previous cell).
score

### параметры xgboost: n_est = 550

In [None]:
# Fixed 2/3 - 1/3 hold-out split over row positions of the URL data. It is
# defined here because no earlier cell assigns itrain/itest, so this cell
# would otherwise raise NameError on a fresh kernel (or silently reuse
# indices left over from another dataset).
itrain, itest = cross_validation.train_test_split(range(len(y)), test_size=1./3, random_state=0)

# Linear-regression baseline on the NMF features.
reg = LinearRegression()
reg.fit(X_nmf[itrain, :], y[itrain])

pred_X_nmf = reg.predict(X_nmf[itest, :])
print(rmse(y[itest], pred_X_nmf))

In [None]:
# Sweep the mixing weight w over [0, 1] for the convex combination
# w * pred_X + (1 - w) * pred_X_nmf and record the hold-out RMSE for each w.
# NOTE(review): `pred_X` is not assigned in any cell above this one in the
# notebook - confirm where it is expected to come from for the URL data.
weights = np.linspace(0, 1, 100)
error = [rmse(y[itest], pred_X * w + (1-w) * pred_X_nmf) for w in weights]

pd.Series(error, index=weights).plot(figsize=(6,3))
print(min(error))

In [None]:
from sklearn import pipeline, preprocessing, feature_extraction

In [None]:
# Plain (hashed) feature dataset.
# X itself is a 1-D collection of joined strings at this point, so
# `X[itrain, :]` cannot be indexed as rows x columns; the hashed matrix
# X_to_den is the 2-D representation of the same rows.
# NOTE(review): assumes the "plain" features are meant to be the hashed
# matrix - confirm.
X_hashed = np.asarray(X_to_den)
dataset1 = Dataset(X_hashed[itrain, :], y[itrain], X_hashed[itest, :])
# NMF features
dataset2 = Dataset(X_nmf[itrain, :], y[itrain], X_nmf[itest, :])

# Linear model on the plain features
lr = Regressor(dataset=dataset1,
                     estimator=linear_model.LinearRegression,
                     parameters={'normalize': True},
                     name='lr')
# Linear model on TF-IDF of the plain features
def tfidf_lr_(X_train, y_train, X_test, y_test=None, random_state=8888):
    """Fit TfidfTransformer -> LinearRegression on the training data and
    return the predictions for ``X_test``.

    ``y_test`` and ``random_state`` are accepted but not used in the body.
    """
    steps = [
        ('tfidf', feature_extraction.text.TfidfTransformer()),
        ('linear_model', linear_model.LinearRegression()),
    ]
    fitted = pipeline.Pipeline(steps).fit(X_train, y_train)
    return fitted.predict(X_test)

tfidf_lr = Regressor(dataset=dataset1, estimator=tfidf_lr_, name='tfidf_lr')

# ExtraTrees on the NMF features
rf = Regressor(
    dataset=dataset2,
    estimator=ensemble.ExtraTreesRegressor,
    parameters={'n_estimators': 100, 'n_jobs': -1},
    name='rf')

# First stage: combine the three base models (lr, tfidf_lr, rf).
# NOTE(review): blend(proportion=0.2) presumably trains on a hold-out part
# rather than producing out-of-fold predictions - confirm against the heamy
# documentation.
meta_pipeline = ModelsPipeline(lr, tfidf_lr, rf)
stack_ds = meta_pipeline.blend(proportion=0.2, seed=111)

# Second stage: LinearRegression trained on the first-stage predictions
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()

print(rmse(y[itest], results))

### Теперь загрузим title_unify_train

In [5]:
# Read the unified-title training table (tab-separated, no header row) and
# preview it.
title_train = pd.read_csv('data/title_unify_train.csv', header=None, sep='\t')
title_train.head()

Unnamed: 0,0,1,2
0,000000014B6D41C13D777E8314725401,коляна лента прикол,1
1,0000000150707ACB8A82451C0307BC01,candi410 rambler ru входящая рамблер-почта,1
2,0000000150707ACB8A82451C0307BC01,cosmopolitan витамин волос для женщина журнал ...,1
3,0000000150707ACB8A82451C0307BC01,realbox бокс для интернет-магазин страница тов...,1
4,0000000150707ACB8A82451C0307BC01,realbox бокс для интернет-магазин товар экипир...,2


In [6]:
# Name the three positional columns.
title_train.columns = ['id', 'unify', 'count']

In [7]:
# Split every `unify` string into space-separated tokens and repeat the
# token list `count` times (list * int).
tmp = title_train  # alias, not a copy: the new column is added to title_train as well
tmp['unify'] = tmp['unify'].astype('str')
tmp['array_of_unify'] = tmp['unify'].apply(lambda title: title.split(' ')) * tmp['count']
tmp.head()

Unnamed: 0,id,unify,count,array_of_unify
0,000000014B6D41C13D777E8314725401,коляна лента прикол,1,"[коляна, лента, прикол]"
1,0000000150707ACB8A82451C0307BC01,candi410 rambler ru входящая рамблер-почта,1,"[candi410, rambler, ru, входящая, рамблер-почта]"
2,0000000150707ACB8A82451C0307BC01,cosmopolitan витамин волос для женщина журнал ...,1,"[cosmopolitan, витамин, волос, для, женщина, ж..."
3,0000000150707ACB8A82451C0307BC01,realbox бокс для интернет-магазин страница тов...,1,"[realbox, бокс, для, интернет-магазин, страниц..."
4,0000000150707ACB8A82451C0307BC01,realbox бокс для интернет-магазин товар экипир...,2,"[realbox, бокс, для, интернет-магазин, товар, ..."


In [8]:
# For every `id`, gather that id's per-row token lists into one list of lists.
UNIFY = tmp.groupby('id')['array_of_unify'].apply(lambda rows: rows.tolist()).to_frame()

In [9]:
# Replace the nested per-id lists by one flat token list per id.
UNIFY['list'] = UNIFY['array_of_unify'].apply(f)
UNIFY = UNIFY.drop('array_of_unify', axis=1)

In [10]:
# Move the id out of the index into a regular column, switch to a 0..n-1
# index, and give the two columns their final names.
UNIFY['id'] = UNIFY.index
UNIFY = UNIFY.reset_index(drop=True)
UNIFY.columns = ['unify', 'id']
UNIFY.head()

Unnamed: 0,unify,id
0,"[бесплатный, надёжный, почта, рамблер, электро...",000000013CB5719C0000A2C90002C101
1,"[24-х, 34-х, до, договор, неделя, новость, пре...",00000001442BE24000001B7D00F50801
2,"[авто, бош, контакт, королёв, сервис, авто, бо...",00000001448580F800003F1B31FB0901
3,"[ua, втрать, війни, донбасі, за, на, новини, о...",0000000145BDB2FF000157971645E901
4,"[black, walnut, грецкий, орех, чёрный, inmomen...",000000014602771F0000DB9359714C01


In [11]:
# Attach the age label to every id that has title data.
# NOTE(review): with a left join, an id missing from age_train_df would get
# a NaN age - confirm every id in UNIFY is labelled.
train_un = pd.merge(UNIFY, age_train_df, how='left', on='id')
train_un.head()

Unnamed: 0,unify,id,age
0,"[бесплатный, надёжный, почта, рамблер, электро...",000000013CB5719C0000A2C90002C101,53
1,"[24-х, 34-х, до, договор, неделя, новость, пре...",00000001442BE24000001B7D00F50801,48
2,"[авто, бош, контакт, королёв, сервис, авто, бо...",00000001448580F800003F1B31FB0901,28
3,"[ua, втрать, війни, донбасі, за, на, новини, о...",0000000145BDB2FF000157971645E901,44
4,"[black, walnut, грецкий, орех, чёрный, inmomen...",000000014602771F0000DB9359714C01,48


In [14]:
# Inputs: one token list per id. Targets: the matching ages.
X = train_un['unify'].values
y = train_un['age'].values

In [13]:
# Join each token list into one space-separated string, then hash the
# strings into a 1500-column matrix and densify it.
X = [' '.join(tokens) for tokens in X]
hw = HashingVectorizer(n_features=1500, non_negative=True)
hw = hw.fit(X)
X_to_den = hw.transform(X).todense()

In [15]:
%%time

transformer = sklearn.decomposition.NMF(n_components=100)
X_nmf = transformer.fit_transform(X_to_den)

CPU times: user 16min 35s, sys: 24 s, total: 16min 59s
Wall time: 7h 46min 50s


In [None]:
# Fixed 2/3 - 1/3 hold-out split over row positions (seeded, so repeatable).
itrain, itest = cross_validation.train_test_split(
    range(len(X)), test_size=1./3, random_state=0)
(len(itrain), len(itest))

In [None]:
# Constant baseline: predict the mean training age for every test row.
rmse(y[itest], np.mean(y[itrain]))

In [None]:
# Linear-regression baseline on the plain (hashed) features.
# X is a 1-D collection here (token lists / joined strings), so
# `X[itrain, :]` cannot be indexed as rows x columns; the hashed matrix
# X_to_den is the 2-D representation of the same rows.
# NOTE(review): assumes the "plain" features are meant to be the hashed
# matrix - confirm.
X_hashed = np.asarray(X_to_den)

reg = LinearRegression()
reg.fit(X_hashed[itrain, :], y[itrain])

pred_X = reg.predict(X_hashed[itest, :])
print(rmse(y[itest], pred_X))

In [None]:
# Linear-regression baseline on the NMF features.
reg = LinearRegression().fit(X_nmf[itrain, :], y[itrain])
pred_X_nmf = reg.predict(X_nmf[itest, :])

print(rmse(y[itest], pred_X_nmf))

In [None]:
# Sweep the mixing weight w over [0, 1] for the convex combination
# w * pred_X + (1 - w) * pred_X_nmf and record the hold-out RMSE for each w.
weights = np.linspace(0, 1, 100)
error = [rmse(y[itest], pred_X * w + (1-w) * pred_X_nmf) for w in weights]

pd.Series(error, index=weights).plot(figsize=(6,3))
print(min(error))

In [None]:
# Plain (hashed) feature dataset.
# X is a 1-D collection here, so `X[itrain, :]` cannot be indexed as
# rows x columns; the hashed matrix X_to_den is the 2-D representation of
# the same rows.
# NOTE(review): assumes the "plain" features are meant to be the hashed
# matrix - confirm.
X_hashed = np.asarray(X_to_den)
dataset1 = Dataset(X_hashed[itrain, :], y[itrain], X_hashed[itest, :])
# NMF features
dataset2 = Dataset(X_nmf[itrain, :], y[itrain], X_nmf[itest, :])

# Linear model on the plain features
lr = Regressor(dataset=dataset1,
                     estimator=linear_model.LinearRegression,
                     parameters={'normalize': True},
                     name='lr')

# Linear model on TF-IDF of the plain features
def tfidf_lr_(X_train, y_train, X_test, y_test=None, random_state=8888):
    """Fit TfidfTransformer -> LinearRegression on the training data and
    return the predictions for ``X_test``.

    ``y_test`` and ``random_state`` are accepted but not used in the body.
    """
    steps = [
        ('tfidf', feature_extraction.text.TfidfTransformer()),
        ('linear_model', linear_model.LinearRegression()),
    ]
    fitted = pipeline.Pipeline(steps).fit(X_train, y_train)
    return fitted.predict(X_test)

tfidf_lr = Regressor(dataset=dataset1, estimator=tfidf_lr_, name='tfidf_lr')

# ExtraTrees on the NMF features
rf = Regressor(
    dataset=dataset2,
    estimator=ensemble.ExtraTreesRegressor,
    parameters={'n_estimators': 100, 'n_jobs': -1},
    name='rf')

# First stage: combine the three base models (lr, tfidf_lr, rf).
# NOTE(review): blend(proportion=0.2) presumably trains on a hold-out part
# rather than producing out-of-fold predictions - confirm against the heamy
# documentation.
meta_pipeline = ModelsPipeline(lr, tfidf_lr, rf)
stack_ds = meta_pipeline.blend(proportion=0.2, seed=111)

# Second stage: LinearRegression trained on the first-stage predictions
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()

print(rmse(y[itest], results))

### данные для отправки

In [None]:
# Read the URL-domain test table (tab-separated, no header row), name its
# columns and count the distinct ids.
# NOTE(review): unlike the training file, this path has no '.csv' suffix -
# confirm the file name on disk.
urls_test_df = pd.read_csv('data/url_domain_test', sep='\t', header=None)
urls_test_df.columns = ['id', 'url', 'count']
urls_test_df['id'].nunique()

In [None]:
# Preview the raw test rows.
urls_test_df.head()

In [None]:
# Same expansion as for the training data: split `url` into space-separated
# tokens and repeat the token list `count` times (list * int).
tmp = urls_test_df  # alias, not a copy: the new column is added to urls_test_df as well
tmp['url'] = tmp['url'].astype('str')
tmp['array_of_url'] = tmp['url'].apply(lambda url: url.split(' ')) * tmp['count']
tmp.head()

In [None]:
# For every `id`, gather that id's per-row token lists into one list of lists.
TEST_URLS = tmp.groupby('id')['array_of_url'].apply(lambda rows: rows.tolist()).to_frame()

In [None]:
# Replace the nested per-id lists by one flat token list per id.
TEST_URLS['list'] = TEST_URLS['array_of_url'].apply(f)
TEST_URLS = TEST_URLS.drop('array_of_url', axis=1)

In [None]:
# Move the id out of the index into a regular column, switch to a 0..n-1
# index, and give the two columns their final names.
TEST_URLS['id'] = TEST_URLS.index
TEST_URLS = TEST_URLS.reset_index(drop=True)
TEST_URLS.columns = ['urls', 'id']
TEST_URLS.head()

In [None]:
# Row/column counts before and after grouping by id.
print(urls_test_df.shape)
print(TEST_URLS.shape)

In [None]:
# Sanity check: grouping should keep the number of distinct ids unchanged.
print(urls_test_df['id'].nunique())
print(TEST_URLS['id'].nunique())