In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation
%matplotlib inline
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
import xgboost
import re
import nltk
import pymorphy2

from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline




### будем использовать в качестве метрики RMSE

In [2]:
def rmse(x,y):
    return np.mean((x - y) ** 2) ** 0.5

### загрузка данных url_domain_train. Пока работаем только с ними

In [3]:
urls_train_df = pd.read_csv('data/url_domain_train.csv', header=None, delimiter='\t')

In [4]:
urls_train_df.columns = ['id', 'url', 'count']
urls_train_df.head()

Unnamed: 0,id,url,count
0,000000014B60815F65B38258011B6C01,login.rutracker.org,1
1,000000014B60815F65B38258011B6C01,rutracker.org,4
2,000000014C03DA2A47AC433A0C755201,admin.tour-spb.net,1
3,000000014C03DA2A47AC433A0C755201,czinfo.ru,1
4,000000014C03DA2A47AC433A0C755201,forumsostav.ru,1


### нужно учесть count

#### попробуем разбивать url'ы на части: login.rutracker.org $\to$ 'login rutracker org'

In [5]:
sep_tmp = urls_train_df
sep_tmp['url'] = sep_tmp['url'].astype('str')
sep_tmp['space'] = ' '
sep_tmp['array_of_url'] = (sep_tmp['url'] + sep_tmp['space']).apply(lambda x: re.findall(r"[\w']+",x)) * sep_tmp['count']
sep_tmp.drop(['space'], axis = 1, inplace=True)
sep_tmp.head()

Unnamed: 0,id,url,count,array_of_url
0,000000014B60815F65B38258011B6C01,login.rutracker.org,1,"[login, rutracker, org]"
1,000000014B60815F65B38258011B6C01,rutracker.org,4,"[rutracker, org, rutracker, org, rutracker, or..."
2,000000014C03DA2A47AC433A0C755201,admin.tour-spb.net,1,"[admin, tour, spb, net]"
3,000000014C03DA2A47AC433A0C755201,czinfo.ru,1,"[czinfo, ru]"
4,000000014C03DA2A47AC433A0C755201,forumsostav.ru,1,"[forumsostav, ru]"


In [6]:
sep_URLS = pd.DataFrame(sep_tmp.groupby('id').array_of_url.apply(lambda x: x.tolist()))
sep_URLS.head()

Unnamed: 0_level_0,array_of_url
id,Unnamed: 1_level_1
000000013CB5719C0000A2C90002C101,"[[id, rambler, ru], [mail, rambler, ru], [r0, ..."
00000001442BE24000001B7D00F50801,"[[1prime, ru], [autorambler, ru], [chellak, ru..."
00000001448580F800003F1B31FB0901,"[[bosch, korolev, ru]]"
0000000145BDB2FF000157971645E901,"[[aptekanizkihcen, ua], [colady, ru], [gorod, ..."
000000014602771F0000DB9359714C01,"[[astrorok, ru], [diets, ru], [edaplus, info],..."


#### без разбиения самих url'ов

In [None]:
tmp = urls_train_df
tmp['url'] = tmp['url'].astype('str')
tmp['space'] = ' '
tmp['array_of_url'] = (tmp['url'] + tmp['space']).apply(lambda x: x.split(' ')[:-1]) * tmp['count']
tmp.drop(['space'], axis = 1, inplace=True)
tmp.head()

In [None]:
URLS = pd.DataFrame(tmp.groupby('id').array_of_url.apply(lambda x: x.tolist()))

In [7]:
sep_URLS.head()

Unnamed: 0_level_0,array_of_url
id,Unnamed: 1_level_1
000000013CB5719C0000A2C90002C101,"[[id, rambler, ru], [mail, rambler, ru], [r0, ..."
00000001442BE24000001B7D00F50801,"[[1prime, ru], [autorambler, ru], [chellak, ru..."
00000001448580F800003F1B31FB0901,"[[bosch, korolev, ru]]"
0000000145BDB2FF000157971645E901,"[[aptekanizkihcen, ua], [colady, ru], [gorod, ..."
000000014602771F0000DB9359714C01,"[[astrorok, ru], [diets, ru], [edaplus, info],..."


In [None]:
URLS.head()

In [8]:
def sep_f(x):
    a = []
    for i in range(len(x)):
        for j in range(len(x[i])):
            a.append(x[i][j])
    return " ".join(a)

In [22]:
def f(x):
    a = []
    for i in range(len(x)):
        for j in range(len(x[i])):
            a.append(x[i][j])
    return a

In [None]:
URLS['list'] = URLS.array_of_url.apply(f)
URLS.drop(['array_of_url'], axis = 1, inplace=True)

In [9]:
sep_URLS['list'] = sep_URLS.array_of_url.apply(sep_f)
sep_URLS.drop(['array_of_url'], axis = 1, inplace=True)

In [10]:
URLS['id'] = URLS.index
URLS.index = range(len(URLS))
URLS.columns = ['urls', 'id']

NameError: name 'URLS' is not defined

In [11]:
sep_URLS['id'] = sep_URLS.index
sep_URLS.index = range(len(sep_URLS))
sep_URLS.columns = ['urls', 'id']

In [12]:
age_train_df = pd.read_csv('data/age_profile_train.csv', header=None, delimiter='\t')
age_train_df.columns = ['id', 'age']
age_train_df.head()

Unnamed: 0,id,age
0,000000013CB5719C0000A2C90002C101,53
1,00000001442BE24000001B7D00F50801,48
2,00000001448580F800003F1B31FB0901,28
3,0000000145BDB2FF000157971645E901,44
4,000000014602771F0000DB9359714C01,48


In [None]:
train_df = URLS.merge(age_train_df, on='id', how='left')

In [13]:
sep_train_df = sep_URLS.merge(age_train_df, on = 'id', how='left')

In [None]:
train_df.head()

In [None]:
sep_train_df.head()

In [None]:
sep_train_df.urls[1]

### стемминг слов

In [None]:
stemmer = nltk.stem.LancasterStemmer()

In [None]:
def stemming(x):
    ret_array = []
    array = x.split(" ")
    for word in array:
        ret_array.append(stemmer.stem(word))
    return " ".join(ret_array)

In [None]:
stemming(sep_train_df.urls[1])

In [None]:
stem_train_df = sep_train_df.copy()

In [None]:
stem_train_df.urls = stem_train_df.urls.apply(lambda x: stemming(x))

In [None]:
sep_train_df.head()

In [None]:
stem_train_df.head()

### начнём делать кросс-валидацию и обработку признаков

In [14]:
y_urls = sep_train_df.age.values

In [None]:
train_df.urls = train_df.urls.apply(lambda x: " ".join(x))

In [None]:
train_df.urls[:2]

In [None]:
sep_train_df.urls[:2]

In [None]:
stem_train_df.urls[:2]

In [15]:
tfidf = sklearn.feature_extraction.text.TfidfVectorizer()
X = tfidf.fit_transform(train_df.urls.values)

NameError: name 'train_df' is not defined

In [70]:
tfidf = sklearn.feature_extraction.text.TfidfVectorizer()
sep_X = tfidf.fit_transform(sep_train_df.urls.values)

In [None]:
tfidf = sklearn.feature_extraction.text.TfidfVectorizer()
stem_X = tfidf.fit_transform(stem_train_df.urls.values)

In [None]:
print sep_X.shape
print X.shape
print stem_X.shape

In [None]:
score = []
sep_score = []
stem_score = []
for k in range(5):
    mean = []
    sep_mean = []
    stem_mean = []
    for i in range(5):

    
        #train_data, test_data, y_train, y_test = cross_validation.train_test_split(X_nmf, y, \
        #                                                             test_size = 0.3)
    
        #xgb_rg = xgboost.XGBRegressor(n_estimators=n, learning_rate=0.03)
        #xgb_rg.fit(train_data, y_train)
        #print 'xgboost_rmse = ', rmse(xgb_rg.predict(test_data), y_test)
        #mean.append(rmse(xgb_rg.predict(test_data), y_test))
        
        train_data, test_data, y_train, y_test = cross_validation.train_test_split(X, y,\
                                                                                  test_size = 0.3)
        lin_reg = linear_model.SGDRegressor(n_iter=350, alpha=0.000008)
        lin_reg.fit(train_data, y_train)
        print "usual = ",rmse(lin_reg.predict(test_data), y_test)
        mean.append(rmse(lin_reg.predict(test_data), y_test))
        
        train_data, test_data, y_train, y_test = cross_validation.train_test_split(sep_X, y,\
                                                                                    test_size = 0.3)
        
        lin_reg = linear_model.SGDRegressor(n_iter=350, alpha=0.000008)
        lin_reg.fit(train_data, y_train)
        print 'with sep urls ',rmse(lin_reg.predict(test_data), y_test)
        sep_mean.append(rmse(lin_reg.predict(test_data), y_test))
        
        train_data, test_data, y_train, y_test = cross_validation.train_test_split(stem_X, y,\
                                                                                    test_size = 0.3)
        
        lin_reg = linear_model.SGDRegressor(n_iter=450, alpha=0.000008)
        lin_reg.fit(train_data, y_train)
        print 'with stem urls ',rmse(lin_reg.predict(test_data), y_test)
        stem_mean.append(rmse(lin_reg.predict(test_data), y_test))
        
    score.append(np.mean(mean))
    sep_score.append(np.mean(sep_mean))
    stem_score.append(np.mean(stem_mean))
print np.mean(score)
print 'in general with sep =', np.mean(sep_score)
print 'in general with stem =', np.mean(stem_score)
    

### svd

In [None]:
from sklearn import pipeline, preprocessing, feature_extraction

In [None]:
# обычный датасет
dataset1 = Dataset(X[itrain, :], y[itrain], X[itest, :])
# NMF
dataset2 = Dataset(X_nmf[itrain, :], y[itrain], X_nmf[itest, :])

# линейная модель на обычных данных
lr = Regressor(dataset=dataset1, 
                     estimator=linear_model.LinearRegression,
                     parameters={'normalize': True},
                     name='lr')

# линейная модель на TFIDF от обычных данных
def tfidf_lr_(X_train, y_train, X_test, y_test=None, random_state=8888):
    model = pipeline.Pipeline([('tfidf', feature_extraction.text.TfidfTransformer()), 
                                ('linear_model', linear_model.LinearRegression())])
    model.fit(X_train, y_train)
    return model.predict(X_test)

tfidf_lr = Regressor(dataset=dataset1,
                     estimator=tfidf_lr_,
                     name='tfidf_lr')

# ExtraTrees на NMF
rf = Regressor(dataset=dataset2, 
                     estimator=ensemble.ExtraTreesRegressor,
                     parameters={'n_estimators': 100, 'n_jobs': -1},
                     name='rf')

# Stack two models
# Returns new dataset with out-of-fold predictions
meta_pipeline = ModelsPipeline(lr, tfidf_lr, rf)
stack_ds = meta_pipeline.blend(proportion=0.2,seed=111)

# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()

print rmse(y[itest], results)

### Теперь загрузим title_unify_train

In [17]:
title_train = pd.read_csv('data/title_unify_train.csv', header=None, sep='\t')
title_train.head()

Unnamed: 0,0,1,2
0,000000014B6D41C13D777E8314725401,коляна лента прикол,1
1,0000000150707ACB8A82451C0307BC01,candi410 rambler ru входящая рамблер-почта,1
2,0000000150707ACB8A82451C0307BC01,cosmopolitan витамин волос для женщина журнал ...,1
3,0000000150707ACB8A82451C0307BC01,realbox бокс для интернет-магазин страница тов...,1
4,0000000150707ACB8A82451C0307BC01,realbox бокс для интернет-магазин товар экипир...,2


In [18]:
title_train.columns = ['id', 'unify', 'count']

In [19]:
tmp = title_train
tmp['unify'] = tmp['unify'].astype('str')
tmp['space'] = ' '
tmp['array_of_unify'] = (tmp['unify'] + tmp['space']).apply(lambda x: x.split(' ')[:-1]) * tmp['count']
tmp.drop(['space'], axis = 1, inplace=True)
tmp.head()

Unnamed: 0,id,unify,count,array_of_unify
0,000000014B6D41C13D777E8314725401,коляна лента прикол,1,"[коляна, лента, прикол]"
1,0000000150707ACB8A82451C0307BC01,candi410 rambler ru входящая рамблер-почта,1,"[candi410, rambler, ru, входящая, рамблер-почта]"
2,0000000150707ACB8A82451C0307BC01,cosmopolitan витамин волос для женщина журнал ...,1,"[cosmopolitan, витамин, волос, для, женщина, ж..."
3,0000000150707ACB8A82451C0307BC01,realbox бокс для интернет-магазин страница тов...,1,"[realbox, бокс, для, интернет-магазин, страниц..."
4,0000000150707ACB8A82451C0307BC01,realbox бокс для интернет-магазин товар экипир...,2,"[realbox, бокс, для, интернет-магазин, товар, ..."


In [20]:
UNIFY = pd.DataFrame(tmp.groupby('id').array_of_unify.apply(lambda x: x.tolist()))

In [23]:
UNIFY['list'] = UNIFY.array_of_unify.apply(f)
UNIFY.drop(['array_of_unify'], axis = 1, inplace=True)

In [24]:
UNIFY['id'] = UNIFY.index
UNIFY.index = range(len(UNIFY))
UNIFY.columns = ['unify', 'id']
UNIFY.head()

Unnamed: 0,unify,id
0,"[бесплатный, надёжный, почта, рамблер, электро...",000000013CB5719C0000A2C90002C101
1,"[24-х, 34-х, до, договор, неделя, новость, пре...",00000001442BE24000001B7D00F50801
2,"[авто, бош, контакт, королёв, сервис, авто, бо...",00000001448580F800003F1B31FB0901
3,"[ua, втрать, війни, донбасі, за, на, новини, о...",0000000145BDB2FF000157971645E901
4,"[black, walnut, грецкий, орех, чёрный, inmomen...",000000014602771F0000DB9359714C01


In [25]:
train_un = UNIFY.merge(age_train_df, on='id', how='left')
train_un.head()

Unnamed: 0,unify,id,age
0,"[бесплатный, надёжный, почта, рамблер, электро...",000000013CB5719C0000A2C90002C101,53
1,"[24-х, 34-х, до, договор, неделя, новость, пре...",00000001442BE24000001B7D00F50801,48
2,"[авто, бош, контакт, королёв, сервис, авто, бо...",00000001448580F800003F1B31FB0901,28
3,"[ua, втрать, війни, донбасі, за, на, новини, о...",0000000145BDB2FF000157971645E901,44
4,"[black, walnut, грецкий, орех, чёрный, inmomen...",000000014602771F0000DB9359714C01,48


In [26]:
morph = pymorphy2.MorphAnalyzer()

In [None]:
print type(u'прив')

In [None]:
p = morph.parse(u'привет')[0]
p.normal_form

In [None]:
p = morph.parse(u'привет -ды ls -a')[0]
print p.normal_form
print type(p.normal_form)

In [27]:
def norm_form(x):
    tmp = " ".join(x)
    string = unicode(tmp,'utf-8','ignore')
    p = morph.parse(string)[0]
    return p.normal_form

In [28]:
train_morph = train_un.copy()

In [29]:
train_morph.unify = train_morph.unify.apply(lambda x: norm_form(x))

In [None]:
train_morph.head()

In [83]:
tfidf = sklearn.feature_extraction.text.TfidfVectorizer()
morph_X = tfidf.fit_transform(train_morph.unify.values)

In [31]:
morph_y = train_morph.age

In [None]:
score = []

for k in range(5):
    mean = []
    for i in range(5):
        train_data, test_data, y_train, y_test = cross_validation.train_test_split(morph_X, y,\
                                                                                    test_size = 0.3)
        
        lin_reg = linear_model.SGDRegressor(n_iter=550, alpha=0.00003)
        lin_reg.fit(train_data, y_train)
        print rmse(lin_reg.predict(test_data), y_test)
        mean.append(rmse(lin_reg.predict(test_data), y_test))
        
    score.append(np.mean(mean))
print np.mean(score)

In [None]:
# обычный датасет
dataset1 = Dataset(X[itrain, :], y[itrain], X[itest, :])
# NMF
dataset2 = Dataset(X_nmf[itrain, :], y[itrain], X_nmf[itest, :])

# линейная модель на обычных данных
lr = Regressor(dataset=dataset1, 
                     estimator=linear_model.LinearRegression,
                     parameters={'normalize': True},
                     name='lr')

# линейная модель на TFIDF от обычных данных
def tfidf_lr_(X_train, y_train, X_test, y_test=None, random_state=8888):
    model = pipeline.Pipeline([('tfidf', feature_extraction.text.TfidfTransformer()), 
                                ('linear_model', linear_model.LinearRegression())])
    model.fit(X_train, y_train)
    return model.predict(X_test)

tfidf_lr = Regressor(dataset=dataset1,
                     estimator=tfidf_lr_,
                     name='tfidf_lr')

# ExtraTrees на NMF
rf = Regressor(dataset=dataset2, 
                     estimator=ensemble.ExtraTreesRegressor,
                     parameters={'n_estimators': 100, 'n_jobs': -1},
                     name='rf')

# Stack two models
# Returns new dataset with out-of-fold predictions
meta_pipeline = ModelsPipeline(lr, tfidf_lr, rf)
stack_ds = meta_pipeline.blend(proportion=0.2,seed=111)

# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()

print rmse(y[itest], results)

### данные для отправки

In [60]:
urls_test_df = pd.read_csv('data/url_domain_test.csv', header=None, delimiter='\t')
urls_test_df.columns = ['id', 'url', 'count']
urls_test_df.id.nunique()

19974

In [61]:
urls_test_df.head()

Unnamed: 0,id,url,count
0,0000000151004FF4ADD746DA10685A01,afisha.ru,2
1,0000000151004FF4ADD746DA10685A01,aif.ru,1
2,0000000151004FF4ADD746DA10685A01,aimfar.solution.weborama.fr,1
3,0000000151004FF4ADD746DA10685A01,alkotest.ru,1
4,0000000151004FF4ADD746DA10685A01,aptekamos.ru,1


In [62]:
sep_tmp = urls_test_df
sep_tmp['url'] = sep_tmp['url'].astype('str')
sep_tmp['space'] = ' '
sep_tmp['array_of_url'] = (sep_tmp['url'] + sep_tmp['space']).apply(lambda x: re.findall(r"[\w']+",x)) * sep_tmp['count']
sep_tmp.drop(['space'], axis = 1, inplace=True)
sep_tmp.head()

Unnamed: 0,id,url,count,array_of_url
0,0000000151004FF4ADD746DA10685A01,afisha.ru,2,"[afisha, ru, afisha, ru]"
1,0000000151004FF4ADD746DA10685A01,aif.ru,1,"[aif, ru]"
2,0000000151004FF4ADD746DA10685A01,aimfar.solution.weborama.fr,1,"[aimfar, solution, weborama, fr]"
3,0000000151004FF4ADD746DA10685A01,alkotest.ru,1,"[alkotest, ru]"
4,0000000151004FF4ADD746DA10685A01,aptekamos.ru,1,"[aptekamos, ru]"


In [63]:
TEST_URLS = pd.DataFrame(sep_tmp.groupby('id').array_of_url.apply(lambda x: x.tolist()))

In [64]:
TEST_URLS['list'] = TEST_URLS.array_of_url.apply(sep_f)
TEST_URLS.drop(['array_of_url'], axis = 1, inplace=True)

In [65]:
TEST_URLS['id'] = TEST_URLS.index
TEST_URLS.index = range(len(TEST_URLS))
TEST_URLS.columns = ['urls', 'id']
TEST_URLS.head()

Unnamed: 0,urls,id
0,1000bankov ru 1tv ru 1tv ru 1tv ru 1tv ru 1tv ...,000000014A02348E701552980349FF01
1,autorambler ru bilettorg ru dsol druzhba ru fa...,000000014A10EA183BF8594A0B2AB201
2,photosight ru photosight ru photosight ru phot...,000000014A4FE5C33A929D4C26943601
3,base consultant ru dogovor obrazets ru fd ru m...,000000014B7BB9957784A9BC0AC9F401
4,assessor ru assessor ru audit it ru base garan...,000000014C7749F896D82C2B01E8B801


In [71]:
X_test_urls = tfidf.transform(TEST_URLS.urls.values)

In [72]:
X_test_urls.shape

(19974, 101680)

In [73]:
sep_X.shape

(118603, 101680)

In [74]:
lin_reg = linear_model.SGDRegressor(n_iter=750, alpha=0.000006)
lin_reg.fit(sep_X, y_urls)

SGDRegressor(alpha=6e-06, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=750, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, verbose=0, warm_start=False)

In [75]:
urls_pred = lin_reg.predict(X_test_urls)

### теперь прогноз по title

In [76]:
titles_test_df = pd.read_csv('data/title_unify_test.csv', header=None, delimiter='\t')
titles_test_df.columns = ['id', 'unify', 'count']
titles_test_df.id.nunique()

19960

In [77]:
tmp = titles_test_df
tmp['unify'] = tmp['unify'].astype('str')
tmp['space'] = ' '
tmp['array_of_unify'] = (tmp['unify'] + tmp['space']).apply(lambda x: x.split(' ')[:-1]) * tmp['count']
tmp.drop(['space'], axis = 1, inplace=True)
tmp.head()

Unnamed: 0,id,unify,count,array_of_unify
0,0000000151790DCC1E8322AF0B6FA701,20-километровый амур китай мост недвижимость п...,2,"[20-километровый, амур, китай, мост, недвижимо..."
1,0000000151790DCC1E8322AF0B6FA701,24-х 34-х до договор неделя новость предложить...,1,"[24-х, 34-х, до, договор, неделя, новость, пре..."
2,0000000151790DCC1E8322AF0B6FA701,3xl armour compress heatgear long size sleev s...,1,"[3xl, armour, compress, heatgear, long, size, ..."
3,0000000151790DCC1E8322AF0B6FA701,4-е ca зачёт медальный место новость ои-2012 п...,1,"[4-е, ca, зачёт, медальный, место, новость, ои..."
4,0000000151790DCC1E8322AF0B6FA701,4-колёсный moi-bebik oregon oscar ru коляска к...,1,"[4-колёсный, moi-bebik, oregon, oscar, ru, кол..."


In [78]:
UNIFY_TEST = pd.DataFrame(tmp.groupby('id').array_of_unify.apply(lambda x: x.tolist()))

In [79]:
UNIFY_TEST['list'] = UNIFY_TEST.array_of_unify.apply(f)
UNIFY_TEST.drop(['array_of_unify'], axis = 1, inplace=True)

In [80]:
UNIFY_TEST['id'] = UNIFY_TEST.index
UNIFY_TEST.index = range(len(UNIFY_TEST))
UNIFY_TEST.columns = ['unify', 'id']
UNIFY_TEST.head()

Unnamed: 0,unify,id
0,"[11-й, гектар, дом, коммунизм, на, набережная,...",000000014A02348E701552980349FF01
1,"[20-летний, выглядеть, девушка, как, королева,...",000000014A10EA183BF8594A0B2AB201
2,"[медийный, портал, рамблер, медийный, портал, ...",000000014A4FE5C33A929D4C26943601
3,"[бесплатно, код, онлайн, срочно, статистика, и...",000000014B7BB9957784A9BC0AC9F401
4,"[bank, privat, автокредитование, банковский, б...",000000014C7749F896D82C2B01E8B801


In [81]:
UNIFY_TEST.unify = UNIFY_TEST.unify.apply(lambda x: norm_form(x))

In [84]:
X_test_titles = tfidf.transform(UNIFY_TEST.unify.values)

In [85]:
lin_reg = linear_model.SGDRegressor(n_iter=750, alpha=0.000006)
lin_reg.fit(morph_X, morph_y)

SGDRegressor(alpha=6e-06, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=750, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, verbose=0, warm_start=False)

In [87]:
titles_pred = lin_reg.predict(X_test_titles)

In [88]:
print urls_pred.shape
print titles_pred.shape

(19974,)
(19960,)


In [94]:
TEST_URLS['age'] = urls_pred

In [95]:
TEST_URLS = TEST_URLS[['id', 'age']]
TEST_URLS.columns = ['Id', 'age']

In [96]:
mean_array = TEST_URLS.age.mean()

In [97]:
mean_array

37.937767259367305

In [98]:
TEST_URLS.head()

Unnamed: 0,Id,age
0,000000014A02348E701552980349FF01,40.647063
1,000000014A10EA183BF8594A0B2AB201,42.375781
2,000000014A4FE5C33A929D4C26943601,44.362142
3,000000014B7BB9957784A9BC0AC9F401,33.832962
4,000000014C7749F896D82C2B01E8B801,39.507316


In [99]:
mean_array*np.ones(10)

array([ 37.93776726,  37.93776726,  37.93776726,  37.93776726,
        37.93776726,  37.93776726,  37.93776726,  37.93776726,
        37.93776726,  37.93776726])

In [100]:
random_sol = pd.read_csv('data/sample_submission.csv')
random_sol.head()

Unnamed: 0,Id,age
0,000000014A02348E701552980349FF01,36.04347
1,000000014A10EA183BF8594A0B2AB201,36.04347
2,000000014A4FE5C33A929D4C26943601,36.04347
3,000000014B7BB9957784A9BC0AC9F401,36.04347
4,000000014C7749F896D82C2B01E8B801,36.04347


In [101]:
miss_idx = set(random_sol.Id.values) - set(TEST_URLS.Id.values)
miss_df = pd.DataFrame(zip(list(miss_idx), mean_array*np.ones(len(miss_idx))))
miss_df.columns = ['Id', 'age']

In [102]:
TEST_URLS = TEST_URLS.append(miss_df, ignore_index=True)

In [116]:
urls_pred

array([ 40.64706285,  42.37578091,  44.36214191, ...,  35.65001423,
        35.75656319,  38.389404  ])

In [117]:
titles_pred

array([ 39.21377511,  38.75144059,  34.34612484, ...,  29.9630085 ,
        38.61773698,  36.34510389])

In [118]:
mean_array = UNIFY_TEST.age.mean()

In [103]:
UNIFY_TEST['age'] = titles_pred

In [104]:
UNIFY_TEST = UNIFY_TEST[['id', 'age']]
UNIFY_TEST.columns = ['Id', 'age']

In [106]:
miss_idx = set(random_sol.Id.values) - set(UNIFY_TEST.Id.values)
miss_df = pd.DataFrame(zip(list(miss_idx), mean_array*np.ones(len(miss_idx))))
miss_df.columns = ['Id', 'age']

In [107]:
UNIFY_TEST = UNIFY_TEST.append(miss_df, ignore_index=True)

In [108]:
print UNIFY_TEST.shape
print TEST_URLS.shape

(19979, 2)
(19979, 2)


In [109]:
TEST = TEST_URLS.copy()

In [127]:
TEST['age'] = TEST_URLS['age']

In [128]:
TEST.head()

Unnamed: 0,Id,age
0,000000014A02348E701552980349FF01,40.647063
1,000000014A10EA183BF8594A0B2AB201,42.375781
2,000000014A4FE5C33A929D4C26943601,44.362142
3,000000014B7BB9957784A9BC0AC9F401,33.832962
4,000000014C7749F896D82C2B01E8B801,39.507316


In [129]:
TEST.to_csv('my_solution.csv', index=False)

In [130]:
TEST.shape

(19979, 2)

In [126]:
print urls_test_df.shape
print TEST_URLS.shape

(613388, 4)
(19979, 2)


In [None]:
print urls_test_df.id.nunique()
print TEST_URLS.id.nunique()