In [None]:
import os
import time

import gensim
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from scipy.stats import skew, kurtosis

In [None]:
class EpochSaver(gensim.models.callbacks.CallbackAny2Vec):
    """Training callback that reports per-epoch loss/time and checkpoints the best model.

    After each epoch the loss of that epoch alone is derived from the
    cumulative loss reported by the model, printed together with the epoch's
    wall-clock duration, and the model is saved whenever the epoch loss is
    lower than the best one seen so far.

    NOTE(review): gensim documents the running training loss as only being
    tracked when training is started with ``compute_loss=True`` -- confirm at
    the call site, otherwise every epoch loss reported here will be 0.
    """

    def __init__(self, savedir, save_name="word2vector.model"):
        """Create the checkpoint directory and reset the bookkeeping.

        Parameters
        ----------
        savedir : str
            Directory that receives the checkpoint; created if missing.
        save_name : str
            File name of the checkpoint inside ``savedir``.
        """
        os.makedirs(savedir, exist_ok=True)
        self.save_path = os.path.join(savedir, save_name)
        self.epoch = 0
        # Cumulative loss observed at the end of the previous epoch.
        self.pre_loss = 0
        # Start at +inf so the first epoch always counts as an improvement,
        # however large its loss is (a finite sentinel can be exceeded on
        # large corpora, in which case no checkpoint would ever be written).
        self.best_loss = float("inf")
        self.since = time.time()

    def on_epoch_end(self, model):
        """Log the finished epoch and save ``model`` if its loss improved.

        Parameters
        ----------
        model
            The model being trained; must provide
            ``get_latest_training_loss()`` and ``save(path)``.
        """
        self.epoch += 1
        # The reported loss is accumulated since the first epoch, so the loss
        # of this epoch is the difference to the previous reading.
        cum_loss = model.get_latest_training_loss()
        epoch_loss = cum_loss - self.pre_loss
        time_taken = time.time() - self.since
        print("Epoch %d, loss: %.2f, time: %dmin %ds" %
                    (self.epoch, epoch_loss, time_taken//60, time_taken%60))
        if self.best_loss > epoch_loss:
            self.best_loss = epoch_loss
            print("Better model. Best loss: %.2f" % self.best_loss)
            model.save(self.save_path)
            print("Model %s save done!" % self.save_path)

        # Prepare the baselines for the next epoch.
        self.pre_loss = cum_loss
        self.since = time.time()

In [None]:
# Training pairs. `names=` is given without `header=`, so pandas treats the
# first line of the file as data; at most 50M rows are read.
df_train = pd.read_csv(
    '/home/kesci/input/bytedance/train_final.csv',
    names=['qId', 'q', 'aId', 'a', 'target'],
    nrows=50000000,
)

# Test pairs: same layout minus the label column.
df_test = pd.read_csv(
    '/home/kesci/input/bytedance/bytedance_contest.final_2.csv',
    names=['qId', 'q', 'aId', 'a'],
)

# Drop the last character of every test answer.
# NOTE(review): presumably a trailing delimiter in the raw file -- verify.
df_test['a'] = df_test['a'].apply(lambda answer: answer[:-1])

# Pre-trained word2vec model used by the feature function below.
w2vmodel = Word2Vec.load('/home/kesci/work/w2vfinal_all.model')

In [None]:
def q_fun(df, model=None):
    """Compute word2vec sentence-distance features for every (q, a) row.

    Each text is split on whitespace, the vectors of the words known to the
    model are summed and the sum is scaled to unit L2 norm. Six distances
    between the two resulting sentence vectors are then computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'q'`` and ``'a'``; values are passed
        through ``str()`` before tokenising.
    model : optional
        Source of word vectors: either an object with a ``.wv`` attribute
        (e.g. a gensim ``Word2Vec``) or an object that supports
        ``model[word]`` directly and raises ``KeyError`` for unknown words.
        Defaults to the module-level ``w2vmodel``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with a fresh 0..n-1 index and columns 0..5
        holding, in order: cosine, cityblock, canberra, euclidean,
        minkowski (p=3) and braycurtis distance. Distances whose denominator
        is zero (e.g. cosine against an all-zero vector) may come out as NaN.
    """
    print('start')
    if model is None:
        model = w2vmodel
    # Indexing the Word2Vec object itself was removed in gensim 4; the
    # vectors live on `.wv`. Objects without `.wv` are used as they are.
    vectors = getattr(model, 'wv', model)

    def sent2vec(s):
        """Return the unit-norm sum of known word vectors, or None if none are known."""
        found = []
        for w in str(s).split():
            try:
                found.append(vectors[w])
            except KeyError:
                # Out-of-vocabulary word: skip it. Any other error is a real
                # problem and must not be hidden.
                continue
        if not found:
            return None
        v = np.asarray(found).sum(axis=0)
        with np.errstate(divide='ignore', invalid='ignore'):
            unit = v / np.sqrt((v ** 2).sum())
        # A zero-norm sum yields NaN/inf above; map those to finite values.
        return np.nan_to_num(unit)

    def w2v_distances(q, a):
        """Return the six distances for one (q, a) pair."""
        x, y = sent2vec(q), sent2vec(a)
        # A text without any known word is represented by a zero vector of
        # matching length so the distance functions receive equal shapes.
        if x is None and y is None:
            x = y = np.zeros(getattr(vectors, 'vector_size', 1))
        elif x is None:
            x = np.zeros_like(y)
        elif y is None:
            y = np.zeros_like(x)
        return [cosine(x, y), cityblock(x, y), canberra(x, y),
                euclidean(x, y), minkowski(x, y, 3), braycurtis(x, y)]

    # Iterate the two columns directly instead of a row-wise DataFrame.apply.
    result = [w2v_distances(q, a) for q, a in zip(df['q'], df['a'])]
    return pd.DataFrame(result)

In [None]:
# Cut the (q, a) text columns into consecutive chunks of 10M rows so that
# each chunk can be handed to its own worker process later on.

# Training rows 0 .. 50M -> df1 .. df5
df1, df2, df3, df4, df5 = (
    df_train[['q', 'a']].iloc[start:start + 10000000]
    for start in range(0, 50000000, 10000000)
)

# Test rows 0 .. 90M -> df6 .. df14
df6, df7, df8, df9, df10, df11, df12, df13, df14 = (
    df_test[['q', 'a']].iloc[start:start + 10000000]
    for start in range(0, 90000000, 10000000)
)
# The last test chunk is open-ended and takes every row from 90M onwards.
df15 = df_test[['q', 'a']].iloc[90000000:]

In [None]:
# The chunks df1..df15 are all that is needed from here on; drop the names of
# the full frames and ask the garbage collector to run.
del df_train, df_test
import gc
gc.collect()

# Multiprocessing helpers for the parallel feature computation.
from multiprocessing import Pool, cpu_count
print(cpu_count())

In [None]:
# One worker per chunk. Every chunk name is rebound to the AsyncResult of its
# job, in the same submission order df1 .. df15.
p = Pool(15)
(df1, df2, df3, df4, df5,
 df6, df7, df8, df9, df10,
 df11, df12, df13, df14, df15) = [
    p.apply_async(q_fun, args=(chunk,))
    for chunk in (df1, df2, df3, df4, df5,
                  df6, df7, df8, df9, df10,
                  df11, df12, df13, df14, df15)
]
# No further jobs will be submitted; block until all workers have finished.
p.close()
p.join()

In [None]:
# Replace every AsyncResult by the value its job returned. `.get()` re-raises
# any exception that occurred inside the worker.
(df1, df2, df3, df4, df5,
 df6, df7, df8, df9, df10,
 df11, df12, df13, df14, df15) = [
    pending.get()
    for pending in (df1, df2, df3, df4, df5,
                    df6, df7, df8, df9, df10,
                    df11, df12, df13, df14, df15)
]

In [None]:
# Stitch the per-chunk feature frames back together in their original order.
train = pd.concat([df1, df2, df3, df4, df5])
test = pd.concat([df6, df7, df8, df9, df10, df11, df12, df13, df14, df15])

# Both frames carry the same six distance features, in the order in which
# they were computed.
train.columns = test.columns = [
    'w2vcos', 'w2vcity', 'w2vcanb', 'w2veuc', 'w2vmin', 'w2vbray',
]

# Persist the features; the row index is not written.
train.to_csv('/home/kesci/work/train_feature_w2vdis_final.csv', index=False)
test.to_csv('/home/kesci/work/test_feature_w2vdis_final.csv', index=False)