In [1]:
import pandas as pd
import numpy as np
import operator
import re
import sys
import os

# Extract Features From Test_Set

In [2]:
def extract_normal_features(path_in, path_normal):
    
    raw_data = pd.read_csv(path_in)
    
    # feature1 : num of words in answer_content
    answer_content = raw_data['txt_content']
    answer_content = answer_content.str.split(" +")
    f1 = answer_content.str.len()
    
    # feature2 : num of words in question_content
    question_content = raw_data['title']
    question_content = question_content.str.split(" +")
    f2 = question_content.str.len()
    
    # feature3 : num of question tags 
    question_tags = raw_data['tag']
    question_tags = question_tags.str.split(" +")
    f3 = question_tags.str.len()
    
    # feature4 : num of answers 
    numOfAnswers = raw_data['n_ans_x']
    f4 = numOfAnswers
    
    # feature5 : num of agrees of user 
    numOfAgrees = raw_data['n_agree']
    f5 = numOfAgrees
    
    # feature6 : num of ans_y of user
    numOfAnsY = raw_data['n_ans_y']
    f6 = numOfAnsY
    
    # feature7 : num of Articles of user
    numOfArticle = raw_data['n_article']
    f7 = numOfArticle
    
    # feature8 : num of Ask of user 
    numOfAsk = raw_data['n_ask']
    f8 = numOfAsk
    
    # feature9 : num of Collection of user
    numOfCollection = raw_data['n_collection']
    f9 = numOfCollection
    
    # feature10 : num of edit log of user
    numOfEditLog = raw_data['n_editlog']
    f10 = numOfEditLog
    
    # feature11 : num of follower of user
    numOfFollower = raw_data['n_follower']
    f11 = numOfFollower
    
    # feature12 : num of thanks of user
    numOfThanks = raw_data['n_thanks']
    f12 = numOfThanks
    
    # feature13 : num of topic of user
    numOfTopic = raw_data['n_topic']
    f13 = numOfTopic
    
    # feature14 : num of followee of user 
    numOfFollowee = raw_data['n_followee']
    f14 = numOfFollowee
    
    f1.to_csv(path_normal+'/f1.txt')
    f2.to_csv(path_normal+'/f2.txt')
    f3.to_csv(path_normal+'/f3.txt')
    f4.to_csv(path_normal+'/f4.txt')
    f5.to_csv(path_normal+'/f5.txt')
    f6.to_csv(path_normal+'/f6.txt')
    f7.to_csv(path_normal+'/f7.txt')
    f8.to_csv(path_normal+'/f8.txt')
    f9.to_csv(path_normal+'/f9.txt')
    f10.to_csv(path_normal+'/f10.txt')
    f11.to_csv(path_normal+'/f11.txt')
    f12.to_csv(path_normal+'/f12.txt')
    f13.to_csv(path_normal+'/f13.txt')
    f14.to_csv(path_normal+'/f14.txt')

In [3]:
def extract_text_features(path_in, path_text):
    """Dump the three text columns of a CSV to one-value-per-row text files.

    Parameters
    ----------
    path_in : str
        CSV file read with ``pd.read_csv``; must contain the columns
        ``title``, ``txt_content`` and ``tag``.
    path_text : str
        Directory that receives ``question_content.txt``,
        ``answer_content.txt`` and ``question_tags.txt`` (each written to
        ``path_text + '/' + name``).

    The files are written UTF-8 encoded, without index and without a header
    line, so that row k of the input is line k of every output file.
    """
    raw_data = pd.read_csv(path_in)

    # (source column, output file name), written in this order.
    outputs = [
        ('title', 'question_content.txt'),
        ('txt_content', 'answer_content.txt'),
        ('tag', 'question_tags.txt'),
    ]

    for column, file_name in outputs:
        # header=False is explicit (as in lda_source): extract_overlap_features
        # reads these files line by line and counts rows from 0, so a header
        # line emitted by newer pandas versions would shift every row.
        raw_data[column].to_csv(path_text + '/' + file_name,
                                encoding='utf-8', index=False, header=False)

In [4]:
from itertools import izip

def unique(a):
    """Return the distinct elements of ``a`` as a list (order not guaranteed)."""
    distinct = set(a)
    return list(distinct)

def intersect(a,b):
    """Return the elements present in both ``a`` and ``b`` as a list (order not guaranteed)."""
    left, right = set(a), set(b)
    return list(left & right)

def union(a,b):
    """Return the elements present in ``a`` or ``b`` as a list (order not guaranteed)."""
    left, right = set(a), set(b)
    return list(left | right)

def write_overlap_file(fout, overlap_list, count):
    """Append one ``count,word word ...,num_words`` line to ``fout``.

    fout         : writable text file object
    overlap_list : list of strings, joined with single spaces
    count        : row identifier written as the first field
    """
    record = [
        str(count), ',',
        ' '.join(overlap_list),
        ',', str(len(overlap_list)), '\n',
    ]
    fout.write(''.join(record))

    
# Run this code for best_answer_features
# PATH_IN = 'DIVIDE/match/best_answer_features/OVER_LAP/'
# PATH_OUT = 'DIVIDE/match/best_answer_features/OVER_LAP/'
# PATH_FEATURES = 'DIVIDE/match/best_answer_features/'

def extract_overlap_features(PATH_TEXT, PATH_OVERLAP, PATH_NORMAL) :
    """Compute word-overlap features between answer text and question text/tags.

    Reads ``answer_content.txt``, ``question_content.txt`` and
    ``question_tags.txt`` from PATH_TEXT in lock-step, one line per row, and
    for row number ``count`` (starting at 0) writes:

    * PATH_OVERLAP + 'answerContent_questionContent.txt' / 'answerContent_questionTags.txt':
      ``count,<overlapping words>,<number of words>`` (via write_overlap_file)
    * PATH_NORMAL + 'f15.txt': ``count,<size of answer/question overlap>``
    * PATH_NORMAL + 'f16.txt': ``count,<size of answer/tags overlap>``

    All three path arguments are used as plain string prefixes, so they must
    end with a path separator. Iteration stops at the shortest input file.

    NOTE(review): lines are tokenised with ``split(" ")``, so an empty line
    yields the single token '' and consecutive spaces yield '' tokens, which
    can then be counted as overlap. Left unchanged here so feature values
    stay comparable with previously generated data; confirm whether that is
    intended.
    """
    # f15/f16 are opened first (as before) but now inside the with-statement,
    # so they are closed even if reading or writing raises part-way through.
    with open(PATH_NORMAL + 'f15.txt', 'w') as f15, \
        open(PATH_NORMAL + 'f16.txt', 'w') as f16, \
        open(PATH_TEXT + 'answer_content.txt' , 'r') as fin_1, \
        open(PATH_TEXT + 'question_content.txt', 'r') as fin_2, \
        open(PATH_TEXT + 'question_tags.txt' , 'r') as fin_3, \
        open(PATH_OVERLAP + 'answerContent_questionContent.txt', 'w') as fout_1, \
        open(PATH_OVERLAP + 'answerContent_questionTags.txt', 'w') as fout_2 :

            for count, (line1, line2, line3) in enumerate(izip( fin_1, fin_2, fin_3 )):
                # answer_content
                word_list1 = line1.strip().split(" ")
                # question_content
                word_list2 = line2.strip().split(" ")
                # question_tags
                word_list3 = line3.strip().split(" ")

                # answer_content & question_content
                over_lap1 = intersect(word_list1, word_list2)
                # answer_content & question_tags
                over_lap2 = intersect(word_list1, word_list3)

                write_overlap_file(fout_1, over_lap1, count)
                write_overlap_file(fout_2, over_lap2, count)

                # overlap feature f15 : answer_content & question_content overlap size
                f15.write(str(count) + ',' + str( len(over_lap1) ) + '\n')
                # overlap feature f16 : answer_content & question_tags overlap size
                f16.write(str(count) + ',' + str( len(over_lap2) ) + '\n')

In [5]:
def lda_source(PATH_IN, PATH_LDA) :
    """Build the per-row text file TEST_LDA_SOURCE.txt.

    For every row of the CSV at PATH_IN the ``title``, ``txt_content`` and
    ``tag`` values are concatenated with single spaces (missing values count
    as empty strings) and written, one row per line, to
    ``PATH_LDA + 'TEST_LDA_SOURCE.txt'`` as UTF-8 without index or header.
    """
    frame = pd.read_csv(PATH_IN)

    # Missing text becomes '' so the concatenation never produces NaN.
    title, body, tags = (frame[column].fillna('')
                         for column in ('title', 'txt_content', 'tag'))

    combined = title + ' ' + body + ' ' + tags

    target = PATH_LDA + 'TEST_LDA_SOURCE.txt'
    combined.to_csv(target, encoding='utf-8', index = False, header=False)

In [6]:
def normalize( PATH_NORMAL ) :
    """Merge f1.txt .. f16.txt and write the z-scored feature table.

    Each ``PATH_NORMAL + 'f<k>.txt'`` file is read as a header-less
    ``index,value`` CSV. The files are outer-merged on ``index``, every
    feature column is standardised to ``(x - mean) / std`` (column-wise),
    remaining NaN values (e.g. from a zero standard deviation or a row
    missing in one file) are replaced by 0, and the result is written without
    the index column to ``PATH_NORMAL + 'Normal_Features.txt'``.

    PATH_NORMAL is used as a plain string prefix and must end with a path
    separator.
    """
    # Column name for feature file f<k>.txt, in order k = 1..16.
    feature_names = [
        'len_answer_content',    # f1  : num of words in answer_content
        'len_question_content',  # f2  : num of words in question_content
        'num_question_tags',     # f3  : num of question tags
        'num_answers',           # f4  : num of answers
        'num_user_agrees',       # f5  : num of agrees of user
        'num_ans_y',             # f6  : num of ans_y of user
        'num_user_articles',     # f7  : num of articles of user
        'num_user_ask',          # f8  : num of ask of user
        'num_user_collection',   # f9  : num of collection of user
        'num_user_editlog',      # f10 : num of edit log of user
        'num_user_follower',     # f11 : num of follower of user
        'num_user_thanks',       # f12 : num of thanks of user
        'num_user_topic',        # f13 : num of topic of user
        'num_user_followee',     # f14 : num of followee of user
        'ac_qc_overlap',         # f15 : answerContent & questionContent overlap
        'ac_qt_overlap',         # f16 : answerContent & questionTags overlap
    ]

    merged = None
    for number, column in enumerate(feature_names, 1):
        feature = pd.read_csv(PATH_NORMAL + 'f' + str(number) + '.txt',
                              names = ['index', column])
        # The only shared column is 'index', so that is the merge key.
        merged = feature if merged is None else pd.merge(merged, feature, how='outer')

    # Drop the leading 'index' column; .iloc replaces the removed .ix indexer.
    values = merged.iloc[:, 1:]

    # Column-wise z-score; a zero std gives NaN, which is zero-filled below.
    values = (values - values.mean()) / values.std()
    values = values.fillna(0)

    values.to_csv(PATH_NORMAL + 'Normal_Features.txt',  index = False)

# Format S2V & Topic Features

In [7]:
from shutil import copyfile

In [8]:
def remove_first_line(file_name) :
    """Rewrite ``file_name`` in place without its first line.

    The whole file is read into memory, split into lines (line endings kept),
    and everything after the first line is written back. An empty file stays
    empty.
    """
    with open(file_name, 'r') as source:
        pieces = source.read().splitlines(True)

    remaining = pieces[1:]

    with open(file_name, 'w') as target:
        target.writelines(remaining)

In [9]:
# USE S2V model to generate S2V features
def generate_s2v_features() :
    # Copy the LDA source text so that it also serves as the S2V input file.
    # Both paths are hard-coded relative to the notebook's working directory.
    copyfile('./Features/Topic/TEST_LDA_SOURCE.txt', './Features/S2V/TEST_S2V_SOURCE.txt')
    # Execute the external script through the IPython %run magic (this only
    # works inside IPython/Jupyter). The path is absolute and machine-specific.
    # NOTE(review): presumably demo.py produces TEST_S2V_SOURCE.txt.vec, which
    # format_s2v_featues() reads afterwards - confirm against demo.py.
    %run /home/zpgao/ML/Best_Answer/Zhihu/Step05_ranking_model/TEST/Features/S2V/demo.py

In [10]:
def format_s2v_featues(PATH_S2V) :
    """Turn the raw S2V vector file into a standardised feature CSV.

    Strips the first line of ``PATH_S2V + 'TEST_S2V_SOURCE.txt.vec'`` in place,
    reads the rest as a space-separated table with columns s2v_0 .. s2v_49,
    z-scores every column and writes ``PATH_S2V + 'S2V_Features.csv'``
    (UTF-8, no index column).

    Note: the .vec file itself is modified, so calling this twice on the same
    file removes two lines.
    """
    vec_path = PATH_S2V + 'TEST_S2V_SOURCE.txt.vec'
    column_names = ['s2v_' + str(i) for i in range(50)]

    # The first row of the .vec file must be removed before parsing.
    remove_first_line(vec_path)
    vectors = pd.read_table(vec_path, sep = ' ', names = column_names)

    # Very important: column-wise normalisation (zero mean, unit std).
    standardized = (vectors - vectors.mean()) / vectors.std()

    standardized.to_csv(PATH_S2V + 'S2V_Features.csv', encoding='utf-8', index = False)

In [11]:
def generate_topic_features():
    # Execute the external topic-feature script through the IPython %run magic
    # (this only works inside IPython/Jupyter). The path is absolute and
    # machine-specific.
    # NOTE(review): presumably the script writes Raw_Topic_Features.csv, which
    # format_topic_features() reads afterwards - confirm against the script.
    %run /home/zpgao/ML/Best_Answer/Zhihu/Step05_ranking_model/TEST/Features/Topic/Generate_Topic_Features.py

In [12]:
# format the topic features
def format_topic_features(PATH_LDA) :
    """Normalise the raw topic features and write Topic_Features.csv.

    Reads ``PATH_LDA + 'Raw_Topic_Features.csv'`` as a header-less table with
    columns topic_0 .. topic_49, rescales every row to the range [0, 1]
    (min-max by row), then z-scores every column, and writes the result to
    ``PATH_LDA + 'Topic_Features.csv'`` (UTF-8, no index column).
    """
    column_names = ['topic_' + str(i) for i in range(50)]

    raw_topics = pd.read_csv(PATH_LDA + 'Raw_Topic_Features.csv',
                             names = column_names, index_col=False)

    # Very important: normalisation by row. Shift each row so its minimum is
    # 0, then divide by the row's remaining spread.
    shifted = raw_topics.sub( raw_topics.min(axis=1), axis=0 )
    row_span = shifted.max(axis=1) - shifted.min(axis=1)
    row_scaled = shifted.div( row_span, axis=0 )

    # Normalise by column (zero mean, unit std).
    column_scaled = (row_scaled - row_scaled.mean()) / row_scaled.std()

    column_scaled.to_csv(PATH_LDA + 'Topic_Features.csv', encoding='utf-8', index=False)

In [13]:
def merge_features(PATH_NORMAL, PATH_S2V, PATH_LDA, PATH_FINAL, FILE) :
    """Join the three feature tables row-by-row and write the final CSV.

    Reads ``Normal_Features.txt`` (PATH_NORMAL), ``S2V_Features.csv``
    (PATH_S2V) and ``Topic_Features.csv`` (PATH_LDA), merges them on their
    row index (only rows present in all tables are kept) and writes the
    combined table to ``PATH_FINAL + FILE`` as UTF-8 without an index column.
    All path arguments are plain string prefixes.
    """
    sources = [
        PATH_NORMAL + 'Normal_Features.txt',
        PATH_S2V + 'S2V_Features.csv',
        PATH_LDA + 'Topic_Features.csv',
    ]
    normal_part, s2v_part, topic_part = [pd.read_csv(path) for path in sources]

    combined = normal_part.merge(s2v_part, left_index=True, right_index=True)
    combined = combined.merge(topic_part, left_index=True, right_index=True)

    combined.to_csv(PATH_FINAL + FILE, encoding='utf-8', index=False)

In [24]:
# Configuration for the driver loop below.
# LEVEL selects the data-set directory to process; switch by (un)commenting.
# LEVEL = './1000x5/'
# LEVEL = './1000x6/'
# LEVEL = './1000x7/'
# LEVEL = './1000x8/'
# LEVEL = './1000x9/'
LEVEL = './1000x10/'

# Working directories for intermediate feature files. They are shared by all
# input files, so each processed input overwrites the previous contents.
# All paths end with '/' because most functions use them as string prefixes.
PATH_NORMAL = './Features/Normal/'
PATH_TEXT = './Features/Text/'
PATH_LDA = './Features/Topic/'
PATH_S2V = './Features/S2V/'

# Input directory (scanned with os.listdir) and output directory for the
# merged feature files of the selected LEVEL.
PATH_FINAL_TEST_SET = LEVEL + 'Final_Test_Set/'
PATH_FINAL_TEST_FEATURES = LEVEL + 'Final_Test_Features/'

In [None]:
# Run the whole feature pipeline once for every file in PATH_FINAL_TEST_SET.
for fname in os.listdir(PATH_FINAL_TEST_SET) :
    # Skip the '.' entry and the files whose name starts with 'Len'.
    if fname == '.' or fname.startswith('Len') :
        continue

    PATH_IN = os.path.join(PATH_FINAL_TEST_SET, fname)
    print(PATH_IN)

    # Step 1: basic numeric, text and overlap features, then z-score them.
    extract_normal_features(PATH_IN, PATH_NORMAL)
    extract_text_features(PATH_IN, PATH_TEXT)
    extract_overlap_features(PATH_TEXT, PATH_TEXT, PATH_NORMAL)
    lda_source(PATH_IN, PATH_LDA)
    normalize(PATH_NORMAL)

    # Step 2: sentence-vector (S2V) features.
    generate_s2v_features()
    format_s2v_featues(PATH_S2V)

    # Step 3: topic features.
    generate_topic_features()
    format_topic_features(PATH_LDA)

    # Step 4: merge everything into one file named like the input.
    merge_features(PATH_NORMAL, PATH_S2V, PATH_LDA, PATH_FINAL_TEST_FEATURES, fname )

./1000x9/Final_Test_Set/Final_Test_08.csv
------------------------------------------------------


Inferred types from first line of file as 
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
./1000x9/Final_Test_Set/Final_Test_06.csv


------------------------------------------------------

  tf = pd.read_table('/home/zpgao/ML/Best_Answer/Zhihu/Step05_ranking_model/TEST/Features/Topic/Raw_Topic_Features.csv', sep = ' |"|\[|\]', header=None)
  tf = pd.read_table('/home/zpgao/ML/Best_Answer/Zhihu/Step05_ranking_model/TEST/Features/Topic/Raw_Topic_Features.csv', sep = ' |"|\[|\]', header=None)



Inferred types from first line of file as 
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
./1000x9/Final_Test_Set/Final_Test_05.csv


------------------------------------------------------


Inferred types from first line of file as 
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
./1000x9/Final_Test_Set/Final_Test_01.csv


------------------------------------------------------

  tf = pd.read_table('/home/zpgao/ML/Best_Answer/Zhihu/Step05_ranking_model/TEST/Features/Topic/Raw_Topic_Features.csv', sep = ' |"|\[|\]', header=None)
  tf = pd.read_table('/home/zpgao/ML/Best_Answer/Zhihu/Step05_ranking_model/TEST/Features/Topic/Raw_Topic_Features.csv', sep = ' |"|\[|\]', header=None)



Inferred types from first line of file as 
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
./1000x9/Final_Test_Set/Final_Test_00.csv


------------------------------------------------------


Inferred types from first line of file as 
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
./1000x9/Final_Test_Set/Final_Test_03.csv