In [1]:
import os
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from snownlp import SnowNLP
from nltk.corpus import stopwords
from collections import Counter
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score


%matplotlib inline

# Load the two input CSV files; both frames are read below through their
# `question1` / `question2` columns.
train=pd.read_csv('input/train.csv')
test=pd.read_csv('input/test.csv')

# Pool every question of the training file into one string Series:
# all question1 entries first, followed by all question2 entries.
# astype(str) also turns missing values into text.

train_qs=pd.Series(train['question1'].tolist()+
                   train['question2'].tolist()).astype(str)

# Same pooling for the test file.
test_qs=pd.Series(test['question1'].tolist()+
                  test['question2'].tolist()).astype(str)

# NLTK English stopword list, held as a set for membership tests.
stops=set(stopwords.words("english"))

# Share of non-stopword words that two questions have in common.
def word_match_share(row):
    """Return the fraction of distinct non-stopword words shared by a question pair.

    `row` must support ``row['question1']`` and ``row['question2']``. Each
    question is converted with ``str``, lower-cased and split on whitespace;
    words found in the module-level ``stops`` set are ignored.

    Returns 0 when either question has no remaining words. Otherwise returns
    (shared words counted once per question) / (distinct words in question1 +
    distinct words in question2), a float between 0 and 1.
    """
    def content_words(text):
        # Distinct lower-cased tokens that are not stopwords.
        return set(token for token in str(text).lower().split()
                   if token not in stops)

    first = content_words(row['question1'])
    second = content_words(row['question2'])
    if not first or not second:
        return 0

    overlap = first & second
    # A shared word is counted on both sides, matching the two-sided denominator.
    return (2 * len(overlap) + 0.0) / (len(first) + len(second))

def get_weight(count,eps=10000,min_count=2):
    """Return the weight for a word that occurs `count` times.

    Words seen fewer than `min_count` times get weight 0. Every other word
    gets ``1.0 / (count + eps)``, so more frequent words weigh less and `eps`
    damps the weight of words with small counts.
    """
    too_rare = count < min_count
    return 0 if too_rare else 1.0 / (count + eps)

# NOTE(review): `eps` is assigned here, but get_weight() is called below
# without it, so the function's own default `eps` is what takes effect.
# Confirm which value was intended before relying on this variable.
eps = 5000

# Occurrence count of every lower-cased, whitespace-separated token in the
# pooled training questions.
words = " ".join(train_qs).lower().split()
counts = Counter(words)

# token -> weight lookup used by tfidf_word_match_share.
weights = dict((token, get_weight(occurrences))
               for token, occurrences in counts.items())
    
def tfidf_word_match_share(row):
    """Return the weighted share of non-stopword words common to a question pair.

    `row` must support ``row['question1']`` and ``row['question2']``. Each
    question is converted with ``str``, lower-cased and split on whitespace;
    words in the module-level ``stops`` set are ignored. Word weights come
    from the module-level ``weights`` dict; a word missing from it weighs 0.

    Returns 0 when either question has no remaining words, and also when the
    remaining words carry no weight at all (previously that case divided
    0 by 0 and produced NaN with a NumPy RuntimeWarning). Otherwise returns
    sum(weights of shared words, counted once per question) /
    sum(weights of all distinct words of both questions).
    """
    q1words = {}
    q2words = {}
    for word in str(row['question1']).lower().split():
        if word not in stops:
            q1words[word] = 1
    for word in str(row['question2']).lower().split():
        if word not in stops:
            q2words[word] = 1
    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    shared_weights = ([weights.get(w, 0) for w in q1words.keys() if w in q2words] +
                      [weights.get(w, 0) for w in q2words.keys() if w in q1words])
    total_weights = ([weights.get(w, 0) for w in q1words] +
                     [weights.get(w, 0) for w in q2words])

    total = np.sum(total_weights)
    if total == 0:
        # Every remaining word has weight 0 (unseen or below the minimum count):
        # there is nothing to share, so report no overlap instead of 0/0.
        return 0

    return (np.sum(shared_weights) + 0.0) / total

def sentence_sentiment_diff(row):
    """Return the squared difference between the two questions' sentiment values.

    `row` must support ``row['question1']`` and ``row['question2']``. Each
    question is converted with ``str`` and passed to ``SnowNLP``; the value
    compared is whatever its ``sentiments`` attribute reports.
    """
    first, second = [SnowNLP(str(row[column])).sentiments
                     for column in ('question1', 'question2')]
    gap = first - second
    return gap * gap


# Progress marker: the setup cell finished.
print('over')

  interactivity=interactivity, compiler=compiler, result=result)


over


In [2]:
print('construct dataset')

# Empty feature tables; the following cells add one column per feature.
x_train, x_test = pd.DataFrame(), pd.DataFrame()

In [6]:
# Build the train / validation / test feature sets.
# This cell: word-overlap features for the training pairs.

print('construct train data feature')

# Rows are passed as Series (the apply default) because both feature functions
# read them by column label (row['question1']); with raw=True pandas may hand
# over plain ndarrays, which cannot be indexed by label.
x_train['word_match'] = train.apply(word_match_share, axis=1)
x_train['tfidf_word_match'] = train.apply(tfidf_word_match_share, axis=1)

print('over')

construct train data feature




over


In [7]:
print('construct test data feature')

# Word-overlap features for the test pairs. Rows are passed as Series (the
# apply default) because both feature functions read them by column label
# (row['question1']); with raw=True pandas may hand over plain ndarrays,
# which cannot be indexed by label.
x_test['word_match'] = test.apply(word_match_share, axis=1)
x_test['tfidf_word_match'] = test.apply(tfidf_word_match_share, axis=1)

print('over')

construct test data feature




over


In [6]:
print('sentence_sentiment')

# Squared sentiment gap per question pair. Each split is scored from its own
# rows: the test column was previously computed from `train`, which filled
# x_test with values belonging to unrelated training pairs.
# Rows are passed as Series (no raw=True) because sentence_sentiment_diff
# reads them by column label.
x_train['sentiment'] = train.apply(sentence_sentiment_diff, axis=1)
x_test['sentiment'] = test.apply(sentence_sentiment_diff, axis=1)

print('over')


sentence_sentiment
over


In [17]:
import cPickle
import pandas as pd
import numpy as np
import gensim
import math
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize

print('basic feature ')


def _count_distinct_chars(text):
    """Return how many distinct non-space characters str(text) contains."""
    return len(''.join(set(str(text).replace(' ', ''))))


# (output column, per-question measure) pairs. Only the absolute difference
# between the two questions is stored; the raw per-question lengths and the
# common-word count that used to sit here as commented-out code are not emitted.
_LENGTH_MEASURES = [
    ('diff_len', lambda text: len(str(text))),              # characters
    ('diff_len_char', _count_distinct_chars),               # distinct non-space characters
    ('diff_len_word', lambda text: len(str(text).split())), # whitespace-separated words
]

# (output column, fuzzywuzzy scorer) pairs, in the order the columns are created.
_FUZZ_SCORERS = [
    ('fuzz_qratio', fuzz.QRatio),
    ('fuzz_WRatio', fuzz.WRatio),
    ('fuzz_partial_ratio', fuzz.partial_ratio),
    ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
    ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
    ('fuzz_token_set_ratio', fuzz.token_set_ratio),
    ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
]


def add_basic_features(questions, features):
    """Add the length-difference and fuzzy-match columns for `questions` to `features`.

    `questions` is a frame with `question1` / `question2` columns and
    `features` is the frame that receives one new column per entry of
    _LENGTH_MEASURES and _FUZZ_SCORERS, in that order. `features` is modified
    in place; nothing is returned. Train and test go through this single code
    path so their columns cannot drift apart.
    """
    for column, measure in _LENGTH_MEASURES:
        features[column] = abs(questions.question1.apply(measure) -
                               questions.question2.apply(measure))
    for column, scorer in _FUZZ_SCORERS:
        # scorer=scorer binds the current scorer to this lambda.
        features[column] = questions.apply(
            lambda row, scorer=scorer: scorer(str(row['question1']),
                                              str(row['question2'])),
            axis=1)


add_basic_features(train, x_train)

print('training data is over')

add_basic_features(test, x_test)

print('test data is over')


basic feature 
training data is over
test data is over


In [33]:
# One single-column frame per (split, side); the question2 frames are renamed
# so that all four share the column name `question1`.
df1 = train[['question1']].copy()
df2 = train[['question2']].rename(columns={'question2': 'question1'})
df1_test = test[['question1']].copy()
df2_test = test[['question2']].rename(columns={'question2': 'question1'})

# Stack all four frames. pd.concat replaces the chained DataFrame.append
# calls (append is deprecated and absent from pandas 2.x); the stacked result
# is the same.
train_questions = pd.concat([df1, df2, df1_test, df2_test])

print('origin train_questions length:')

print(len(train_questions))

# Drop duplicated questions in train_questions.
train_questions = train_questions.drop_duplicates(subset=['question1'])

print('after remove duplicates length:')

print(len(train_questions))

# Renumber the surviving questions 0..n-1; that position becomes the question id.
train_questions = train_questions.reset_index(drop=True)

# question text -> integer id
questions_dict = pd.Series(train_questions.index.values,
                           index=train_questions.question1.values).to_dict()

# Bring both splits to the same columns so they can be stacked:
# train loses qid1/qid2, test gets `id` and a placeholder label of -1.
train_cp = train.drop(['qid1', 'qid2'], axis=1)

test_cp = test.rename(columns={'test_id': 'id'})
test_cp['is_duplicate'] = -1
comb = pd.concat([train_cp, test_cp])


# Replace each question text by its integer id.
comb['q1_hash'] = comb['question1'].map(questions_dict)
comb['q2_hash'] = comb['question2'].map(questions_dict)

# id -> number of rows in which it appears as question1 / as question2.
q1_vc = comb.q1_hash.value_counts().to_dict()
q2_vc = comb.q2_hash.value_counts().to_dict()

def try_apply_dict(x,dict_to_apply):
    """Return ``dict_to_apply[x]``, or 0 when that lookup raises KeyError."""
    try:
        found = dict_to_apply[x]
    except KeyError:
        found = 0
    return found
# Map to frequency space: a question's frequency is the number of rows in
# which its id appears as question1 plus the number in which it appears as
# question2 (ids absent from a count table contribute 0).
for side in ('q1', 'q2'):
    comb[side + '_freq'] = comb[side + '_hash'].map(
        lambda qid: try_apply_dict(qid, q1_vc) + try_apply_dict(qid, q2_vc))

# Split the stacked frame back apart: test rows carry the placeholder label -1.
shared_columns = ['id', 'q1_hash', 'q2_hash', 'q1_freq', 'q2_freq']
train_comb = comb[comb['is_duplicate'] >= 0][shared_columns + ['is_duplicate']]
test_comb = comb[comb['is_duplicate'] < 0][shared_columns]

# Copy the two frequency columns into the feature tables of each split.
for features, freqs in ((x_train, train_comb), (x_test, test_comb)):
    features['q1_freq'] = freqs['q1_freq']
    features['q2_freq'] = freqs['q2_freq']

print('over')


origin train_questions length:
5500172
after remove duplicates length:
4789515
over


In [38]:
print('rebalance data')

y_train = train['is_duplicate'].values

# Feature rows of duplicate (positive) and non-duplicate (negative) pairs.
pos_train = x_train[y_train == 1]
neg_train = x_train[y_train == 0]

# Oversample the negatives to lower the share of positives; `p` is the share
# the scale factor is derived from.
p = 0.165
positive_share = (len(pos_train) + 0.0) / (len(pos_train) + len(neg_train))
scale = (positive_share / p) - 1
print(scale)

# Whole multiples: double the negatives once per unit of `scale` above 1.
while scale > 1:
    neg_train = pd.concat([neg_train] * 2)
    scale -= 1

# Fractional remainder: append that fraction of the current negatives.
extra_rows = int(scale * len(neg_train))
neg_train = pd.concat([neg_train, neg_train[:extra_rows]])
print((len(pos_train) + 0.0) / (len(pos_train) + len(neg_train)))

# Positives first, then negatives; the labels are laid out in the same order.
x_train_r = pd.concat([pos_train, neg_train])
y_train_r = [1.0] * len(pos_train) + [0.0] * len(neg_train)
del pos_train, neg_train

# Hold out 20% of the rebalanced rows for validation.
x_train_r, x_valid_r, y_train_r, y_valid_r = train_test_split(
    x_train_r, y_train_r, test_size=0.2, random_state=4242)

print('over')

rebalance data
1.2401128118
0.191204686368
over


In [41]:

# XGBoost training

print('Xgboost train')

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 4,
}

d_train = xgb.DMatrix(x_train_r, label=y_train_r)
d_valid = xgb.DMatrix(x_valid_r, label=y_valid_r)

# The last watchlist entry ('valid') is the one early stopping monitors,
# as the training log reports.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Up to 500 rounds; stop once valid-logloss has not improved for 50 rounds.
bst = xgb.train(params, d_train, 500, watchlist,
                early_stopping_rounds=50, verbose_eval=10)

# Present the test features in exactly the training column order. The feature
# cells add columns to x_train and x_test independently, so running them in a
# different order leaves the two frames with differently ordered columns.
# NOTE(review): xgboost is expected to reject mismatched feature names at
# predict time - confirm for the installed version.
d_test = xgb.DMatrix(x_test[list(x_train_r.columns)])
p_test = bst.predict(d_test)

# Submission file: one predicted duplicate probability per test_id.
sub = pd.DataFrame()
sub['test_id'] = test['test_id']
sub['is_duplicate'] = p_test
sub.to_csv('xgb.csv', index=False)

Xgboost train
[0]	train-logloss:0.579371	valid-logloss:0.578989
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.292229	valid-logloss:0.290679
[20]	train-logloss:0.26221	valid-logloss:0.261798
[30]	train-logloss:0.253969	valid-logloss:0.254726
[40]	train-logloss:0.248542	valid-logloss:0.251109
[50]	train-logloss:0.244687	valid-logloss:0.248718
[60]	train-logloss:0.241939	valid-logloss:0.247099
[70]	train-logloss:0.239794	valid-logloss:0.246067
[80]	train-logloss:0.237486	valid-logloss:0.244945
[90]	train-logloss:0.235822	valid-logloss:0.244094
[100]	train-logloss:0.233922	valid-logloss:0.243143
[110]	train-logloss:0.232155	valid-logloss:0.241936
[120]	train-logloss:0.230098	valid-logloss:0.241002
[130]	train-logloss:0.228986	valid-logloss:0.240632
[140]	train-logloss:0.22728	valid-logloss:0.239746
[150]	train-logloss:0.225677	valid-logloss:0.239283
[160]	train-logl

In [37]:
import nltk

# POS-tag and named-entity-chunk two phrasings of the same question.
# Needs the NLTK tagger resources to be installed: the recorded run of this
# cell stopped with a LookupError for the averaged_perceptron_tagger pickle.
sample_questions = [
    'What is like to have sex with cousin?.....',
    'What is it like to have sex with your cousin?',
]

for position, sentence in enumerate(sample_questions):
    if position:
        # Separator between the two samples.
        print('---------------')
    tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    print(tokens)
    tree = nltk.ne_chunk(tokens)
    print(tree)

LookupError: 
**********************************************************************
  Resource u'taggers/averaged_perceptron_tagger/averaged_perceptro
  n_tagger.pickle' not found.  Please use the NLTK Downloader to
  obtain the resource:  >>> nltk.download()
  Searched in:
    - '/home/djh/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

In [56]:
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

corpus=["What is the step by step guide to invest in share market in india?",
         "What is the story of Kohinoor (Koh-i-Noor) Diamond?",
         "What is like to have sex with cousin?"]
vectorizer=CountVectorizer()
transformer=TfidfTransformer()
tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus))
word=vectorizer.get_feature_names()
weight=tfidf.toarray()
print train_qs.tail()
for i in range(len(weight)):
    print u"--------",corpus[i]
    for j in range(len(word)):
        print word[j],weight[i][j]


808575    How many keywords are there in PERL Programmin...
808576           Is it true that there is life after death?
808577                                    What's this coin?
808578    I am having little hairfall problem but I want...
808579        What is it like to have sex with your cousin?
dtype: object
-------- What is the step by step guide to invest in share market in india?
by 0.251144897563
cousin 0.0
diamond 0.0
guide 0.251144897563
have 0.0
in 0.502289795126
india 0.251144897563
invest 0.251144897563
is 0.148330222231
koh 0.0
kohinoor 0.0
like 0.0
market 0.251144897563
noor 0.0
of 0.0
sex 0.0
share 0.251144897563
step 0.502289795126
story 0.0
the 0.19100216797
to 0.19100216797
what 0.148330222231
with 0.0
-------- What is the story of Kohinoor (Koh-i-Noor) Diamond?
by 0.0
cousin 0.0
diamond 0.370725138666
guide 0.0
have 0.0
in 0.0
india 0.0
invest 0.0
is 0.218956238963
koh 0.370725138666
kohinoor 0.370725138666
like 0.0
market 0.0
noor 0.370725138666
of 0.370725138666
s

In [68]:
# Rebuild the token counts and the token -> weight lookup from the pooled
# training questions (the same three steps as the earlier weights cell;
# get_weight runs with its default arguments).
words = " ".join(train_qs).lower().split()
counts = Counter(words)
weights = dict((token, get_weight(occurrences))
               for token, occurrences in counts.items())


2.5892380908e-06


In [14]:
from snownlp import SnowNLP
#s=SnowNLP(u'今天天气真好!')
# Two sample questions that differ mainly in one adjective.
text = u'''
How can I see all my bad Youtube comments
'''

text1 = u'''
How do I read and find my wonderful YouTube comments
'''

s = SnowNLP(text)
s1 = SnowNLP(text1)

# Top-5 keywords of each sample, each list followed by its divider line.
print('关键词：')
for document, divider in ((s, '-------------'), (s1, '------------------')):
    for keyword in document.keywords(5):
        print(keyword)
    print(divider)

# Sentiment value SnowNLP reports for each sample.
for document in (s, s1):
    print(document.sentiments)

关键词：
I
How
bad
Youtube
comments
-------------
wonderful
read
find
I
YouTube
------------------
0.0454071222115
0.10363455497
