In [1]:
import numpy as np
import pandas as pd
import time
import re
import scipy.sparse as sps
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler
import operator
from collections import OrderedDict
import nltk
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

In [2]:
train_path = 'train-contest.csv'
data = pd.read_csv(train_path)

#Попробуем добавить несколько статистик

##Информация о тексте

Разобьем BodyMarkdown на код и текст

In [6]:
data['Code'] = data.BodyMarkdown.apply(lambda text: [line for line in text.split('\n') if 
                                                     line.startswith('    ') or line.startswith('\t')])
data['Text'] = data.BodyMarkdown.apply(lambda text: [line for line in text.split('\n') if not
                                                     line.startswith('    ') and not line.startswith('\t')])

In [7]:
data.Text = data.Text.apply(lambda text: '\n'.join(text))

Посчитаем несколько статистик

In [25]:
data['NonWords'] = data.Text.apply(lambda text: len(re.findall('[A-Za-z\d_]+[ \.\?\,\!]', text)))
data['Digits'] = data.Text.apply(lambda text: len(re.findall('\d+', text)))
data['NonAlNums'] = data.Text.apply(lambda text: len(re.findall('[^\w\s]+', text)))
data['URLs'] = data.Text.apply(lambda text: len(re.findall('https?://', text)))
data[data.OpenStatus == 0].URLs.describe()

count    49978.000000
mean         0.237885
std          0.662161
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         17.000000
Name: URLs, dtype: float64

Удалим из текста не слова

In [26]:
data.Text = data.Text.apply(lambda text: re.sub('https?://\S', " ", text))
data.Text = data.Text.apply(lambda text: re.sub('[\r\t\n]+', " ", text))

In [27]:
data['Questions'] = data.Text.apply(lambda text: len(re.findall("\?", text)))
data['Exclamations'] = data.Text.apply(lambda text: len(re.findall("\!", text)))
data['Statements'] = data.Text.apply(lambda text: len(re.findall("\.", text)))

In [38]:
data['Questions_ratio'] = data['Questions'] / data['NumWords']
data['Exclamations_ratio'] = data['Exclamations'] / data['NumWords']

In [48]:
data['NonAlNums_ratio'] = data['NonWords'] / data['NumWords']

In [49]:
data[data.OpenStatus == 1].NonAlNums_ratio.describe()

count    49970.000000
mean         0.169964
std          0.027376
min          0.000000
25%          0.159091
50%          0.173913
75%          0.186528
max          0.320388
Name: NonAlNums_ratio, dtype: float64

In [50]:
data[data.OpenStatus == 0].NonAlNums_ratio.describe()

count    49960.000000
mean         0.167898
std          0.024232
min          0.000000
25%          0.156463
50%          0.171285
75%          0.183246
max          0.400000
Name: NonAlNums_ratio, dtype: float64

In [464]:
# data.Text = data.Text.apply(lambda text: re.sub("[^\w]", " ", text))
data.Text = data.Text.apply(lambda text: re.sub("\. "," . ", text))
data.Text = data.Text.apply(lambda text: re.sub("[\,\!\?] ","  ", text))
data.Text = data.Text.apply(lambda text: re.sub(" ","  ", text))
# data.Text = data.Text.apply(lambda text: "  ".join(text.split()))

In [465]:
data.Text = data.Text.apply(lambda text: re.findall(' [A-Za-z][a-z]* ', text))

Число предложений в тексте

In [43]:
data['NumSentences'] = data.Text.apply(lambda text: len(nltk.sent_tokenize(" ".join(text))))

Стеммим слова

In [466]:
data.Text = data.Text.apply(lambda text: [word.lower() for word in text])

In [467]:
stemmer = nltk.stem.porter.PorterStemmer()
data.Text = data.Text.apply(lambda text: [stemmer.stem(word) for word in text])

Длина текста в символах

In [32]:
data['NumWords'] = data.Text.apply(lambda x: len(x))

Длина кода в символах

In [28]:
data['CodeLen'] = data['Code'].apply(lambda x: len("".join(x)))

Стэммим заголовки

In [22]:
data.Title = data.Title.apply(lambda text: re.sub("[^a-z0-9\^$#@*+/\-=]", " ", text.lower()).split())

In [23]:
stemmer = nltk.stem.porter.PorterStemmer()
data.Title = data.Title.apply(lambda text: [stemmer.stem(word) for word in text])

##Информация о пользователе

In [29]:
data['PostCreationDate'] = data['PostCreationDate'].apply(lambda x: time.strptime(x[:10], "%m/%d/%Y"))
data['OwnerCreationDate'] = data['OwnerCreationDate'].apply(lambda x: x if (x != '2008-09-01') else '01/09/2008')
data['OwnerCreationDate'] = data['OwnerCreationDate'].apply(lambda x: time.strptime(x[:10], "%m/%d/%Y"))
data['OwnerCreationDate'] = data['OwnerCreationDate'].apply(lambda x: time.mktime(x) / 86400)
data['PostCreationDate'] = data['PostCreationDate'].apply(lambda x: time.mktime(x) / 86400)
data['DaysBeforePost'] = data['PostCreationDate'] - data['OwnerCreationDate']
data[data['DaysBeforePost'] < 0].DaysBeforePost = -1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


##Теги

In [30]:
data['Tags'] = list(zip(data.Tag1.fillna(""), 
                               data.Tag2.fillna(""), data.Tag3.fillna(""), 
                               data.Tag4.fillna(""), data.Tag5.fillna(""))) 
data['Tags'] = data['Tags'].apply(lambda x: (" ".join(x).lower().split()))

#Тестовая и обучающая выборки

In [46]:
train_data = data[:70000]
test_data = data[70000:]

In [47]:
train_columns = [col for col in data.columns if col != 'OpenStatus' and col != 'PostClosedDate']
x_tr = train_data[train_columns]
y_tr = train_data.OpenStatus
x_test = test_data[train_columns]
y_test = test_data.OpenStatus

#Посчитаем tf-idf

In [33]:
data.Text[0]

'In My VB.NET web page, I have this standard event. Note the "Handles" clause on teh event declaration. In my C# web app, I have this: Since C# doesn\'t have a "Handles" equivalent and from what I\'ve seen, event handlers are wired up using delegate += syntax, I was looking for this, but I could not foind it in the aspx page, aspx.cs file or the aspx.designer.cs file. In VB, I would have two drop down lists at the top of  the Code Editor window and I could select any object on the web form or the web form itself and see the possible events for the object. Selecting the event would either take me to the event handler or if it didn\'t exists, it would create the stub for me. I know that the Properties window in C# (and I think in VB, too) has an Event tab that shows the list of events for the selected object GUI object, but "Page" doesn\'t appear as an object that can be selected. 1. Where does C# define the hooking up of the event to the handler?  2.  How do I generate a stub for the Pa

In [54]:
vect = TfidfVectorizer(min_df=1/len(x_tr), max_df=0.9, lowercase=False, tokenizer=lambda doc: doc)
text_tr_tfidf = vect.fit_transform(x_tr.Text)
text_test_tfidf = vect.transform(x_test.Text)
title_tr_tfidf = vect.fit_transform(x_tr.Title)
title_test_tfidf = vect.transform(x_test.Title)
tag_tr_tfidf = vect.fit_transform(x_tr.Tags)
tag_test_tfidf = vect.transform(x_test.Tags)

In [55]:
print(tag_tr_tfidf.shape)
print(title_tr_tfidf.shape)
print(text_tr_tfidf.shape)

(70000, 13938)
(70000, 234)
(70000, 1351)


In [40]:
def log_scaler(df):
    new_df = df.copy()
    if new_df.min() < 0:
        new_df -= new_df.min()
    new_df = np.log(new_df + 1)
    return (new_df - new_df.mean())/ (new_df.max() - new_df.min())

def norm_scaler(df):
    new_df = df.copy()
    return (new_df - new_df.mean())/ (new_df.max() - new_df.min())

In [51]:
cols_norm = ['CodeLen', 'NumSentences', 'NumWords', 'NonWords', 'Digits', 'NonAlNums']
cols_log = ['DaysBeforePost', 'ReputationAtPostCreation', 'OwnerUndeletedAnswerCountAtPostTime']

x_train_2 = np.array(norm_scaler(x_tr[cols_norm[1]]))[:, None]
x_test_2 = np.array(norm_scaler(x_test[cols_norm[1]]))[:, None]
for col in cols_norm[1:]:
    x_train_2 = np.hstack((x_train_2, np.array(norm_scaler(x_tr[col]))[:, None]))
    x_test_2 = np.hstack((x_test_2, np.array(norm_scaler(x_test[col]))[:, None]))
for col in cols_log:
    x_train_2 = np.hstack((x_train_2, np.array(norm_scaler(x_tr[col]))[:, None]))
    x_test_2 = np.hstack((x_test_2, np.array(norm_scaler(x_test[col]))[:, None]))


In [58]:
x_train_2 = sps.csr_matrix(sps.hstack((x_train_2, tag_tr_tfidf)))#, title_tr_tfidf, tag_tr_tfidf)))
x_test_2 = sps.csr_matrix(sps.hstack((x_test_2, tag_test_tfidf)))#, title_test_tfidf, tag_test_tfidf)))
# x_train_2 = sps.csr_matrix(tag_tr_tfidf)
# x_test_2 = sps.csr_matrix(tag_test_tfidf)

In [59]:
clf_linear = LogisticRegression()
clf_linear.fit(x_train_2, y_tr)
predicted_y_test = clf_linear.predict_proba(x_test_2)
print(roc_auc_score(y_test, predicted_y_test[:, 1]))

0.721352747637


#Отбор слов

In [474]:
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

Частоты в положительных примерах

In [475]:
for text in x_tr.Text[y_tr == 1]:
    for word in text:
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1

Частоты в отрицательных примерах

In [476]:
for text in x_tr.Text[y_tr == 0]:
    for word in text:
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

In [477]:
pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

In [478]:
word_scores = {}
for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
        (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
        (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

In [479]:
best = sorted(word_scores.items(), key=operator.itemgetter(1))

In [113]:
word_fd['flux']

2

In [110]:
label_word_fd['neg']['flux']

1

In [111]:
total_word_count['flux']

TypeError: 'int' object is not subscriptable

In [483]:
list(bestwords)[:100]

[' switches ',
 ' central ',
 ' thros ',
 ' teenager ',
 ' process ',
 ' normalize ',
 ' madan ',
 ' liquid ',
 ' lunched ',
 ' flour ',
 ' acocunt ',
 ' consumption ',
 ' ted ',
 ' charaters ',
 ' deffinately ',
 ' wampserver ',
 ' linke ',
 ' recordsets ',
 ' inoperative ',
 ' admit ',
 ' hows ',
 ' dealt ',
 ' transcribed ',
 ' diplaying ',
 ' listfield ',
 ' colons ',
 ' opportunistic ',
 ' flashyness ',
 ' ferda ',
 ' tmp ',
 ' schemagen ',
 ' shiro ',
 ' boned ',
 ' shifting ',
 ' gemset ',
 ' greyed ',
 ' otherways ',
 ' fart ',
 ' fallows ',
 ' unveil ',
 ' frictionless ',
 ' databased ',
 ' intrigues ',
 ' authentificator ',
 ' modernizr ',
 ' unfortunatelly ',
 ' dolphin ',
 ' looked ',
 ' invalide ',
 ' hats ',
 ' ideallyin ',
 ' combos ',
 ' aplicattion ',
 ' curios ',
 ' flagging ',
 ' vase ',
 ' logarithmic ',
 ' vale ',
 ' blades ',
 ' sore ',
 ' terabytes ',
 ' din ',
 ' dups ',
 ' pie ',
 ' heallth ',
 ' xmldom ',
 ' modularity ',
 ' respnse ',
 ' boiler ',
 ' mitigate

In [493]:
bestwords = set([w for w, s in best[:20000]])

Формируем словарь

In [494]:
vocabulary = {}
for word in list(bestwords):
    vocabulary.setdefault(word, len(vocabulary))

In [495]:
len(vocabulary)

20000

In [496]:
indptr, indices, dt = [0], [], []
for article in x_tr.Text:
    for word in article:
        if word in vocabulary.keys():
            index = vocabulary[word]
            indices.append(index)
            dt.append(1)
    indptr.append(len(indices))
text = sps.csr_matrix((dt, indices, indptr), dtype=float)

In [497]:
indptr, indices, dt = [0], [], []
for article in x_test.Text:
    for word in article:
        if word in vocabulary.keys():
            index = vocabulary[word]
            indices.append(index)
            dt.append(1)
    indptr.append(len(indices))
text_test = sps.csr_matrix((dt, indices, indptr), dtype=float, shape=(len(x_test.Text), text.shape[1]))

In [498]:
transformer = TfidfTransformer()
text_tfidf = transformer.fit_transform(text)
text_test_tfidf = transformer.fit_transform(text_test)

In [499]:
code = np.array(new_scaler(x_tr['CodeLen']))[:, None]
code_test = np.array(new_scaler(x_test['CodeLen']))[:, None]
# code_scaled = x_tr['CodeLen']
# code_scaled[code_scaled > 0] = 1
# code_test_scaled = x_tr['CodeLen']
# code_test_scaled[code_scaled > 0] = 1
num_sent = np.array(new_scaler(x_tr['NumSentences']))[:, None]
num_sent_test = np.array(new_scaler(x_test['NumSentences']))[:, None]
num_words = np.array(new_scaler(x_tr['NumWords']))[:, None]
num_words_test = np.array(new_scaler(x_test['NumWords']))[:, None]
days = np.array(new_scaler(x_tr['DaysBeforePost']))[:, None]
days_test = np.array(new_scaler(x_test['DaysBeforePost']))[:, None]
reputations = np.array(new_scaler(x_tr['ReputationAtPostCreation']))[:, None]
reputations_test = np.array(new_scaler(x_test['ReputationAtPostCreation']))[:, None]
answers = np.array(new_scaler(x_tr['OwnerUndeletedAnswerCountAtPostTime']))[:, None]
answers_test = np.array(new_scaler(x_test['OwnerUndeletedAnswerCountAtPostTime']))[:, None]

x_train_2 = np.hstack((code, num_sent, num_words, days, reputations, answers))
x_test_2 = np.hstack((code_test, num_sent_test, num_words_test, days_test, reputations_test, answers_test))
x_train_2 = sps.csr_matrix(sps.hstack((x_train_2, text_tfidf)))
x_test_2 = sps.csr_matrix(sps.hstack((x_test_2, text_test_tfidf)))

In [500]:
x_train_2.shape

(70000, 20006)

In [501]:
clf_linear = LogisticRegression()
clf_linear.fit(x_train_2, y_tr)
predicted_y_test = clf_linear.predict_proba(x_test_2)
print(roc_auc_score(y_test, predicted_y_test[:, 1]))

0.720487606391
