In [59]:
import numpy as np
import pandas as pd
import time
import re
import matplotlib.pyplot as plt
import scipy.sparse as sps
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn import cross_validation
from sklearn.cross_validation import KFold
import operator
from collections import OrderedDict
import nltk
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from importlib import reload
%matplotlib inline

In [60]:
train_path = 'train-contest.csv'
data = pd.read_csv(train_path)

In [61]:
# reasons = pd.read_csv('reason.csv')
# data['Reason'] = reasons['OpenStatus']

In [62]:
test_path = 'test-contest-second.csv'
test_data = pd.read_csv(test_path)

#Feature Extraction

In [63]:
data['Code'] = data.BodyMarkdown.apply(lambda text: " ".join([line for line in text.split('\n') if 
                                                     line.startswith('    ') or line.startswith('\t')]))
data['Text'] = data.BodyMarkdown.apply(lambda text: " ".join([line for line in text.split('\n') if not
                                                     line.startswith('    ') and not line.startswith('\t')]))

In [64]:
data['SplittedCode'] = data.Code.apply(lambda code: re.sub("[^a-z]", " ", code.lower()).split())

In [65]:
data['CodeLen'] = data.SplittedCode.apply(len)

In [66]:
test_data['Code'] = test_data.BodyMarkdown.apply(lambda text: " ".join([line for line in text.split('\n') if 
                                                     line.startswith('    ') or line.startswith('\t')]))
test_data['Text'] = test_data.BodyMarkdown.apply(lambda text: " ".join([line for line in text.split('\n') if not
                                                     line.startswith('    ') and not line.startswith('\t')]))

In [67]:
test_data['SplittedCode'] = test_data.Code.apply(lambda code: re.sub("[^a-z]", " ", code.lower()).split())

In [68]:
test_data['CodeLen'] = test_data.SplittedCode.apply(len)

##Текст

In [69]:
data['NonWords'] = data.Text.apply(lambda text: len(re.findall('[A-Za-z\d_]+[ \.\?\,\!]', text)))
data['Digits'] = data.Text.apply(lambda text: len(re.findall('\d+', text)))
data['NonAlNums'] = data.Text.apply(lambda text: len(re.findall('[^\w\s]+', text)))
data['URLs'] = data.Text.apply(lambda text: len(re.findall('https?://', text)))

In [70]:
test_data['NonWords'] = test_data.Text.apply(lambda text: len(re.findall('[A-Za-z\d_]+[ \.\?\,\!]', text)))
test_data['Digits'] = test_data.Text.apply(lambda text: len(re.findall('\d+', text)))
test_data['NonAlNums'] = test_data.Text.apply(lambda text: len(re.findall('[^\w\s]+', text)))
test_data['URLs'] = test_data.Text.apply(lambda text: len(re.findall('https?://', text)))

In [71]:
data['Text'] = data.Text.apply(nltk.word_tokenize)
data['PureText'] = data.Text.apply(lambda text: [word for word in text if word[0].isalpha()])
data['NumWords'] = data.PureText.apply(lambda x: len(x))
data.Title = data.Title.apply(nltk.word_tokenize)
data['Sentences'] = data.Text.apply(lambda x:nltk.sent_tokenize(" ".join(x)))
data['NumSentences'] = data.Sentences.apply(len)
data['Questions'] = data.Sentences.apply(lambda x: len([sent for sent in x if sent[-1]=='?']))
data['Questions_ratio'] = data.Questions / (data.NumSentences + 1)
data.Questions_ratio = data.Questions_ratio.fillna(0)
data['NumLastQuestions'] = data.Sentences.apply(lambda text:
                    [sent[-1] for sent in text[-5:]].count('?'))


In [72]:
test_data['Text'] = test_data.Text.apply(nltk.word_tokenize)
test_data['PureText'] = test_data.Text.apply(lambda text: [word for word in text if word[0].isalpha()])
test_data['NumWords'] = test_data.PureText.apply(lambda x: len(x))
test_data.Title = test_data.Title.apply(nltk.word_tokenize)
test_data['Sentences'] = test_data.Text.apply(lambda x:nltk.sent_tokenize(" ".join(x)))
test_data['NumSentences'] = test_data.Sentences.apply(len)
test_data['Questions'] = test_data.Sentences.apply(lambda x: len([sent for sent in x if sent[-1]=='?']))
test_data['Questions_ratio'] = test_data.Questions / (data.NumSentences + 1)
test_data.Questions_ratio = test_data.Questions_ratio.fillna(0)
test_data['NumLastQuestions'] = test_data.Sentences.apply(lambda text:
                    [sent[-1] for sent in text[-5:]].count('?'))


In [73]:
test_data.Questions_ratio.describe()

count    118172.000000
mean          0.216165
std           0.816060
min           0.000000
25%           0.000000
50%           0.142857
75%           0.333333
max         259.000000
Name: Questions_ratio, dtype: float64

##Пользователь

In [74]:
data['PostCreationDate'] = data['PostCreationDate'].apply(lambda x: time.strptime(x[:10], "%m/%d/%Y"))
data['OwnerCreationDate'] = data['OwnerCreationDate'].apply(lambda x: x if (x != '2008-09-01') else '01/09/2008')
data['OwnerCreationDate'] = data['OwnerCreationDate'].apply(lambda x: time.strptime(x[:10], "%m/%d/%Y"))
data['OwnerCreationDate'] = data['OwnerCreationDate'].apply(lambda x: time.mktime(x) / 86400)
data['PostCreationDate'] = data['PostCreationDate'].apply(lambda x: time.mktime(x) / 86400)
data['DaysBeforePost'] = data['PostCreationDate'] - data['OwnerCreationDate']
data[data['DaysBeforePost'] < 0].DaysBeforePost = -1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [75]:
test_data['PostCreationDate'] = test_data['PostCreationDate'].apply(lambda x: time.strptime(x[:10], "%m/%d/%Y"))
test_data['OwnerCreationDate'] = test_data['OwnerCreationDate'].apply(lambda x: x if (x != '2008-09-01') else '01/09/2008')
test_data['OwnerCreationDate'] = test_data['OwnerCreationDate'].apply(lambda x: time.strptime(x[:10], "%m/%d/%Y"))
test_data['OwnerCreationDate'] = test_data['OwnerCreationDate'].apply(lambda x: time.mktime(x) / 86400)
test_data['PostCreationDate'] = test_data['PostCreationDate'].apply(lambda x: time.mktime(x) / 86400)
test_data['DaysBeforePost'] = test_data['PostCreationDate'] - data['OwnerCreationDate']
test_data[test_data['DaysBeforePost'] < 0].DaysBeforePost = -1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


##Теги

In [76]:
data['Tags'] = list(zip(data.Tag1.fillna(""), 
                               data.Tag2.fillna(""), data.Tag3.fillna(""), 
                               data.Tag4.fillna(""), data.Tag5.fillna(""))) 
data['Tags'] = data['Tags'].apply(lambda x: (" ".join(x).lower().split()))
data['EmptyTags'] = data['Tags'].apply(lambda tagset: len(tagset))

In [77]:
test_data['Tags'] = list(zip(test_data.Tag1.fillna(""), 
                               test_data.Tag2.fillna(""), test_data.Tag3.fillna(""), 
                               test_data.Tag4.fillna(""), test_data.Tag5.fillna(""))) 
test_data['Tags'] = test_data['Tags'].apply(lambda x: (" ".join(x).lower().split()))
test_data['EmptyTags'] = test_data['Tags'].apply(lambda tagset: len(tagset))

#Тестовая и обучающая выборки

In [78]:
training_data = data[:70000]
testing_data = data[70000:]
train_columns = [col for col in data.columns if col != 'OpenStatus' and col != 'PostClosedDate']
x_tr = training_data[train_columns]
y_tr = training_data.OpenStatus
x_test = testing_data[train_columns]
y_test = testing_data.OpenStatus

In [79]:
train_columns = [col for col in data.columns if col != 'OpenStatus' and col != 'PostClosedDate' and col != 'Reason']
x_tr = data[train_columns]
y_tr = data.OpenStatus
x_test = test_data[train_columns]


In [80]:
x_tr.shape

(100000, 31)

In [81]:
x_test.shape

(118172, 31)

##TF—IDF

In [92]:
vect = TfidfVectorizer(lowercase=False, tokenizer=lambda doc: doc)
text_tr_tfidf = vect.fit_transform(x_tr.Text)
text_test_tfidf = vect.transform(x_test.Text)
title_tr_tfidf = vect.fit_transform(x_tr.Title)
title_test_tfidf = vect.transform(x_test.Title)
tag_tr_tfidf = vect.fit_transform(x_tr.Tags)
tag_test_tfidf = vect.transform(x_test.Tags)
code_tr_tfidf = vect.fit_transform(x_tr.SplittedCode)
code_test_tfidf = vect.transform(x_test.SplittedCode)

2-граммы для названия

In [93]:
x_tr.Title = x_tr.Title.apply(" ".join)
x_test.Title = x_test.Title.apply(" ".join)
bigram_vectorizer = CountVectorizer(ngram_range=(2,2), token_pattern=r'\b\w+\b', min_df=1)
bititle_tr = bigram_vectorizer.fit_transform(x_tr.Title)
bititle_test = bigram_vectorizer.transform(x_test.Title)
transformer = TfidfTransformer()
bititle_test = transformer.fit_transform(bititle_test)
bititle_tr = transformer.fit_transform(bititle_tr)
x_tr.Title = x_tr.Title.apply(lambda text: text.split())
x_test.Title = x_test.Title.apply(lambda text: text.split())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


2-граммы для текста

In [94]:
x_tr.PureText = x_tr.PureText.apply(" ".join)
x_test.PureText = x_test.PureText.apply(" ".join)
bigram_vectorizer = CountVectorizer(ngram_range=(2,2), token_pattern=r'\b\w+\b', min_df=1)
bigrams_tr = bigram_vectorizer.fit_transform(x_tr.PureText)
bigrams_test = bigram_vectorizer.transform(x_test.PureText)
transformer = TfidfTransformer()
bigrams_test = transformer.fit_transform(bigrams_test)
bigrams_tr = transformer.fit_transform(bigrams_tr)
x_tr.PureText = x_tr.PureText.apply(lambda text: text.split())
x_test.PureText = x_test.PureText.apply(lambda text: text.split())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [95]:
print(tag_tr_tfidf.shape)
print(title_tr_tfidf.shape)
print(text_tr_tfidf.shape)
print(code_tr_tfidf.shape)
print(bigrams_tr.shape)
print(bititle_tr.shape)

(100000, 16129)
(100000, 54956)
(100000, 289729)
(100000, 146285)
(100000, 1409456)
(100000, 290929)


#Шкалирование

In [96]:
def log_scaler(df):
    new_df = df.copy()
    quant = new_df.quantile(0.9)
    if new_df.min() < 0:
        new_df -= new_df.min()
    new_df[new_df > quant] = np.log(new_df[new_df > quant] - quant + 1) + quant
    return 0.5 + (new_df - new_df.mean()) / (new_df.std())

def norm_scaler(df):
    new_df = df.copy()
    return 0.5 + (new_df - new_df.mean()) / new_df.std()

#Отбор тегов, слов, биграм, ...

In [97]:
def choose_vocabulary(pos_df, neg_df, new_vocab_len):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    
    for text in pos_df:
        for word in text:
            word_fd[word.lower()] += 1
            label_word_fd['pos'][word.lower()] += 1
    
    for text in neg_df:
        for word in text:
            word_fd[word.lower()] += 1
            label_word_fd['neg'][word.lower()] += 1
    
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
            (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
            (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    
    best = sorted(word_scores.items(), key=operator.itemgetter(1))
    bestwords = set([w for w, s in best[-new_vocab_len:]])
    vocabulary = {}
    for word in list(bestwords):
        vocabulary.setdefault(word, len(vocabulary))
    return vocabulary, best

def text_to_sparse_mat(df, vocabulary, shape=None):
    indptr, indices, dt = [0], [], []
    for text in df:
        for word in text:
            if word in vocabulary.keys():
                index = vocabulary[word]
                indices.append(index)
                dt.append(1)
        indptr.append(len(indices))
    if shape is None:
        mat = sps.csr_matrix((dt, indices, indptr), dtype=float)
    else:
        mat = sps.csr_matrix((dt, indices, indptr), dtype=float, shape=shape)
    return mat

In [98]:
def choose_bigram_vocabulary(pos_df, neg_df, new_vocab_len):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    
    for text in pos_df:
        bigrams_lst = list(nltk.bigrams(text))
        for bigram in bigrams_lst:
            word_fd[bigram] += 1
            label_word_fd['pos'][bigram] += 1
    
    for text in neg_df:
        bigrams_lst = list(nltk.bigrams(text))
        for bigram in bigrams_lst:
            word_fd[bigram] += 1
            label_word_fd['neg'][bigram] += 1
    
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
            (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
            (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    
    best = sorted(word_scores.items(), key=operator.itemgetter(1))
    bestwords = set([w for w, s in best[-new_vocab_len:]])
    vocabulary = {}
    for word in list(bestwords):
        vocabulary.setdefault(word, len(vocabulary))
    return vocabulary, list(best)

def text_to_sparse_mat_bigrams(df, vocabulary, shape=None):
    indptr, indices, dt = [0], [], []
    for text in df:
        bigrams_lst = list(nltk.bigrams(text))
        for bigram in bigrams_lst:
            if bigram in vocabulary.keys():
                index = vocabulary[bigram]
                indices.append(index)
                dt.append(1)
        indptr.append(len(indices))
    if shape is None:
        mat = sps.csr_matrix((dt, indices, indptr), dtype=float)
    else:
        mat = sps.csr_matrix((dt, indices, indptr), dtype=float, shape=shape)
    return mat

##Текст

In [99]:
vocabulary, best_words = choose_vocabulary(x_tr.Text[y_tr == 1], x_tr.Text[y_tr == 0], 1000)
text_tr_mat = text_to_sparse_mat(x_tr.Text, vocabulary)
text_test_mat = text_to_sparse_mat(x_test.Text, vocabulary, shape=(x_test.shape[0], len(vocabulary)))
best_words[-10:]

[('java', 2811.93520702576),
 ('language', 3142.0871255997376),
 ('me', 3166.5549680375766),
 ('software', 3197.71838825905),
 ('learn', 3581.6487397716087),
 ('good', 3615.6163112307204),
 ('`', 4137.826665640388),
 ('programming', 4758.714800701567),
 ('you', 4788.310096527604),
 ('the', 17592.73803573673)]

In [100]:
transformer = TfidfTransformer()
small_text_tr_tfidf = transformer.fit_transform(text_tr_mat)
small_text_test_tfidf = transformer.fit_transform(text_test_mat)

##Теги

In [101]:
vocabulary, best_words = choose_vocabulary(x_tr.Tags[y_tr == 1], x_tr.Tags[y_tr == 0], 100)
tags_tr_mat = text_to_sparse_mat(x_tr.Tags, vocabulary)
tags_test_mat = text_to_sparse_mat(x_test.Tags, vocabulary, shape=(x_test.shape[0], len(vocabulary)))
best_words[-10:]

[('java', 422.1907041883227),
 ('interview-questions', 434.8354208020202),
 ('algorithm', 442.55205130741695),
 ('c', 504.77345034852806),
 ('career-development', 574.574356555618),
 ('ubuntu', 589.1049776425392),
 ('programming-languages', 621.9728027923425),
 ('linux', 969.8675520750619),
 ('books', 1147.7683700768878),
 ('php', 1580.4487033053413)]

In [102]:
transformer = TfidfTransformer()
small_tag_tr_tfidf = transformer.fit_transform(tags_tr_mat)
small_tag_test_tfidf = transformer.fit_transform(tags_test_mat)

##Название

In [103]:
vocabulary, best_words = choose_vocabulary(x_tr.Title[y_tr == 1], x_tr.Title[y_tr == 0], 500)
title_tr_mat = text_to_sparse_mat(x_tr.Title, vocabulary)
title_test_mat = text_to_sparse_mat(x_test.Title, vocabulary, shape=(x_test.shape[0], len(vocabulary)))
best_words[-10:]

[('php', 789.4289349894175),
 ('software', 900.5740261519511),
 ('books', 959.1665705520272),
 (':', 1021.7472057019033),
 ('good', 1118.9832711769616),
 ('for', 1160.4768050539485),
 ('?', 1353.1409817457416),
 ('best', 1529.93390754862),
 ('programming', 1640.3303231705308),
 ('what', 2540.445275838392)]

In [104]:
transformer = TfidfTransformer()
small_title_tr_tfidf = transformer.fit_transform(title_tr_mat)
small_title_test_tfidf = transformer.fit_transform(title_test_mat)

##Код

In [105]:
vocabulary, best_words = choose_vocabulary(x_tr.SplittedCode[y_tr == 1], x_tr.SplittedCode[y_tr == 0], 200)
code_tr_mat = text_to_sparse_mat(x_tr.SplittedCode, vocabulary)
code_test_mat = text_to_sparse_mat(x_test.SplittedCode, vocabulary, shape=(x_test.shape[0], len(vocabulary)))
best_words[-10:]

[('q', 1319.2109796448833),
 ('cout', 1359.636836483087),
 ('self', 1672.4821050780379),
 ('int', 2046.4903512333594),
 ('if', 2123.5821365560973),
 ('forced', 2597.0031314770263),
 ('id', 2644.7648953738685),
 ('n', 2908.137290794357),
 ('j', 3037.51228453655),
 ('i', 5021.5707538878605)]

In [106]:
transformer = TfidfTransformer()
small_code_tr_tfidf = transformer.fit_transform(code_tr_mat)
small_code_test_tfidf = transformer.fit_transform(code_test_mat)

##2-граммы

In [107]:
vocabulary, best_words = choose_bigram_vocabulary(x_tr.PureText[y_tr == 1], x_tr.PureText[y_tr == 0], 200000)
bigrams_tr_mat = text_to_sparse_mat_bigrams(x_tr.PureText, vocabulary, shape=(len(y_tr), len(vocabulary)))
bigrams_test_mat = text_to_sparse_mat_bigrams(x_test.PureText, vocabulary, shape=(x_test.shape[0], len(vocabulary)))
best_words[-10:]

[(('help', 'me'), 1203.470177667406),
 (('to', 'the'), 1222.446196689731),
 (('like', 'this'), 1238.7950204241063),
 (('looking', 'for'), 1269.1847261387527),
 (('I', 'have'), 1383.6003763338167),
 (('I', 'get'), 1384.891810621553),
 (('way', 'to'), 1551.5879335273673),
 (('the', 'following'), 1676.5245397444996),
 (('in', 'the'), 1738.88276968588),
 (('to', 'learn'), 2344.909982748621)]

In [108]:
transformer = TfidfTransformer()
big_bigrams_tr_tfidf = transformer.fit_transform(bigrams_tr_mat)
big_bigrams_test_tfidf = transformer.fit_transform(bigrams_test_mat)

In [109]:
vocabulary, best_words = choose_bigram_vocabulary(x_tr.PureText[y_tr == 1], x_tr.PureText[y_tr == 0], 500)
bigrams_tr_mat = text_to_sparse_mat_bigrams(x_tr.PureText, vocabulary, shape=(len(y_tr), len(vocabulary)))
bigrams_test_mat = text_to_sparse_mat_bigrams(x_test.PureText, vocabulary, shape=(x_test.shape[0], len(vocabulary)))
best_words[-10:]

[(('help', 'me'), 1203.470177667406),
 (('to', 'the'), 1222.446196689731),
 (('like', 'this'), 1238.7950204241063),
 (('looking', 'for'), 1269.1847261387527),
 (('I', 'have'), 1383.6003763338167),
 (('I', 'get'), 1384.891810621553),
 (('way', 'to'), 1551.5879335273673),
 (('the', 'following'), 1676.5245397444996),
 (('in', 'the'), 1738.88276968588),
 (('to', 'learn'), 2344.909982748621)]

In [110]:
transformer = TfidfTransformer()
small_bigrams_tr_tfidf = transformer.fit_transform(bigrams_tr_mat)
small_bigrams_test_tfidf = transformer.fit_transform(bigrams_test_mat)

In [111]:
np.sum(np.isnan(x_test['Questions_ratio']))

0

#Логистическая Регрессия

In [112]:
cols = ['CodeLen', 'NumWords', 'NumSentences', 'Questions_ratio',  'NonWords', 'Digits',
             'NonAlNums', 'EmptyTags', 'NumLastQuestions', 'ReputationAtPostCreation', #'DaysBeforePost',
             'OwnerUndeletedAnswerCountAtPostTime']
cols_norm = cols
cols_log = cols

x_train_2 = np.array(norm_scaler(x_tr[cols_norm[0]]))[:, None]
x_test_2 = np.array(norm_scaler(x_test[cols_norm[0]]))[:, None]
for col in cols_norm[1:]:
    x_train_2 = np.hstack((x_train_2, np.array(norm_scaler(x_tr[col]))[:, None]))
    x_test_2 = np.hstack((x_test_2, np.array(norm_scaler(x_test[col]))[:, None]))
for col in cols_log:
    x_train_2 = np.hstack((x_train_2, np.array(log_scaler(x_tr[col]))[:, None]))
    x_test_2 = np.hstack((x_test_2, np.array(log_scaler(x_test[col]))[:, None]))
print(np.sum(np.isnan(x_test_2)))

0


In [113]:
x_train_2 = sps.csr_matrix(sps.hstack((x_train_2, tag_tr_tfidf, title_tr_tfidf, text_tr_tfidf, code_tr_tfidf, 
                                       big_bigrams_tr_tfidf, bititle_tr)))
x_test_2 = sps.csr_matrix(sps.hstack((x_test_2, tag_test_tfidf, title_test_tfidf, text_test_tfidf, code_test_tfidf, 
                                      big_bigrams_test_tfidf, bititle_test)))

In [114]:
x_test_2.shape

(118172, 998050)

In [115]:
clf_linear = LogisticRegression()
clf_linear.fit(x_train_2, y_tr)
predicted_y_test = clf_linear.predict_proba(x_test_2)
# print(roc_auc_score(y_test, predicted_y_test[:, 1]))


In [116]:
predicted_y_test.shape

(118172, 2)

In [118]:
sample_submission = pd.read_csv('sample-submission-second.csv')

In [119]:
sample_submission.head()

Unnamed: 0,PostId,OpenStatus
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


In [122]:
answers = test_data.copy()

In [127]:
answers['OpenStatus'] = predicted_y_test[:, 1]

In [134]:
answers.to_csv("answers.csv", columns=['PostId', 'OpenStatus'], index=None)

In [135]:
pd.read_csv('answers.csv').head()

Unnamed: 0,PostId,OpenStatus
0,1,0.898929
1,2,0.84885
2,3,0.765294
3,4,0.697102
4,5,0.626402


#Попробуем логистическую регрессию только по тексту

In [93]:
cols = ['CodeLen', 'NumWords', 'NumSentences', 'Questions_ratio',  'NonWords', 'Digits', 
             'NonAlNums', 'EmptyTags', 'NumLastQuestions']
cols_norm = cols
cols_log = cols

x_train_small = np.array(norm_scaler(x_tr[cols_norm[0]]))[:, None]
x_test_small = np.array(norm_scaler(x_test[cols_norm[0]]))[:, None]
for col in cols_norm[1:]:
    x_train_small = np.hstack((x_train_small, np.array(norm_scaler(x_tr[col]))[:, None]))
    x_test_small = np.hstack((x_test_small, np.array(norm_scaler(x_test[col]))[:, None]))
for col in cols_log:
    x_train_small = np.hstack((x_train_small, np.array(log_scaler(x_tr[col]))[:, None]))
    x_test_small = np.hstack((x_test_small, np.array(log_scaler(x_test[col]))[:, None]))
print(np.sum(np.isnan(x_test_small)))

0


In [94]:
x_train_small = sps.csr_matrix(sps.hstack((x_train_small, small_tag_tr_tfidf, small_title_tr_tfidf, small_text_tr_tfidf, 
                                       small_code_tr_tfidf, 
                                       small_bigrams_tr_tfidf)))
x_test_small = sps.csr_matrix(sps.hstack((x_test_small, small_tag_test_tfidf, small_title_test_tfidf, small_text_test_tfidf, 
                                      small_code_test_tfidf, 
                                      small_bigrams_test_tfidf)))

In [95]:
x_test_small.shape

(30000, 2318)

In [96]:
clf_linear = LogisticRegression()
clf_linear.fit(x_train_small, y_tr)
text_y_test = clf_linear.predict_proba(x_test_small)
print(roc_auc_score(y_test, text_y_test[:, 1]))

0.834042481021


In [67]:
rf = RandomForestClassifier(n_estimators=700, n_jobs=-1)
rf.fit(x_train_small, y_tr)
start = time.time()
rf_y_test = rf.predict_proba(x_test_small)
print(roc_auc_score(y_test, rf_y_test[:, 1]))
print(time.time() - start)

0.809514552368
3.766116142272949


In [68]:
for w in np.linspace(0, 1, 10):
    print(roc_auc_score(y_test, w * rf_y_test[:, 1] + (1 - w) * predicted_y_test[:, 1]))

0.851532999541
0.852873288138
0.853531964642
0.853306899873
0.851878764483
0.8488042845
0.843587614766
0.835575747674
0.824305613667
0.809514552368


#Исследование ошибок

In [46]:
def plot_results(prediction):
    reasons_lst = ['open', 'not a real question', 'off topic', 'not constructive', 'too localized']
    sizes = [len(x_test[(y_test != np.round(prediction[:, 1])) & 
                        (x_test.Reason == reason)])/len(y_test[x_test.Reason == reason])
             for reason in reasons_lst]
    for i in range(5):
        print(reasons_lst[i],':', sizes[i])

In [47]:
plot_results(predicted_y_test)

open : 0.23026575191230975
not a real question : 0.2282088122605364
off topic : 0.21756487025948104
not constructive : 0.23104227170054856
too localized : 0.22473118279569892


In [44]:
noquest = x_test[(y_test != np.round(predicted_y_test[:, 1])) & (x_test.Reason == 'not constructive')] 
noquest.index = range(len(noquest))
for i in range(10, 20): 
    print((predicted_y_test[(y_test != np.round(predicted_y_test[:, 1])) & (x_test.Reason == 'not constructive')])[i])
    print(i)
    print(noquest.ReputationAtPostCreation[i], noquest.DaysBeforePost[i])
    print(' '.join(noquest.Text[i]))
    print(noquest.Code[i])

[ 0.62586858  0.37413142]
10
1384 324.0
I 'm tearing my hair out . Apparently you ca n't just do something like I 've searched the internet and I ca n't find a simple example of how to implement the damn thing ANYWHERE . Even MSDN . It 's ridiculous .
    class Ranch<T> : IEnumerable<T>     {        // Mad code         IEnumerator<T> GetEnumerator()         {             // Hectic implementation details         }        // More mad code     }
[ 0.62586858  0.37413142]
11
3 2.0
I searched a lot but could n't find the way to measure web page loading time with iOS . In the app , I want to show certain page loading time.. Is it possible with iOS sdk or third party sdk ? Thanks

[ 0.62586858  0.37413142]
12
97 318.0
I have downloaded 'Adobe Flash builder for Force.com ' from Salesforce site and trying to get some hands on . If any one has tried it before request you to please share your experience . Is there seom kind of good tutorial available that shows step by step procedure to 



In [50]:
len(x_test[(y_test != np.round(predicted_y_test[:, 1])) & (y_test != np.round(text_y_test[:, 1]))
           & (x_test.Reason == 'not constructive')])

547

In [49]:
len(x_test[(y_test != np.round(text_y_test[:, 1])) & (x_test.Reason == 'not constructive')])

733