In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [11]:
import sklearn as skl
import pandas as pd

from sklearn import datasets
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

from scipy.sparse import hstack

import time
import datetime
import math

In [12]:
# Display configuration: widen numpy and pandas output so long text columns and
# wide frames print without being cut off.
np.set_printoptions(edgeitems=10, linewidth=140)
pd.set_option('display.max_colwidth', 10000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [13]:
# Read the training set (cp1251-encoded CSV) into `data`.
# NOTE(review): none of the dtype keys below (row_id, x, y, accuracy, time,
# place_id) is referenced by any later cell of this notebook — confirm the
# mapping is still wanted.
data = pd.read_csv(
    'train.csv',
    encoding='cp1251',
    dtype={
        'row_id': np.int32,
        'x': np.float32,
        'y': np.float32,
        'accuracy': np.int32,
        'time': np.int32,
        'place_id': np.int64,
    },
)

In [14]:
# Work on the first 37000 training rows and drop the identifier/date columns
# that are not used as features.
q = data.head(37000)
q = q.drop(['PostId', 'PostCreationDate', 'OwnerUserId', 'PostClosedDate'], axis=1)


def _owner_date_to_epoch(date):
    """Return `date` as epoch seconds, or unchanged if its layout is unknown.

    Strings containing a '/' after the first character are parsed as
    "%m/%d/%Y %H:%M:%S"; strings whose text before the first '-' is longer
    than two characters are parsed as "%Y-%m-%d".  The conversion goes
    through time.mktime, so the value is interpreted as local time.
    """
    if date.find('/') > 0:
        layout = "%m/%d/%Y %H:%M:%S"
    elif len(date.split('-')[0]) > 2:
        layout = "%Y-%m-%d"
    else:
        return date
    return time.mktime(datetime.datetime.strptime(date, layout).timetuple())


# Convert the whole column in one assignment.  This replaces the per-row
# DataFrame.get_value/set_value calls (removed from pandas) and no longer
# assumes the index labels are 0..n-1.
q['OwnerCreationDate'] = [_owner_date_to_epoch(d) for d in q['OwnerCreationDate']]

## Merge tags and title into body

In [15]:
# Fold the title and the five tag columns into BodyMarkdown so a single text
# field feeds the vectorizer.
tags = ['Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5']

q['BodyMarkdown'] = q.BodyMarkdown.astype(str).str.cat(q.Title.astype(str), sep=' ')
q = q.drop('Title', axis=1)

# Missing values become a single space so the string concatenation below never
# meets NaN.  Note that this fills every column of q, not only the tags.
q = q.fillna(' ')

# Space-join the tag columns, driven by the `tags` list instead of a
# hand-written chain, and without a temporary 'Tags' column.
tag_text = q[tags[0]]
for tag in tags[1:]:
    tag_text = tag_text + ' ' + q[tag]

q['BodyMarkdown'] = q.BodyMarkdown.astype(str).str.cat(tag_text.astype(str), sep=' ')
q = q.drop(tags, axis=1)

In [16]:
# Bag-of-words counts over the merged text, then TF-IDF re-weighting.
# The stop-word setting is passed to the constructor rather than being
# assigned onto the instance after construction.
train = q['BodyMarkdown']
count_vect = CountVectorizer(stop_words='english')
X = count_vect.fit_transform(train)
tfidf_transformer = TfidfTransformer()
X = tfidf_transformer.fit_transform(X)

In [17]:
# Target labels as an array.
y = array(list(q['OpenStatus']))

# Every remaining non-text, non-target column as a plain array.
# `.values` replaces DataFrame.as_matrix(), which was removed from pandas, and
# the drop axis is spelled out because the positional form is no longer accepted.
new = q.drop(['BodyMarkdown', 'OpenStatus'], axis=1).values

In [18]:
# Fit a min-max scaler on the non-text columns, rescale them, and place them
# to the left of the TF-IDF matrix to form the combined feature matrix `c`.
scaler = MinMaxScaler()
scaler.fit(new)
new = scaler.transform(new)
c = hstack((new, X))

## Logistic regression

In [19]:
# Grid-search C for an L1-penalised liblinear logistic regression on the
# combined feature matrix `c`; every other parameter is pinned to one value.
param_grid = {
    'C': arange(1, 3, 0.1),
    'penalty': ['l1'],
    'solver': ['liblinear'],
    'warm_start': [True],
    'random_state': [42],
    'verbose': [1],
}
clf = GridSearchCV(LogisticRegression(), param_grid, n_jobs=-1, refit=True)
clf.fit(c, y)

# Best cross-validated score and the parameters that produced it.
clf.best_score_, clf.best_params_

[LibLinear]

(0.7169189189189189,
 {'C': 1.7000000000000006,
  'penalty': 'l1',
  'random_state': 42,
  'solver': 'liblinear',
  'verbose': 1,
  'warm_start': True})

## Multinomial Naive Bayes

In [19]:
# Grid-search alpha and fit_prior for a multinomial naive Bayes model on the
# TF-IDF text features `X` only.
# NOTE(review): the alpha grid starts at exactly 0 — confirm that is intended.
param_grid = {
    'fit_prior': [True, False],
    'alpha': arange(0, 1, 0.001),
}
clf = GridSearchCV(MultinomialNB(), param_grid, n_jobs=-1, refit=True)
clf.fit(X, y)

# Best cross-validated score and the parameters that produced it.
clf.best_score_, clf.best_params_

(0.68193333333333328, {'alpha': 0.068000000000000005, 'fit_prior': True})

In [105]:
# Read the test set.  `origin` and `test_data` start out as the same object;
# `origin` is what the submission cell later reads PostId from.
origin = pd.read_csv('test.csv', encoding='cp1251')
test_data = origin

In [107]:
# Drop the identifier/date columns that are not used as features.
test_data = test_data.drop(['PostId', 'PostCreationDate', 'OwnerUserId'], axis=1)


def _owner_date_to_epoch(date):
    """Return `date` as epoch seconds, or unchanged if its layout is unknown.

    Strings containing a '/' after the first character are parsed as
    "%m/%d/%Y %H:%M:%S"; strings whose text before the first '-' is longer
    than two characters are parsed as "%Y-%m-%d".  The conversion goes
    through time.mktime, so the value is interpreted as local time.
    """
    if date.find('/') > 0:
        layout = "%m/%d/%Y %H:%M:%S"
    elif len(date.split('-')[0]) > 2:
        layout = "%Y-%m-%d"
    else:
        return date
    return time.mktime(datetime.datetime.strptime(date, layout).timetuple())


# Convert every row of the test frame.  The earlier loop was bounded by the
# length of the training frame `q`, so it processed the wrong number of rows
# whenever the two frames differed in size; it also relied on
# DataFrame.get_value/set_value, which were removed from pandas.
test_data['OwnerCreationDate'] = [
    _owner_date_to_epoch(d) for d in test_data['OwnerCreationDate']
]
        
# Fold the title and the five tag columns into BodyMarkdown, mirroring the
# preparation applied to the training frame.
tags = ['Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5']

test_data['BodyMarkdown'] = test_data.BodyMarkdown.astype(str).str.cat(test_data.Title.astype(str), sep=' ')
test_data = test_data.drop('Title', axis=1)

# Missing values become a single space so the string concatenation below never
# meets NaN.  Note that this fills every column, not only the tags.
test_data = test_data.fillna(' ')

# Space-join the tag columns, driven by the `tags` list instead of a
# hand-written chain, and without a temporary 'Tags' column.
tag_text = test_data[tags[0]]
for tag in tags[1:]:
    tag_text = tag_text + ' ' + test_data[tag]

test_data['BodyMarkdown'] = test_data.BodyMarkdown.astype(str).str.cat(tag_text.astype(str), sep=' ')
test_data = test_data.drop(tags, axis=1)

# Vectorize the test text with the vectorizer and TF-IDF transformer that were
# already fitted on the training text (transform only, no refit).
# The series gets its own name so the training series `train` is not overwritten.
test_text = test_data['BodyMarkdown']
X_test = tfidf_transformer.transform(count_vect.transform(test_text))

# Non-text test columns as a plain array (`.values` replaces the removed
# DataFrame.as_matrix()).
new_test = test_data.drop(['BodyMarkdown'], axis=1).values

# Reuse the scaler fitted on the training rows instead of fitting a fresh one
# on the test rows, so both sets are scaled with the same parameters.
# Assumes the test frame carries the same non-text columns, in the same order,
# as the training frame — TODO confirm.
new_test = scaler.transform(new_test)

# Stack with the *test* TF-IDF matrix (the earlier code stacked the training
# matrix X here) under separate names, so the training-side `new` and `c` are
# left intact.
# NOTE(review): nothing later in this cell reads the combined test matrix; the
# final model is fitted and applied on the text features alone.
c_test = hstack([new_test, X_test])

# Final model: liblinear logistic regression fitted on the training TF-IDF
# features `X` and applied to the test TF-IDF features `X_test`.
# NOTE(review): these settings (l2 penalty, C=2.1) differ from the best
# parameters reported by the grid-search output earlier in the notebook
# (l1 penalty, C of about 1.7) — confirm this is deliberate.
clf = LogisticRegression(
    penalty='l2',
    C=2.1,
    dual=True,
    solver='liblinear',
    warm_start=True,
    random_state=42,
    verbose=1,
)
clf.fit(X, y)
predicted = clf.predict(X_test)



[LibLinear]

In [109]:
# Pair each PostId from the untouched test frame with its predicted label
# (joined on the row index) and write the result to submission.csv.
post_ids = pd.DataFrame(origin, columns=['PostId'])
open_status = pd.DataFrame(predicted, columns=['OpenStatus'])
results = post_ids.join(open_status)
results.to_csv('submission.csv', index=False)

In [110]:
# Preview only the first rows instead of rendering the entire submission frame.
results.head(10)

Unnamed: 0,PostId,OpenStatus
0,779052,0
1,3351926,4
2,2333077,0
3,1186402,0
4,589152,0
5,1242891,0
6,924451,0
7,47528,0
8,225881,0
9,2970394,2
