## Fetch articles

* poems : fetched poems from search with "poem" tag - http://trove.nla.gov.au/newspaper/result?l-publictag=poem&q&s=20 (get_poem.py), saved in /data/poems-201604022338.csv
* other article : fetched others in random from 'http://trove.nla.gov.au/newspaper/article/11XXX999', saved in data/others-201604030052.csv


## Read poem

In [49]:
import numpy as np
import pandas as pd

poems = pd.read_csv('../data/poem-austlit-20160412.csv')
poems.head()
poems.shape

(500, 10)

## generate signals(features)

convert lines, words, x, y, w, h in html input tag into singals

In [50]:
def count_line(contents):
    if not contents: 
        return 0
    lines = contents.split('\n')
    return len(lines)

def mean_word(contents):
    if not contents: 
        return 0
    lines = contents.split('\n')
    words = [len(line.split(' ')) for line in lines]
    return np.mean(words)

def total_word(contents):
    if not contents:
        return 0
    lines = contents.split('\n')
    words = [len(line.split(' ')) for line in lines]
    return sum(words)

def std_w(data_w):
    if not data_w:
        return 0
    ws = map(int, data_w.split(','))
    return np.std(ws)

def mean_y(data_y):
    if not data_y:
        return 0
    ys = map(int, data_y.split(','))
    return np.mean([abs(ys[1:][i] - ys[:-1][i]) for i in range(0, len(ys[1:]))])

def mean_h(data_h):
    if not data_h:
        return 0
    hs = map(int, data_h.split(','))
    return np.mean(hs)

def mean_x(data_x):
    if not data_x:
        return 0
    xs = map(int, data_x.split(','))
    x_min = np.min(xs)
    return np.mean(map(lambda x:x-x_min, xs))

def get_page(page):
    return int(page.replace('Page', '').strip())

In [51]:
def convert_feature(df, target=1):
    df['count_line'] = df['content'].apply(count_line)
    df['mean_word'] = df['content'].apply(mean_word)
    df['total_word'] = df['content'].apply(total_word)
    df['std_w'] = df['data_w'].apply(std_w)
    df['mean_y'] = df['data_y'].apply(mean_y)
    df['mean_h'] = df['data_h'].apply(mean_h)
    df['mean_x'] = df['data_x'].apply(mean_x)
    df['page_num'] = df['page'].apply(get_page)
    df['target'] = target
    return df


In [52]:
convert_feature(poems)
print poem.shape
poems.head()

Unnamed: 0,title,data_h,content,newspaper,data_w,date,data_y,data_x,article_id,page,count_line,mean_word,total_word,std_w,mean_y,mean_h,mean_x,page_num,target
0,Poets Corner,"144,308,78,30,42,36,36,39,37,40,38,36,38,38,39...",^^^i\ne^5\nLu ? r\nMAD POETS.\nf1' Some people...,The Central Queensland Herald,"349,576,451,249,669,713,474,588,476,489,390,65...",Thu 2 Jan 1936,"788,484,740,1048,1099,1139,1180,1232,1272,1310...","264,912,912,284,96,50,53,51,89,53,93,54,94,54,...",70348715,Page 8,48,5.729167,275,108.055636,59.340426,46.583333,87.979167,8,1
1,CORRESPONDENCE,"61,61,48,39,32,43,39,41,38,39,33,39,36,40,39,4...",CORRESPONDENCE\nVALE OPOSSUM^; :' ' ;':\n(To t...,The Central Queensland Herald,"505,454,370,679,439,679,498,679,255,677,411,67...",Thu 23 Jul 1931,"3185,3333,3455,3532,3573,3609,3652,3690,3730,3...","1195,1278,1294,1122,1197,1124,1160,1121,1196,1...",70280540,Page 47,45,4.711111,212,165.352278,49.795455,39.355556,76.6,47,1
2,The Worshipper.,"55,37,40,32,34,30,30,35,35,35,28,30,34,30,32,2...",The Worshipper.\nI WOULD not nurse a lovely th...,The Brisbane Courier,"355,621,164,438,618,257,427,614,583,616,166,46...",Sat 4 Jan 1930,"5315,5462,5486,5523,5553,5584,5613,5670,5700,5...","535,396,396,430,397,492,427,398,429,399,492,42...",21502619,Page 20,45,4.844444,218,185.748929,37.045455,32.333333,43.2,20,1
3,THE LOST MERCHANT.,"41,56,35,29,28,26,27,26,27,25,26,25,27,26,25,2...","TUB LOST MKRCHAXT. I\nOa "" HOPES THAT ARE BRIG...",Empire,"795,1006,927,896,890,519,605,561,602,522,570,5...",Sat 30 Jun 1860,"1243,1286,1343,1373,1397,1424,1448,1472,1495,1...","5000,4789,4869,4869,4871,4871,4873,4869,4870,4...",60412555,Page 4,60,7.816667,469,126.620969,25.152542,27.116667,93.2,4,1
4,ENIGMA.—No. IV.,"32,38,36,37,38,32,38,38,37,35,38,39,41,37,33,3...",ENIGMA.— No. IV.\nI liave a dear partner for l...,Bathurst Free Press,"320,471,556,591,650,412,570,577,687,443,550,55...",Sat 21 Sep 1850,"3425,3491,3522,3554,3584,3653,3682,3714,3746,3...","633,423,458,425,458,420,450,424,458,422,457,41...",62215013,Page 6,20,6.35,127,118.589744,42.157895,38.4,100.95,6,1


## Read other articles & generate signals(features)

In [53]:
# others
others = pd.read_csv('../data/others-201604120925.csv')
convert_feature(others, target=0)
others.shape

(493, 19)

## Fetch signals(features) before applying algorithm

In [54]:
features = ['count_line', 'mean_word', 'total_word', 'std_w', 'mean_y', 'mean_h', 'mean_x', 'page_num']

In [55]:
other_x = others[features].values

In [56]:
other_x[other_x > 1000] = 1000
other_x[np.isnan(other_x)] = 0

In [57]:
poem_x = poems[features].values

In [58]:
X = np.concatenate((other_x, poem_x), axis=0)

In [59]:
other_y = others[['target']].values
poem_y = poems[['target']].values
y = np.concatenate((other_y, poem_y), axis=0)

In [62]:
y = y.flatten()
print X.shape, y.shape

(993L, 8L) (993L,)


In [63]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cross_validation import cross_val_score, KFold
from sklearn import decomposition
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

## Applying different algoriths with only newspaper format signal to check  accurcy

### 1. Applying SVC Classifier(Support Vector Machine)

In [64]:
svc = SGDClassifier()
svc.fit(X, y)
scores = cross_val_score(svc, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.58 (+/- 0.13)


### 2. Appliying Random Forest Classifier

In [65]:
forest = RandomForestClassifier()
forest.fit(X, y)
scores = cross_val_score(forest, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.91 (+/- 0.01)


### 3. Appliying Logistic Regression Classifier

In [66]:
logistic = LogisticRegression()
logistic.fit(X, y)
scores = cross_val_score(logistic, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.80 (+/- 0.06)


### 4. Appliying Naive Bayes Classifier

In [67]:
nb = MultinomialNB()
nb.fit(X, y)
scores = cross_val_score(nb, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.64 (+/- 0.09)


## Find optimization in Random Forest Classifier which is the best classifier

In [68]:
clf = Pipeline([#('pca', PCA()),
                    ('clf', RandomForestClassifier()),
                ])

parameters = {#'pca__n_components': (3, 4, 5, 6, 7),
                'clf__n_estimators': (5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70), 
                'clf__criterion': ('gini', 'entropy'),
                'clf__max_features': ('auto', 'sqrt', 'log2')
             }    

gs_clf = GridSearchCV(clf, parameters, cv=5)
gs_clf.fit(X, y)

best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
print('Score : ', score)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))


('Score : ', 0.93353474320241692)
clf__criterion: 'gini'
clf__max_features: 'auto'
clf__n_estimators: 70


## Find important singals  

In [69]:
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt', criterion='gini')
clf.fit(X, y)

print sorted(zip(map(lambda x: round(x, 4), clf.feature_importances_), features), reverse=True)

[(0.2245, 'page_num'), (0.1505, 'mean_y'), (0.1357, 'mean_h'), (0.1252, 'mean_word'), (0.1182, 'total_word'), (0.0954, 'count_line'), (0.0934, 'mean_x'), (0.0573, 'std_w')]


#### Best signals

1. mean word per line
1. newspaper page number
1. indent x

### Apply Naive Bayes algorithm on poem and other articles

In [70]:
X = np.concatenate((poems['content'].values, others['content'].values), axis=0)

vectorizer = CountVectorizer(max_df=10, min_df=1)
X = vectorizer.fit_transform(X)
X = TfidfTransformer().fit_transform(X)
  

In [71]:
nb = MultinomialNB()
nb.fit(X, y)

scores = cross_val_score(nb, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.91 (+/- 0.03)


In [76]:
clf = Pipeline([
#                 ('vect', CountVectorizer()),
#                 ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB()),
            ])
    
parameters = {
#                 'tfidf__use_idf': (True, False),
                'clf__alpha': (0,1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4), 
                'clf__fit_prior':(True, False)}    
    
    
gs_clf = GridSearchCV(clf, parameters, cv=5)

gs_clf.fit(X, y)

best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
print('Score : ', score)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

('Score : ', 0.91339375629405839)
clf__alpha: 1
clf__fit_prior: True


### Combine the two best classifiers(RandomForest classifier on newspaper format and Naive Bayes on Content)

In [77]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators

import numpy as np
import operator
    
    
    

In [78]:
class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, classifiers, weights=None):
        self.classifiers = classifiers
        self.named_classifiers = {key: value for key, value in _name_estimators(classifiers)}
        print(self.named_classifiers)
        self.weights = weights
    
    def fit(self, X, y):
        self.classifiers_ = []
        for clf in self.classifiers:
            
            fitted_clf = clone(clf).fit(X, y)
            self.classifiers_.append(fitted_clf)
        return self

    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)
    
    def predict_proba(self, X):
        probas = np.asarray([cls.predict_proba(X) for clf in self.classifiers_])
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba
    
    def get_params(self, deep=True):
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep=False)
        else:
            out = self.named_classifiers.copy()
            for name, step in six.iteritems(self.named_classifiers):
                for key, value in six.iteritems(step.get_params(deep=True)):
                    out['%s__%s' % (name, key)] = value
            return out