## Fetch articles

* poems : fetched poems from search with "poem" tag - http://trove.nla.gov.au/newspaper/result?l-publictag=poem&q&s=20 (get_poem.py), saved in /data/poems-201604022338.csv
* other article : fetched others in random from 'http://trove.nla.gov.au/newspaper/article/11XXX999', saved in data/others-201604030052.csv


## Read poem

In [1]:
import numpy as np
import pandas as pd

poems = pd.read_csv('../data/poems-201604022338.csv')
poems.head()

Unnamed: 0,title,data_h,content,newspaper,data_w,date,data_y,data_x,article_id,page
0,WANDERING THOUGHTS.,"75,35,33,36,34,35,33,35,34,34,39,34,34,27,34,3...",WANDERING THOUGHTS. [BY HENRY HALLORAN.]\nI 0]...,The Sydney Morning Herald,"749,476,226,735,710,747,736,577,617,570,701,66...",Fri 4 Feb 1876,"3352,3470,3504,3536,3569,3603,3636,3668,3701,3...","3919,3987,4054,3987,3987,3988,3988,3988,3988,3...",13364429,Page 3
1,SONNET.,"26,24,30,33,28,31,26,33,34,34,31,28,33,29,32,3...","SONNET.\nBY HENRY HALLORAN.\nI"" Colonel Baker ...",The Sydney Morning Herald,"155,321,602,567,480,577,475,578,544,547,692,52...",Mon 17 Sep 1877,"5629,5693,5725,5757,5788,5815,5845,5873,5903,5...","4664,4580,4438,4379,4381,4378,4380,4379,4379,4...",13399991,Page 2
2,FOR EVER.,"28,33,30,31,35,30,28,26,35,29,29,34,33,28,32,3...",FOR EVER.\nBy Henry Kendall.\nOvT of the Body ...,The Sydney Morning Herald,"208,346,338,447,488,380,416,360,498,437,411,44...",Tue 15 Dec 1863,"1107,1171,1215,1241,1268,1294,1348,1375,1401,1...","2562,2487,2401,2426,2400,2427,2400,2428,2399,2...",13091165,Page 8
3,WOOLLI CREEK.,"67,37,31,31,42,36,31,29,26,37,47,26,29,31,31,3...","WOOLLI CREEK.\nBy Henry Kendall.\nOne I see, w...",The Sydney Morning Herald,"863,354,367,432,396,456,479,367,440,411,494,35...",Mon 19 Dec 1864,"129,204,244,270,287,322,369,397,422,449,478,52...","4769,5055,4934,4960,4932,4958,4930,4960,4932,4...",13110473,Page 3
4,ELLEN [?]AY,"24,14,34,40,35,45,33,41,34,40,47,39,41,42,43,5...",ELLEN RAY.\nA quiet song for Ellen--\nTho pati...,The Sydney Morning Herald,"232,245,312,329,373,314,348,341,363,532,486,32...",Tue 10 Jan 1865,"5740,5758,5772,5802,5833,5865,5894,5926,5957,5...","518,516,413,443,412,442,412,441,410,442,410,44...",13111902,Page 3


## generate signals(features)

convert lines, words, x, y, w, h in html input tag into singals

In [2]:
def count_line(contents):
    if not contents: 
        return 0
    lines = contents.split('\n')
    return len(lines)

def mean_word(contents):
    if not contents: 
        return 0
    lines = contents.split('\n')
    words = [len(line.split(' ')) for line in lines]
    return np.mean(words)

def total_word(contents):
    if not contents:
        return 0
    lines = contents.split('\n')
    words = [len(line.split(' ')) for line in lines]
    return sum(words)

def std_w(data_w):
    if not data_w:
        return 0
    ws = map(int, data_w.split(','))
    return np.std(ws)

def mean_y(data_y):
    if not data_y:
        return 0
    ys = map(int, data_y.split(','))
    return np.mean([abs(ys[1:][i] - ys[:-1][i]) for i in range(0, len(ys[1:]))])

def mean_h(data_h):
    if not data_h:
        return 0
    hs = map(int, data_h.split(','))
    return np.mean(hs)

def mean_x(data_x):
    if not data_x:
        return 0
    xs = map(int, data_x.split(','))
    x_min = np.min(xs)
    return np.mean(map(lambda x:x-x_min, xs))

def get_page(page):
    return int(page.replace('Page', '').strip())

In [3]:
def convert_feature(df, target=1):
    df['count_line'] = df['content'].apply(count_line)
    df['mean_word'] = df['content'].apply(mean_word)
    df['total_word'] = df['content'].apply(total_word)
    df['std_w'] = df['data_w'].apply(std_w)
    df['mean_y'] = df['data_y'].apply(mean_y)
    df['mean_h'] = df['data_h'].apply(mean_h)
    df['mean_x'] = df['data_x'].apply(mean_x)
    df['page_num'] = df['page'].apply(get_page)
    df['target'] = target
    return df


In [4]:
convert_feature(poems)
poems.head()

Unnamed: 0,title,data_h,content,newspaper,data_w,date,data_y,data_x,article_id,page,count_line,mean_word,total_word,std_w,mean_y,mean_h,mean_x,page_num,target
0,WANDERING THOUGHTS.,"75,35,33,36,34,35,33,35,34,34,39,34,34,27,34,3...",WANDERING THOUGHTS. [BY HENRY HALLORAN.]\nI 0]...,The Sydney Morning Herald,"749,476,226,735,710,747,736,577,617,570,701,66...",Fri 4 Feb 1876,"3352,3470,3504,3536,3569,3603,3636,3668,3701,3...","3919,3987,4054,3987,3987,3988,3988,3988,3988,3...",13364429,Page 3,76,9.118421,693,106.436665,35.106667,36.013158,72.184211,3,1
1,SONNET.,"26,24,30,33,28,31,26,33,34,34,31,28,33,29,32,3...","SONNET.\nBY HENRY HALLORAN.\nI"" Colonel Baker ...",The Sydney Morning Herald,"155,321,602,567,480,577,475,578,544,547,692,52...",Mon 17 Sep 1877,"5629,5693,5725,5757,5788,5815,5845,5873,5903,5...","4664,4580,4438,4379,4381,4378,4380,4379,4379,4...",13399991,Page 2,18,7.111111,128,137.12142,31.470588,30.333333,57.5,2,1
2,FOR EVER.,"28,33,30,31,35,30,28,26,35,29,29,34,33,28,32,3...",FOR EVER.\nBy Henry Kendall.\nOvT of the Body ...,The Sydney Morning Herald,"208,346,338,447,488,380,416,360,498,437,411,44...",Tue 15 Dec 1863,"1107,1171,1215,1241,1268,1294,1348,1375,1401,1...","2562,2487,2401,2426,2400,2427,2400,2428,2399,2...",13091165,Page 8,70,6.614286,463,84.350556,33.623188,30.428571,194.7,8,1
3,WOOLLI CREEK.,"67,37,31,31,42,36,31,29,26,37,47,26,29,31,31,3...","WOOLLI CREEK.\nBy Henry Kendall.\nOne I see, w...",The Sydney Morning Herald,"863,354,367,432,396,456,479,367,440,411,494,35...",Mon 19 Dec 1864,"129,204,244,270,287,322,369,397,422,449,478,52...","4769,5055,4934,4960,4932,4958,4930,4960,4932,4...",13110473,Page 3,195,7.394872,1442,263.602731,34.865979,35.420513,108.235897,3,1
4,ELLEN [?]AY,"24,14,34,40,35,45,33,41,34,40,47,39,41,42,43,5...",ELLEN RAY.\nA quiet song for Ellen--\nTho pati...,The Sydney Morning Herald,"232,245,312,329,373,314,348,341,363,532,486,32...",Tue 10 Jan 1865,"5740,5758,5772,5802,5833,5865,5894,5926,5957,5...","518,516,413,443,412,442,412,441,410,442,410,44...",13111902,Page 3,38,5.315789,202,170.834595,32.526316,41.230769,164.666667,3,1


## Read other articles & generate signals(features)

In [6]:
others = pd.read_csv('../data/others-201604030052.csv')
# others
convert_feature(others, target=0)
others.head()

Unnamed: 0,title,data_h,content,newspaper,data_w,date,data_y,data_x,article_id,page,count_line,mean_word,total_word,std_w,mean_y,mean_h,mean_x,page_num,target
0,Family Notices,"37,34,42,39,39,40,46,46,31,34,36,38,37,31,46,6...",FUNERALS.\nTHE FRIENDS of the late Mr. JAMES H...,The Sydney Morning Herald,"257,848,837,855,852,848,850,852,852,850,850,85...",Sat 16 Sep 1871,"248,311,337,372,404,437,468,505,548,579,610,64...","687,399,411,400,401,402,401,403,405,404,401,40...",13244999,Page 12,92,7.641304,703,115.974121,33.434783,38.333333,42.354839,12,0
1,LIQUORS.,"40,30,44,22,29,27,27,27,30,29,27,28,28,27,30,2...","I LIQUORS., ."",'; 'I\nI [l\ W. C0SEN8' CIRCUXA...",The Sydney Morning Herald,"891,891,891,806,833,830,833,833,820,805,832,83...",Sat 7 Jun 1873,"4296,4336,4352,4396,4416,4444,4470,4497,4520,4...","2991,2990,2991,3047,3020,3022,3020,3020,3021,3...",13321999,Page 8,33,9.333333,308,28.421076,26.6875,29.121212,31.515152,8,0
2,Family Notices,"41,37,37,29,28,37,35,29,35,34,36,26,35,35,33,3...","In order to guard against imposition, notices\...",The Mercury,"624,598,599,294,136,624,600,284,628,601,599,20...",Sat 20 May 1893,"774,799,823,854,896,922,950,985,1006,1034,1073...","374,400,400,398,618,372,399,402,372,399,398,58...",13286999,Page 1,40,5.1,204,160.432938,29.666667,34.15,52.15,1,0
3,Advertising,"31,28,30,28,28,30,28,28,27,33,32,28,22,29,22,2...",ABSTBACT OF SALES BY AUCTION THIS DAY.!\nGIBSO...,The Sydney Morning Herald,"847,845,845,794,844,793,846,794,792,845,794,83...",Wed 20 Aug 1873,"2141,2213,2238,2267,2292,2316,2344,2371,2397,2...","2867,2868,2869,2920,2870,2920,2868,2920,2922,2...",13322999,Page 5,29,7.413793,215,185.95506,27.857143,28.965517,23.896552,5,0
4,IMPORTS.—AUGUST 7.,2636,"?. IMFORTS.-AUOUST 7.\nPrima Donna, from Tome ...",The Sydney Morning Herald,336740,Wed 8 Aug 1866,22062229,448225,13134999,Page 4,2,6.5,13,202.0,23.0,31.0,111.5,4,0


## Fetch signals(features) before applying algorithm

In [7]:
features = ['count_line', 'mean_word', 'total_word', 'std_w', 'mean_y', 'mean_h', 'mean_x', 'page_num']

In [8]:
other_x = others[features].values

In [9]:
other_x[other_x > 1000] = 1000
other_x[np.isnan(other_x)] = 0

In [10]:
poem_x = poems[features].values

In [31]:
X = np.concatenate((other_x, poem_x), axis=0)

In [12]:
other_y = others[['target']].values
poem_y = poems[['target']].values
y = np.concatenate((other_y, poem_y), axis=0)

In [13]:
y = y.flatten()

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cross_validation import cross_val_score, KFold
from sklearn import decomposition
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

## Applying different algoriths with only newspaper format signal to check  accurcy

### 1. Applying SVC Classifier(Support Vector Machine)

In [15]:
svc = SGDClassifier()
svc.fit(X, y)
scores = cross_val_score(svc, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.61 (+/- 0.18)


### 2. Appliying Random Forest Classifier

In [16]:
forest = RandomForestClassifier()
forest.fit(X, y)
scores = cross_val_score(forest, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.88 (+/- 0.13)


### 3. Appliying Logistic Regression Classifier

In [78]:
logistic = LogisticRegression()
logistic.fit(X, y)
scores = cross_val_score(logistic, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.73 (+/- 0.21)


### 4. Appliying Naive Bayes Classifier

In [79]:
nb = MultinomialNB()
nb.fit(X, y)
scores = cross_val_score(nb, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.61 (+/- 0.17)


## Find optimization in Random Forest Classifier which is the best classifier

In [32]:
clf = Pipeline([#('pca', PCA()),
                    ('clf', RandomForestClassifier()),
                ])

parameters = {#'pca__n_components': (3, 4, 5, 6, 7),
                'clf__n_estimators': (5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70), 
                'clf__criterion': ('gini', 'entropy'),
                'clf__max_features': ('auto', 'sqrt', 'log2')
             }    

gs_clf = GridSearchCV(clf, parameters, cv=5)
gs_clf.fit(X, y)

best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
print('Score : ', score)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))


('Score : ', 0.914572864321608)
clf__criterion: 'entropy'
clf__max_features: 'sqrt'
clf__n_estimators: 35


## Find important singals  

In [22]:
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt', criterion='gini')
clf.fit(X, y)

print sorted(zip(map(lambda x: round(x, 4), clf.feature_importances_), features), reverse=True)

[(0.174, 'mean_word'), (0.1538, 'page_num'), (0.1484, 'mean_x'), (0.1398, 'total_word'), (0.1358, 'count_line'), (0.1002, 'mean_y'), (0.0764, 'mean_h'), (0.0716, 'std_w')]


#### Best signals

1. mean word per line
1. newspaper page number
1. indent x

### Apply Naive Bayes algorithm on poem and other articles

In [26]:
X = np.concatenate((poems['content'].values, others['content'].values), axis=0)

vectorizer = CountVectorizer(max_df=10, min_df=1)
X = vectorizer.fit_transform(X)
X = TfidfTransformer().fit_transform(X)
  

In [27]:
nb = MultinomialNB()
nb.fit(X, y)

scores = cross_val_score(nb, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.91 (+/- 0.07)


In [28]:
clf = Pipeline([
#                 ('vect', CountVectorizer()),
#                 ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB()),
            ])
    
parameters = {
#                 'tfidf__use_idf': (True, False),
                'clf__alpha': (0,1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4), 
                'clf__fit_prior':(True, False)}    
    
    
gs_clf = GridSearchCV(clf, parameters, cv=5)

gs_clf.fit(X, y)

best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
print('Score : ', score)
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

('Score : ', 0.90954773869346739)
clf__alpha: 0.3
clf__fit_prior: True


### Combine the two best classifiers(RandomForest classifier on newspaper format and Naive Bayes on Content)