In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.externals import joblib
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.svm import LinearSVC



In [20]:
# Becuase the file is large it uses the low_memory option 
# to read the file in chunks. The result of this is that it
# it infers different column types in the different chunks,
# solve this either by reading the file in a oner with low_memory=False,
# or be explicit about the column types by setting them with dtype = {'column':type}

labelled_level2 = pd.read_csv(
    '../../../../data/labelled_level2.csv',
    low_memory=False
)

In [21]:
labelled_level2.shape

(173560, 21)

In [22]:
# Collapse down World

labelled_level2.loc[labelled_level2['level1taxon'] == 'World', 'level2taxon'] = 'world_level1'

In [23]:
# There are are 210 unique taxons remaining in level2taxon

labelled_level2['level2taxon'].nunique()

210

In [24]:
# Drop cases where there are multiple taxons applied to a content item.
# This is the simplest solution to the multiple class problem

print('Before:', labelled_level2.shape)
labelled_level2.drop_duplicates(subset=['content_id'], inplace=True, keep='first')
print('After: ', labelled_level2.shape)

Before: (173560, 21)
After:  (114048, 21)


In [7]:
# ----- PIPELINE THE DATA-PREP INSTEAD OF DOING IN MODEL DEFINITION -----
comment_pipeline = Pipeline([
                   ('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer(sublinear_tf=True, use_idf=True)),
                   ('anova', SelectPercentile())
])

text = comment_pipeline.fit_transform(labelled_level2['combined_text'],labelled_level2['level2taxon'])


  f = msb / msw


In [9]:
# ----- PREPARE TRAIN/TEST DATA -----
X_train, X_test, y_train, y_test = train_test_split(
    text, labelled_level2['level2taxon'], test_size = 0.2, random_state=1337)

In [18]:
# ----- MACHINE LEARNING MODEL -----
svc_clf = Pipeline([
                   ('svc', LinearSVC(C=10.0, dual=False, loss="squared_hinge", penalty="l2",
                                     multi_class='ovr', tol=0.01))
                   ])

In [19]:
svc_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('svc', LinearSVC(C=10.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.01,
     verbose=0))])

In [12]:
""" Predict the test dataset using LinearSVC"""
predicted_svc = svc_clf.predict(X_test)
print('SVC correct prediction: {:4.2f}'.format(np.mean(predicted_svc == y_test)))

SVC correct prediction: 0.81


In [15]:
rep = classification_report(y_test, predicted_svc)
print(rep)
to_table(rep)

                                                                        precision    recall  f1-score   support

                                         Administrative justice reform       0.00      0.00      0.00         3
                                     Adoption, fostering and surrogacy       0.83      0.83      0.83         6
                                                 Armed Forces Covenant       0.33      0.25      0.29         4
                           Armed forces and Ministry of Defence reform       0.00      0.00      0.00         2
                         Armed forces support for activities in the UK       0.00      0.00      0.00         1
                                                      Arts and culture       0.76      0.80      0.78       270
                                        Assessing environmental impact       0.50      1.00      0.67         1
                                                                Asylum       0.50      0.75      0.60  

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


[['', 'precision', 'recall', 'f1-score', 'support'],
 ['Administrative', 'justice', 'reform', '0.00', '0.00', '0.00', '3'],
 ['Adoption,', 'fostering', 'and', 'surrogacy', '0.83', '0.83', '0.83', '6'],
 ['Armed', 'Forces', 'Covenant', '0.33', '0.25', '0.29', '4'],
 ['Armed',
  'forces',
  'and',
  'Ministry',
  'of',
  'Defence',
  'reform',
  '0.00',
  '0.00',
  '0.00',
  '2'],
 ['Armed',
  'forces',
  'support',
  'for',
  'activities',
  'in',
  'the',
  'UK',
  '0.00',
  '0.00',
  '0.00',
  '1'],
 ['Arts', 'and', 'culture', '0.76', '0.80', '0.78', '270'],
 ['Assessing', 'environmental', 'impact', '0.50', '1.00', '0.67', '1'],
 ['Asylum', '0.50', '0.75', '0.60', '4'],
 ['Attorney',
  'General',
  'guidance',
  'to',
  'the',
  'legal',
  'profession',
  '0.75',
  '0.75',
  '0.75',
  '4'],
 ['Aviation', '0.76', '0.68', '0.72', '19'],
 ['Benefits', 'entitlement', '0.00', '0.00', '0.00', '7'],
 ['Biodiversity', 'and', 'ecosystems', '0.60', '0.53', '0.56', '51'],
 ['Brexit', '0.84', '0.

In [14]:
def to_table(report):
    report = report.splitlines()
    res = []
    res.append(['']+report[0].split())
    for row in report[2:-2]:
       res.append(row.split())
    lr = report[-1].split()
    res.append([' '.join(lr[:3])]+lr[3:])
    return res

## Metadata Classifier

In [None]:
labelled_level2.columns

In [None]:
metadata = labelled_level2.filter(['document_type', 'primary_publishing_organisation', 
                                   'publishing_app', 'document_type_gp', 'locale'], axis=1)

In [None]:
meta = meta_pipeline.fit_transform(metadata2,labelled_level2['level2taxon'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    meta, labelled_level2['level2taxon'], test_size = 0.2, random_state=1337)

In [None]:
ada_clf = AdaBoostClassifier()

ada_clf.fit(X_train, y_train)

In [None]:
predicted_ada = ada_clf.predict(X_test)

In [None]:
print('Adaboost correct prediction: {:4.2f}'.format(np.mean(predicted_ada == y_test)))

In [None]:
print(classification_report(y_test, predicted_ada))

In [None]:
from sklearn.ensemble import RandomForestClassifier

rfo_clf = RandomForestClassifier()
rfo_clf.fit(X_train, y_train)

In [None]:
predicted_rfo = rfo_clf.predict(X_test)
print('Adaboost correct prediction: {:4.2f}'.format(np.mean(predicted_rfo == y_test)))

In [None]:
print(classification_report(y_test, predicted_ada))

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
predicted_rfo = rfo_clf.predict_proba(X_test)
predicted_ada = ada_clf.predict_proba(X_test)

In [None]:
blended = pd.DataFrame(np.c_[
    predicted_rfo, predicted_ada
])

In [None]:
blended_model = LogisticRegression(random_state=1337)
blended_model.fit(blended, y_test)

In [None]:
blended_pred = blended_model.predict(blended)
class_report = classification_report(y_test, blended_pred)
print(class_report)
blended_model.get_params()

In [None]:
# Double check this with CV

scores = cross_val_score(blended_model, X_test, y_test,
                        scoring='accuracy',cv=10)

print(round(scores.mean(),3))
print(round(scores.std(),3))
#72