#### Scraping and pre-processing a corpus of news article from a set of web-pages and evaluating the performance of automated classification of these articles in a supervised learning context. 

## Part 1

## Data Collection

### Imports

In [2]:
import numpy as np
import pandas as pd
import sklearn
import nltk
import scipy
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

### Retrieving the monthly news URL's from the website

In [49]:
r  = requests.get("http://mlg.ucd.ie/modules/COMP41680/archive/index.html")

data = r.text

soup = BeautifulSoup(data, "lxml")
monthly_news = []

lists = soup.find_all('li')
    
for link in lists:
    monthly_news.append(link.find("a").get('href'))

print("Scraped %d months news lists" % len(monthly_news) )

Scraped 12 months news lists


### Retrieving news urls and category lables for each month

In [50]:
articles = {} # Creating a dictionary to store articles and category

for month in monthly_news:
    
    r  = requests.get("http://mlg.ucd.ie/modules/COMP41680/archive/" + month)

    data = r.text

    soup = BeautifulSoup(data, "lxml")
    
    
    news_list = soup.find('tbody').find_all('tr')

    
    for news in news_list:
        category_td = news.find("td", {"class": "category"})
        category = category_td.text.strip() #Removing the escape characters from category
                
        if (category != 'N/A'):
            news_links = news.find("td", {"class": "title"})
            news_href = news_links.find("a").get('href')
            
            article_link = "http://mlg.ucd.ie/modules/COMP41680/archive/" + news_href
            
            r  = requests.get(article_link)

            data = r.text

            soup = BeautifulSoup(data, "lxml") 
            
            article_text = ''
            article = soup.find("div", {"class":"main"}).findAll('p')
            for element in article:
                article_text += ' '.join(element.findAll(text = True))
                
            
            article_text = article_text.replace('"', '\\"')
            articles[article_text] = category  

print("Scraped %d news corpus" % len(articles) )
    

Scraped 1394 news corpus


### Storing the articles to a csv file

In [46]:
fout = "articles.csv"
fo = open(fout, "w")

fo.write('Article|Category\n')
for k, v in articles.items():
    fo.write(str(k) + '|'+ str(v) + '\n')

fo.close()

## Part 2

## Text Classification

### Imports

In [98]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics


### Reading csv file into a dataframe

In [53]:
raw_data = pd.read_csv("articles.csv", delimiter="|",engine='python')
raw_data.head()

Unnamed: 0,Article,Category
0,The sporting industry has come a long way sinc...,technology
1,Asian quake hits European sharesShares in Euro...,business
2,BT is offering customers free internet telepho...,technology
3,Barclays shares up on merger talkShares in UK ...,business
4,England centre Olly Barkley has been passed fi...,sport


In [63]:
articles_list = raw_data['Article'].tolist() #Getting news corpus to data
print(len(articles_list))

1394


### Text Pre-processing and Term Weighting

To process the corpuses we will be following these steps:
1. Splitting raw text to tokens (Tokenization)
2. Converting all text to lower case
3. Removing short terms and stop words
4. Stem/Lemmatise tokens
5. Filter out infrequent terms
6. Giving weights to terms
7. Creating a document term matrix

#### Text Preprocessing

By default Scikit-learn converts tokens to lowercase and removes
tokens of length 1 (i.e. single letters).

In [76]:
vectorizer = CountVectorizer(stop_words="english",  min_df = 5) # Preprocessing data to remove stop words and filter out terms appearing in less than 5 documents 
X = vectorizer.fit_transform(articles_list)

print(filtered_tokens.shape)

(1394, 6703)


This process also build a vocabulary for the corpus, both in the form of a list and in the form of a dictionary:

In [77]:
terms = vectorizer.get_feature_names()
vocab = vectorizer.vocabulary_
print("Vocabulary has %d distinct terms" % len(terms))

Vocabulary has 6703 distinct terms


Display some sample terms:

In [78]:
print(terms[500:530])

['appointed', 'appointment', 'appreciate', 'approach', 'approached', 'appropriate', 'approval', 'approvals', 'approve', 'approved', 'april', 'arabia', 'arbitration', 'arcade', 'arch', 'architecture', 'arcy', 'area', 'areas', 'aren', 'arena', 'argentina', 'argentine', 'arguably', 'argue', 'argued', 'argues', 'arguing', 'argument', 'arguments']


To use NLTK lemmatisation with Scikit-learn, we need to create a custom tokenisation function:

In [75]:
# define the function
def lemma_tokenizer(text):
    # use the standard scikit-learn tokenizer first
    standard_tokenizer = CountVectorizer().build_tokenizer()
    tokens = standard_tokenizer(text)
    # then use NLTK to perform lemmatisation on each token
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemma_tokens = []
    for token in tokens:
        lemma_tokens.append( lemmatizer.lemmatize(token) )
    return lemma_tokens

Now we can use our custom tokenizer with the standard CountVectorizer approach:

In [79]:
vectorizer = CountVectorizer(stop_words="english",min_df = 5,tokenizer=lemma_tokenizer)
X = vectorizer.fit_transform(articles_list)
print(X.shape)

(1394, 6010)


In [80]:
print(list(vectorizer.vocabulary_.keys())[0:35])

['sporting', 'industry', 'ha', 'come', 'long', 'way', '60', 'niche', 'root', 'deep', 'sport', 'showing', 'sign', 'decline', 'time', 'soon', 'later', 'reason', 'seemingly', 'difference', 'customer', 'fan', 'leader', 'ownership', 'group', 'king', 'explained', 'face', 'ceo', 'business', 'dying', 'position', 'passion', 'certainly', 'going']


#### Term Weighting

We can improve the usefulness of the document-term matrix by giving higher weights to more "important" terms.
The most common normalisation is term frequency–inverse document frequency (TF-IDF). In Scikit-learn, we can generate at TF-IDF weighted document-term matrix by using TfidfVectorizer() in place of CountVectorizer().

In [85]:
# we can pass in the same preprocessing parameters
vectorizer = TfidfVectorizer(stop_words="english",min_df = 5,tokenizer=lemma_tokenizer)
X = vectorizer.fit_transform(articles_list)
# display some sample weighted values
print(X.shape)

(1394, 6010)


#### Document Term Matrix

We can use the same Scikit-learn functionality to create a document-term matrix with N-grams.

In [84]:
# we can pass in the same preprocessing parameters and also We specify an extra parameter ngram_range which specifies the shortest and longest token sequences to include.
vectorizer = TfidfVectorizer(stop_words="english",min_df = 5,tokenizer=lemma_tokenizer,ngram_range = (1,2))
document_matrix = vectorizer.fit_transform(articles_list)
# display some sample weighted values
print(document_matrix.shape)

(1394, 9600)


As we can see this approach significantly increases the size of the vocabulary for a given corpus.

### Text Classification

Before we create classifiers we need to divide the dataset into training and test documents

In [87]:
X = raw_data['Article'].tolist() #Articles
Y = raw_data['Category'].tolist() # Labels

In [90]:
X_train_plus_valid, X_test, y_train_plus_valid, y_test \
    = train_test_split(X, Y, random_state=0, \
                                    train_size = 0.7)

X_train, X_valid, y_train, y_valid \
    = train_test_split(X_train_plus_valid, \
                                        y_train_plus_valid, \
                                        random_state=0, \
                                        train_size = 0.5/0.7)



Using the previous vectorizer to create a document-matrix of training data

In [91]:
train_X = vectorizer.fit_transform(X_train)

In [103]:
model = KNeighborsClassifier(n_neighbors=3)
model.fit(train_X, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [104]:
test_X = vectorizer.transform(X_test)

In [106]:
predicted = model.predict(test_X)

In [108]:
# Print performance details
# accuracy = metrics.accuracy_score(y_train, predicted) # , normalize=True, sample_weight=None
# print("Accuracy: " +  str(accuracy))
# print(metrics.classification_report(y_train, y_pred))