### Import necessary packages

In [213]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

### Connect to Target website, retrieve links for articles

In [214]:
# Download the football index page and parse it so article links can be extracted.
main_url = 'https://www.theguardian.com/football'
# Bound the wait so an unresponsive server cannot hang the notebook indefinitely.
main_request = requests.get(main_url, timeout=30)
# Fail early on an HTTP error status instead of parsing an error page for links.
main_request.raise_for_status()
main_soup = BeautifulSoup(main_request.content,'html5lib')

In [215]:
# Gather the href of every anchor found inside the 'fc-item__container' divs.
# Duplicates are kept here; the list is used as-is by whatever reads main_all_links next.
main_all_links = []
main_item__containers = main_soup.findAll('div',attrs={'class':'fc-item__container'})
for main_item__container in main_item__containers:
    # href=True skips anchors that have no href attribute, which would
    # otherwise raise KeyError on the lookup below.
    main_links = main_item__container.findAll('a', href=True)
    for main_link in main_links:
        main_all_links.append(main_link['href'])

### Remove non-article links

In [216]:
# De-duplicate the collected links (np.unique also sorts them) and drop
# Instagram links, which are not articles.
# The filter builds a new list: calling remove() on a list while iterating
# over it skips the element after each removal, so two adjacent Instagram
# links would leave the second one behind.
main_unique_links = [
    link
    for link in np.unique(np.array(main_all_links)).tolist()
    if 'www.instagram.com' not in link
]

### Use Beautiful Soup to scrape each article for headline and text.

In [217]:
# Download every candidate link and record its headline and body text.
# NOTE(review): 'dcr-18hrynw' looks like a generated CSS class name and may
# change when the site is redeployed - confirm it still matches body paragraphs.
article_records = []
for unique_link in main_unique_links:
    article_url = unique_link
    # Time out instead of letting one unresponsive page hang the whole scrape.
    article_request = requests.get(article_url, timeout=30)
    article_soup = BeautifulSoup(article_request.content,'html5lib')
    # A page without an <h1> gets an empty headline rather than raising IndexError.
    headline_tag = article_soup.find('h1')
    article_headline = headline_tag.get_text() if headline_tag is not None else ''
    article_content = article_soup.findAll('p',attrs={'class':'dcr-18hrynw'})
    # Every paragraph is followed by one space; pages with no matching
    # paragraphs therefore yield an empty string.
    article_text = ''.join(text_section.get_text() + ' ' for text_section in article_content)
    article_records.append([unique_link,article_headline,article_text])

# Build the frame once from the collected rows instead of appending row by row.
article_df = pd.DataFrame(article_records, columns=['link','articleHeadline','articleText'])

### Remove links that are not actual articles, such as quizzes and interactives

In [218]:
# Keep only the rows whose scraped text is non-empty.
has_text = article_df['articleText'].ne('')
actual_articles = article_df.loc[has_text]

In [219]:
# Plain Python list of the article bodies, in the same row order as actual_articles.
actual_article_text_list = list(actual_articles['articleText'])

### Convert list of article text into a Sparse Matrix

In [220]:
# Vectorise the article texts into a TF-IDF matrix (one row per article).
tfidf = TfidfVectorizer()
csr_mat = tfidf.fit_transform(actual_article_text_list)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on releases that lack it.
# .tolist() keeps `words` a plain list, as the old method returned.
if hasattr(tfidf, 'get_feature_names_out'):
    words = tfidf.get_feature_names_out().tolist()
else:
    words = tfidf.get_feature_names()



In [221]:
# Inspect the type of the matrix returned by tfidf.fit_transform.
type(csr_mat)

scipy.sparse.csr.csr_matrix

### Create an ML pipeline that applies Truncated SVD for dimensionality reduction and then a KMeans clustering model

In [222]:
# Both estimators are randomised; without a fixed seed the cluster labels
# change on every run, so the printed clusters cannot be reproduced.
RANDOM_STATE = 42
svd = TruncatedSVD(n_components=50, random_state=RANDOM_STATE)
# n_init is pinned because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=RANDOM_STATE)
# Reduce dimensionality first, then cluster the reduced representation.
pipeline = make_pipeline(svd,kmeans)

### Fit the pipeline to the article text sparse matrix and perform clustering

In [223]:
# Fit the SVD + KMeans pipeline on the TF-IDF matrix, then assign every
# article to a cluster (fit() returns the pipeline itself, so the calls chain).
labels = pipeline.fit(csr_mat).predict(csr_mat)
titles = actual_articles['articleHeadline']

# One row per article: its cluster label and headline, ordered by label.
clustered_df = (
    pd.DataFrame({'label': labels, 'article': titles})
    .sort_values('label')
)

### Display final clustered data

In [224]:
# Print each cluster id followed by the headlines assigned to it, one per line.
for cluster_id in clustered_df['label'].unique():
    print('Cluster Label: ', cluster_id)
    in_cluster = clustered_df['label'] == cluster_id
    print(*clustered_df.loc[in_cluster, 'article'], sep='\n')

Cluster Label:  0
Burnley v Liverpool: match preview
Tottenham v Wolves: match preview
Leicester v West Ham: match preview
Cluster Label:  1
Raheem Sterling hits hat-trick in easy Manchester City win at Norwich
Brentford held by Crystal Palace after welcoming Christian Eriksen
Neal Maupay stunner inspires Brighton to victory against struggling Watford
The Joy of Six: Leeds United v Liverpool Premier League meetings
European roundup: Bayern Munich suffer shock defeat at Bochum
Nagelsmann v Tedesco lives up to hype as Leipzig push Bayern all the way
Michael Keane helps sink Leeds to give Frank Lampard first Everton league win
Championship roundup: Mitrovic fires Fulham again as Bournemouth rally
Women’s Super League: talking points from the weekend’s action
Africa Cup of Nations review: sorrow, anger and Mané’s redemption 
Harry Kane single-handedly offers glimmer of light amid gloom at Spurs
Lessons from Concacaf qualifying: US insecurity in the cold and Canada’s surge
Athletic Bilbao s