In [68]:
import pandas as pd
import glob
import string
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords

In [69]:
# Uncomment if running for the first time

html_files = glob.glob('../data/ABC/*/*.html')
html_files[:5]

['../data/ABC/documentary/hannah-gadsby-s-nakedy-nudes.html',
 '../data/ABC/documentary/that-christmas-video-RN1911H029S00.html',
 '../data/ABC/documentary/wimbledon-kidnapping.html',
 '../data/ABC/documentary/exposed-the-ghost-train-fire.html',
 '../data/ABC/documentary/muster-dogs.html']

In [3]:
print(123)

123


In [70]:
# Uncomment if running for the first time

### Extract metadata from html files

raw_data = []

for file in html_files:
    category_name = file.split('/')[3] # extract category name from folder name

    soup = BeautifulSoup(open(file, 'rb'), 'html.parser')
    content = {}

    # extract title of article
    if soup.find('meta',  attrs={'property': 'og:title'}):
      title = soup.find('meta',  attrs={'property': 'og:title'})['content']
    else:
      title = ''

    # extract description of article
    if soup.find('meta',  attrs={'name': 'description'}):
      description = soup.find('meta',  attrs={'name': 'description'})['content']
    else:
      description = ''

    # extract article url
    if soup.find('meta',  attrs={'property': 'og:url'}):
      url = soup.find('meta',  attrs={'property': 'og:url'})['content']
    else:
      url = ''

    # extract image from the article
    if soup.find('meta', attrs={'property':'og:image'}):
        thumbnail = soup.find('meta', attrs={'property':'og:image'})['content']
    else:
        thumbnail = ''

    # put metadata extracts to a dictionary
    content = {
        'title': title,
        'description': description,
        'category_name': category_name,
        'thumbnail':thumbnail,
        'url': url
    }

    # append the article to the data
    raw_data.append(content)

raw_data

[{'title': "Hannah Gadsby's Nakedy Nudes",
  'description': 'Hannah Gadsby unravels the apparently simple practice of recreating our own nude human form. Taking a close look at one of the most enduring subjects in western art history.',
  'category_name': 'documentary',
  'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/ac/AC1603H_5c3ec3155375e_1280.jpg',
  'url': 'https://iview.abc.net.au/show/hannah-gadsby-s-nakedy-nudes'},
 {'title': 'That Christmas',
  'description': 'Eleven poignant, funny and powerful stories about Christmas like no other, told by the people who lived through them. Some of the storytellers are familiar faces, some are new.',
  'category_name': 'documentary',
  'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/rn/RN1911H029S00_5fbf352f4954e_1920.jpg',
  'url': 'https://iview.abc.net.au/show/that-christmas/video/RN1911H029S00'},
 {'title': 'The Wimbledon Kidnapping',
  'description': "Exploring the mysterious disappearance of Muriel McKay, the UK's first kidna

In [71]:
# Uncomment if running for the first time

### Convert dictionary to DataFrama and save csv


df = pd.DataFrame(raw_data)
df.tail()

#save to csv
df.to_csv("../data/ABC_parsed.csv", index=False, sep=';')


In [72]:
# Read the .csv file from above

df = pd.read_csv("../data/ABC_parsed.csv", sep=';')
df.tail(10)

Unnamed: 0,title,description,category_name,thumbnail,url
1528,Accidents Happen,A wickedly funny and surprisingly moving fable...,drama,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW297...,https://iview.abc.net.au/show/accidents-happen...
1529,Ackley Bridge,"In a Yorkshire town, the merging of two school...",drama,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW274...,https://iview.abc.net.au/show/ackley-bridge
1530,Train To Busan,"As a zombie outbreak sweeps South Korea, a fat...",drama,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW286...,https://iview.abc.net.au/show/train-to-busan/v...
1531,The Warriors,"A struggling football team, the Warriors, plac...",drama,https://cdn.iview.abc.net.au/thumbs/i/ip/IP150...,https://iview.abc.net.au/show/warriors
1532,Human Touch,A woman raising money for her choir finds hers...,drama,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW289...,https://iview.abc.net.au/show/human-touch/vide...
1533,Veneno,Valeria is in for the adventure of a lifetime ...,drama,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW307...,https://iview.abc.net.au/show/veneno
1534,Starter For 10,The spirited coming-of-age tale of Brian Jacks...,drama,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW286...,https://iview.abc.net.au/show/starter-for-10/v...
1535,The Larkins At Christmas,"The Larkins are preparing for Christmas, but t...",drama,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW313...,https://iview.abc.net.au/show/larkins-at-chris...
1536,13 Assassins,When the sadistic excesses of Lord Matsudaira ...,drama,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW284...,https://iview.abc.net.au/show/13-assassins/vid...
1537,Bucket,"When eccentric, free-spirited Mim tells her hi...",drama,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW122...,https://iview.abc.net.au/show/bucket


In [73]:
# Check counts of each category
df.category_name.value_counts()

drama                   240
documentary             210
education               181
kids                    171
family                  155
movies                  143
comedy                  130
arts-and-culture        115
news-current-affairs    102
lifestyle                53
panel-and-discussion     38
Name: category_name, dtype: int64

In [74]:
# Preprocessing through lemmatizing using NLTK.
lemmatizer = nltk.stem.WordNetLemmatizer()

In [75]:
# Adding a column that contains both category and description
# This column
df['category_title_description'] = df['category_name'] + ' ' + df['title'] + ' ' + df['description']
df.tail(5)

# print(df.columns)  #to check if the columns are right


Unnamed: 0,title,description,category_name,thumbnail,url,category_title_description
1533,Veneno,Valeria is in for the adventure of a lifetime ...,drama,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW307...,https://iview.abc.net.au/show/veneno,drama Veneno Valeria is in for the adventure o...
1534,Starter For 10,The spirited coming-of-age tale of Brian Jacks...,drama,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW286...,https://iview.abc.net.au/show/starter-for-10/v...,drama Starter For 10 The spirited coming-of-ag...
1535,The Larkins At Christmas,"The Larkins are preparing for Christmas, but t...",drama,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW313...,https://iview.abc.net.au/show/larkins-at-chris...,drama The Larkins At Christmas The Larkins are...
1536,13 Assassins,When the sadistic excesses of Lord Matsudaira ...,drama,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW284...,https://iview.abc.net.au/show/13-assassins/vid...,drama 13 Assassins When the sadistic excesses ...
1537,Bucket,"When eccentric, free-spirited Mim tells her hi...",drama,https://cdn.iview.abc.net.au/thumbs/i/zw/ZW122...,https://iview.abc.net.au/show/bucket,"drama Bucket When eccentric, free-spirited Mim..."


In [76]:

def clean_text(x, stop_words):
  if pd.isna(x) == False:
    x = x.lower()
    x = x.translate(str.maketrans(' ', ' ', string.punctuation))
    x = x.strip()

    x = nltk.word_tokenize(x)
    x = [token for token in x if not token in stop_words]
    x = ' '.join([lemmatizer.lemmatize(w) for w in x])
  else:
    x = ''

  return x

In [77]:
# stop_words = stopwords.words('english')
stop_words = nltk.corpus.stopwords.words('english')
# print(stop_words[:10])

df['Text'] = df['category_title_description'].apply(lambda x: clean_text(x, stop_words))

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn import cluster

vectorizer = TfidfVectorizer(min_df=2, max_df=0.9, norm='l2')
X = vectorizer.fit_transform(df['Text'])
tf_idf = pd.DataFrame(data = X.toarray(), columns=vectorizer.get_feature_names())

# tf_idf   # Uncomment to display the tf_idf matrix




In [None]:
## Use silhouette score to find if there is a significantly optimal number of clusters

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

range_n_clusters = list (range(2,40))

for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters)
    preds = clusterer.fit_predict(tf_idf)
    centers = clusterer.cluster_centers_

    score = silhouette_score(tf_idf, preds)

    ## Uncomment to see silhouette score for n = [2,40]
    # print("For n_clusters = {}, silhouette score is {})".format(n_clusters, score))

In [80]:
clusters = 10
kmeanModel = KMeans(n_clusters=clusters,
                    init='k-means++',
                    max_iter=3000,
                    random_state=0)
mod = kmeanModel.fit_transform(tf_idf)
df['k_means'] = kmeanModel.predict(tf_idf)

In [None]:
order_centroids = kmeanModel.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
dict = []

# Print top terms in each cluster
for i in range(clusters):
  print('%d' % i, sep='', end=', '),
  for ind in order_centroids[i, :20]:
    print(terms[ind], sep='', end=', ')
  print('')

In [82]:
# Check the size of each cluster
df.k_means.value_counts(ascending=False)

2    465
9    262
3    152
5    148
7    121
6    114
0    101
4     87
1     47
8     41
Name: k_means, dtype: int64

In [84]:
df.columns

Index(['title', 'description', 'category_name', 'thumbnail', 'url',
       'category_title_description', 'Text', 'k_means'],
      dtype='object')

In [85]:
# Save the dataframe to .csv for later use
df[['title', 'description', 'category_name', 'thumbnail', 'url', 'category_title_description', 'Text', 'k_means']].to_csv('../data/abc.csv')
