In [2]:
# Pandas
import pandas as pd

# Sklearn
import sklearn
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# NLTK
import nltk
from nltk.stem.porter import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
#nltk.download()


print('The pandas version is {}.'.format(pd.__version__))
print('The nltk version is {}.'.format(nltk.__version__))
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The pandas version is 0.25.1.
The nltk version is 3.4.5.
The scikit-learn version is 0.21.3.


In [15]:
# Load cleaned csv file
articles_df = pd.read_csv("abc_articles_df.csv", sep='|', encoding='utf-16')
articles_df.head()

Unnamed: 0,uri,article_category,article_text,description
0,/news/2021-11-24/afl-carlton-ceo-brian-cook-te...,sport,Carlton chief executive Brian Cook has tested...,Carlton AFL boss returns positive COVID test
1,/news/2021-11-24/afl-national-draft-kangaroos-...,sport,Outstanding South Australian prospect Jason H...,Kangaroos take Horne-Francis with number one s...
2,/news/2021-11-24/shaun-murphy-amateurs-not-pla...,sport,Professional snooker player Shaun Murphy says...,Former champion says amateur snooker players s...
3,/news/2021-11-24/teen-star-sophie-dwyer-headli...,sport,Giants goal-attack Sophie Dwyer has been elev...,Rising star Sophie Dwyer earns call-up to Aust...
4,/news/2021-11-24/wbbl-brisbane-heat-vs-adelaid...,sport,Adelaide Strikers spinner Amanda-Jade Welling...,Adelaide Strikers stay alive in WBBL finals wi...


In [13]:
# Show counts and description
articles_df.groupby(['article_category']).count()

Unnamed: 0_level_0,uri,article_text,description
article_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
business,118,112,118
sport,117,116,117


# Preprocessing
Tasks for both NLP

Tasks include:
    - 




In [16]:
# Combine columns and drop uncessary columns

articles_df.dropna(inplace=True) # Drop NA's

# Combine text and description into 1 column
articles_df['article_text_description'] = articles_df['article_text'].fillna('') + articles_df['description'].fillna('')

articles_df.drop(columns=['description','article_text'], inplace=True) # Drop unnecesarry columns

articles_df['bag_of_words'] = articles_df['article_text_description'] # Bag of words for further processing later


# Show counts and description
articles_df.groupby(['article_category']).count()


Unnamed: 0_level_0,uri,article_text_description,bag_of_words
article_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
business,112,112,112
sport,116,116,116


In [17]:
# Fix unicode issues
def fix_unicode_issues(df):  
    df['article_text_description'] = df['article_text_description'].replace(u'â€”', u' ')
    df['article_text_description'] = df['article_text_description'].replace(u'â€“', u' ')  
    
    return df

articles_df = articles_df.apply(fix_unicode_issues, axis=1)

# NLP Task 1

NLP Task 1 will be a classificaction ML task to classify arcles between sport and business.

In [18]:
# Stemming, Lemmating and Cleaning Bag of Words

def nltk_remove_punc_numeric(df_column):
    # Function to remove punctuation and numeric values from bag of words column
    df_column = df_column.split(' ')
    output=[word.lower() for word in df_column if word.isalpha()]
    return ' '.join(output)  
    
def nltk_stemming(df_column):
    # Function to stem words in bag of words column
    df_column = df_column.split(' ')
    stemmer = PorterStemmer() 
    output = [stemmer.stem(word) for word in df_column]
    return ' '.join(output) 

def nltk_lemmatize(df_column):
    # Function to lemmatize words in bag of words column
    df_column = df_column.split(' ')
    lemmatizer = WordNetLemmatizer()
    output = [lemmatizer.lemmatize(word) for word in df_column]
    return ' '.join(output) 
            
def nltk_remove_stopwords(df_column):
    # Function to remove stopwords from bag of words column
    df_column = df_column.split(' ')
    stopWords = set(stopwords.words('english'))
    output=[word.lower() for word in df_column if not word in stopWords]
    return ' '.join(output)  

def run_all_nltk_cleaning(df):    
    # Function to run all NLTK cleaning
    df_prepped = df
    print('Removing Punctuation and Numbers...')
    df_prepped['bag_of_words'] = df['bag_of_words'].apply(nltk_remove_punc_numeric)
    print('Removing Stopwords...')
    df_prepped['bag_of_words'] = df['bag_of_words'].apply(nltk_remove_stopwords)
    print('Applying Lemmatizing...')
    df_prepped['bag_of_words'] = df['bag_of_words'].apply(nltk_lemmatize)
    print('Applying Stemming...')
    df_prepped['bag_of_words'] = df['bag_of_words'].apply(nltk_stemming)
    
    return df_prepped

articles_df  = run_all_nltk_cleaning(articles_df)
articles_df.head(10)

Removing Punctuation and Numbers...
Removing Stopwords...
Applying Lemmatizing...
Applying Stemming...


Unnamed: 0,uri,article_category,article_text_description,bag_of_words
0,/news/2021-11-24/afl-carlton-ceo-brian-cook-te...,sport,Carlton chief executive Brian Cook has tested...,carlton chief execut brian cook test posit for...
1,/news/2021-11-24/afl-national-draft-kangaroos-...,sport,Outstanding South Australian prospect Jason H...,outstand south australian prospect jason selec...
2,/news/2021-11-24/shaun-murphy-amateurs-not-pla...,sport,Professional snooker player Shaun Murphy says...,profession snooker player shaun murphi say ama...
3,/news/2021-11-24/teen-star-sophie-dwyer-headli...,sport,Giants goal-attack Sophie Dwyer has been elev...,giant sophi dwyer elev diamond tour invite sen...
4,/news/2021-11-24/wbbl-brisbane-heat-vs-adelaid...,sport,Adelaide Strikers spinner Amanda-Jade Welling...,adelaid striker spinner wellington produc best...
5,/news/2021-11-25/23-under-23-a-league-women-pl...,sport,Australia's top professional women's competit...,top profession competit long greenhous produc ...
6,/news/2021-11-25/a-league-women-lisa-de-vanna-...,sport,Matildas great Lisa De Vanna will chase anoth...,matilda great lisa de vanna chase anoth titl s...
7,/news/2021-11-25/afl-draft-live-blog-second-ro...,sport,Fremantle wasted no time claiming Matt Johnso...,fremantl wast time claim matt johnson kick sec...
8,/news/2021-11-25/australians-josh-giddey-patty...,sport,Young Australian Josh Giddey has again flirte...,young australian josh giddey flirt creat nba b...
9,/news/2021-11-25/candice-warner-weighs-in-on-t...,sport,"Candice Warner says she is ""concerned"" about ...",candic warner say cricket stanc allow tim pain...


## Encode Categorical Target Variable for Classification

In [23]:
# Encode 2 classes from 'Article Category' as Binary encoding
articles_df['article_category'].astype('category')

lb = preprocessing.LabelBinarizer() # Create binarizer

# Fit binarizer to category variable, transfrom variable and store to new variable
articles_df['article_category_bin'] = lb.fit_transform(articles_df['article_category'])

articles_df.head()


Unnamed: 0,uri,article_category,article_text_description,bag_of_words,article_category_bin
0,/news/2021-11-24/afl-carlton-ceo-brian-cook-te...,sport,Carlton chief executive Brian Cook has tested...,carlton chief execut brian cook test posit for...,1
1,/news/2021-11-24/afl-national-draft-kangaroos-...,sport,Outstanding South Australian prospect Jason H...,outstand south australian prospect jason selec...,1
2,/news/2021-11-24/shaun-murphy-amateurs-not-pla...,sport,Professional snooker player Shaun Murphy says...,profession snooker player shaun murphi say ama...,1
3,/news/2021-11-24/teen-star-sophie-dwyer-headli...,sport,Giants goal-attack Sophie Dwyer has been elev...,giant sophi dwyer elev diamond tour invite sen...,1
4,/news/2021-11-24/wbbl-brisbane-heat-vs-adelaid...,sport,Adelaide Strikers spinner Amanda-Jade Welling...,adelaid striker spinner wellington produc best...,1


## Create Feature Vector

In [25]:
# TF-IDF

# Create TF-IDF vectorizer
tfidf_vec = TfidfVectorizer(max_features=1000, ngram_range=(1, 2)) 

# Fit and transform to 'bag of words' and convert to nd.array 
X = tfidf_vec.fit_transform(articles_df['bag_of_words']).toarray() # Fit and transform to 'bag of words' and convert to nd.array 

print(X.shape) # Show X shape

#Create target variable as nd.array
y = articles_df['article_category_bin'].values 


print(y.shape)# Show X shape


(228, 1000)
(228,)


## Training Test Split

In [26]:
# Split population in test and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 0)


## Models

### Training

In [38]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=100, random_state=0)#, max_depth=2

classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [34]:
#Import Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB

#Create a Gaussian Classifier
classifier = GaussianNB()

# Train the model using the training sets
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

### Evaluation

In [39]:
# Perform 5 fold cross validation
from sklearn.model_selection import cross_val_score

# Create 5 folds
fold_accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 5)

# 5 fold cross validation results
print("Accuracy: {:.2f}%".format(fold_accuracies.mean()*100))
print("Standard Deviation: {:.2f}%".format(fold_accuracies.std()*100))

Accuracy: 95.61%
Standard Deviation: 3.24%


In [40]:
# Measure performance against test set
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred)*100))


[[12  0]
 [ 0 11]]
Accuracy: 100.00%


In [41]:
# Add visualisation

print(y_test)
print(y_pred)

tn, fp, fn, tp = cm.ravel()
(tn, fp, fn, tp)

[0 1 1 0 0 1 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1 0]
[0 1 1 0 0 1 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1 0]


(12, 0, 0, 11)

# NLP Task 2


NLP Task 2 will be a NER model to extra the names of entities in the corpus. A secondary step will be to match it to match ASX metadata using fuzzy wuzzy.

In [None]:

a = '/news/2021-11-26/shopping-black-friday-cyber-monday-deals-discounts-retail-sales/100653396'
articles_df[articles_df['uri']==a].head(1)['article_text_description'].values



In [49]:
# Filter business articles only
business_articles_df = articles_df[articles_df['article_category']=='business'].copy()

In [50]:
# Extract Potential Named Entities 
def get_potential_entities(df_row):
    # Input is a whole dataframe row. Expects 'article_text_description' variable to be present
    
    potential_entities = set() # Output variable
    document = df_row['article_text_description']
    
    # Get Parts of Speech using NLTK
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    sentences = [nltk.pos_tag(sentence) for sentence in sentences]
    
    # Chunking
    chunck_pos_regex = """NP: {<NN.>*<NNP.?>+<NN.>*}""" # Identify proper nouns 
    chunck_parser = nltk.RegexpParser(chunck_pos_regex) # Create Parser
    
    # Loop through each sentence in text, parse chunks in sentence and store into temp variable
    for sentence in sentences:
        result = chunck_parser.parse(sentence)
        
        # Filter for proper noun chunks only
        subtrees = result.subtrees(filter=lambda t: t.label() == 'NP')
        named_words = None
        for subtree in subtrees:        
            named_words = [word[0] for word in subtree.leaves()]
            named_words = ' '.join(named_words)
        if named_words is not None: potential_entities.add(named_words)

    return potential_entities




business_articles_df['potential_entities'] = business_articles_df.apply(get_potential_entities, axis=1)





In [51]:

business_articles_df.head()


Unnamed: 0,uri,article_category,article_text_description,bag_of_words,article_category_bin,potential_entities
25,/news/2021-11-26/nats-egypt-unveils-renovated-...,business,Egyptian authorities have unveiled a renovate...,egyptian author unveil renov ancient promenad ...,0,"{Thursday, BC, Cairo, Luxor, Nile, COVID-19, C..."
29,/news/2021-11-26/shopping-black-friday-cyber-m...,business,It used to be an American-only affair. The te...,use term black friday reportedli origin factor...,0,"{Black Friday deals, Cyber Monday, Boxing Day,..."
30,/news/2021-11-26/three-arrested-for-alleged-mo...,business,South Australia Police have charged two men a...,south australia polic charg two men woman drug...,0,"{Adelaide Magistrates Court, Kings Park, SA, S..."
31,/news/2021-11-26/uranium-miner-vimy-mulga-rock...,business,The company developing Western Australia's fi...,compani develop western first uranium mine say...,0,"{Kalgoorlie-Boulder, January, Western Australi..."
40,/news/2021-11-27/government-pays-three-capes-w...,business,A commercial walking company in Tasmania has ...,commerci walk compani tasmania receiv hundr th...,0,"{ABC, Parks, ABC News, Tasmania, State, Three ..."
