# COMP30810 Intro to Text Analytics 2018
# Homework 2

In [None]:
import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import wordcloud as wc
import matplotlib.pyplot as plt

# Pre-Processing:

### Load data into Dataframe

In [None]:
# Read the training set. The file uses '^' as the field separator and its
# first row (header=0) supplies the column names.
df_handle = pd.read_csv('trainingset.csv',sep='^',header=0)
# Preview the first rows as the cell output.
df_handle.head()

# Tokenization
### Extract Tokens from Raw Text

In [None]:
def extract_tokens(rawtext):
    """Return the list of word tokens found in ``rawtext``.

    A token is a maximal run of word characters, so whitespace and
    punctuation act as separators and never appear in the result.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    tokens = word_tokenizer.tokenize(rawtext)
    return tokens

### Remove Stop Words

In [None]:
stopwords_nltk_en = set(stopwords.words('english'))
# Combine nltk stopwords with some extra ones.
# The extra words are grouped by initial letter. Every string literal is kept
# on a single line: a literal split across lines is a syntax error.
STOP_WORDS = stopwords_nltk_en.union({
    "a", "a's", "able", "about", "above", "according", "accordingly", "across",
    "actually", "after", "afterwards", "again", "against", "ain't", "all", "allow",
    "allows", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "an", "and", "another", "any", "anybody", "anyhow",
    "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear",
    "appreciate", "appropriate", "are", "aren't", "around", "as", "aside", "ask",
    "asking", "associated", "at", "available", "away", "awfully",
    "b", "be", "became", "because", "become", "becomes", "becoming", "been",
    "before", "beforehand", "behind", "being", "believe", "below", "beside",
    "besides", "best", "better", "between", "beyond", "both", "brief", "but", "by",
    "c", "c'mon", "c's", "came", "can", "can't", "cannot", "cant", "cause",
    "causes", "certain", "certainly", "changes", "clearly", "co", "com", "come",
    "comes", "concerning", "consequently", "consider", "considering", "contain",
    "containing", "contains", "corresponding", "could", "couldn't", "course",
    "currently",
    "d", "definitely", "described", "despite", "did", "didn't", "different", "do",
    "does", "doesn't", "doing", "don't", "done", "down", "downwards", "during",
    "e", "each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough",
    "entirely", "especially", "et", "etc", "even", "ever", "every", "everybody",
    "everyone", "everything", "everywhere", "ex", "exactly", "example", "except",
    "f", "far", "few", "fifth", "first", "five", "followed", "following",
    "follows", "for", "former", "formerly", "forth", "four", "from", "further",
    "furthermore",
    "g", "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone",
    "got", "gotten", "greetings",
    "h", "had", "hadn't", "happens", "hardly", "has", "hasn't", "have", "haven't",
    "having", "he", "he's", "hello", "help", "hence", "her", "here", "here's",
    "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "hi", "him",
    "himself", "his", "hither", "hopefully", "how", "howbeit", "however",
    "i", "i'd", "i'll", "i'm", "i've", "ie", "if", "ignored", "immediate", "in",
    "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner",
    "insofar", "instead", "into", "inward", "is", "isn't", "it", "it'd", "it'll",
    "it's", "its", "itself",
    "j", "just",
    "k", "keep", "keeps", "kept", "know", "known", "knows",
    "l", "last", "lately", "later", "latter", "latterly", "least", "less", "lest",
    "let", "let's", "like", "liked", "likely", "little", "look", "looking",
    "looks", "ltd",
    "m", "mainly", "many", "may", "maybe", "me", "mean", "meanwhile", "merely",
    "might", "more", "moreover", "most", "mostly", "much", "must", "my", "myself",
    "n", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs",
    "neither", "never", "nevertheless", "new", "next", "nine", "no", "nobody",
    "non", "none", "noone", "nor", "normally", "not", "nothing", "novel", "now",
    "nowhere",
    "o", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on",
    "once", "one", "ones", "only", "onto", "or", "other", "others", "otherwise",
    "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "own",
    "p", "particular", "particularly", "per", "perhaps", "placed", "please",
    "plus", "possible", "presumably", "probably", "provides",
    "q", "que", "quite", "qv",
    "r", "rather", "rd", "re", "really", "reasonably", "regarding", "regardless",
    "regards", "relatively", "respectively", "right",
    "s", "said", "same", "saw", "say", "saying", "says", "second", "secondly",
    "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self",
    "selves", "sensible", "sent", "serious", "seriously", "seven", "several",
    "shall", "she", "should", "shouldn't", "since", "six", "so", "some",
    "somebody", "somehow", "someone", "something", "sometime", "sometimes",
    "somewhat", "somewhere", "soon", "sorry", "specified", "specify",
    "specifying", "still", "sub", "such", "sup", "sure",
    "t", "t's", "take", "taken", "tell", "tends", "th", "than", "thank", "thanks",
    "thanx", "that", "that's", "thats", "the", "their", "theirs", "them",
    "themselves", "then", "thence", "there", "there's", "thereafter", "thereby",
    "therefore", "therein", "theres", "thereupon", "these", "they", "they'd",
    "they'll", "they're", "they've", "think", "third", "this", "thorough",
    "thoroughly", "those", "though", "three", "through", "throughout", "thru",
    "thus", "to", "together", "too", "took", "toward", "towards", "tried", "tries",
    "truly", "try", "trying", "twice", "two",
    "u", "un", "under", "unfortunately", "unless", "unlikely", "until", "unto",
    "up", "upon", "us", "use", "used", "useful", "uses", "using", "usually", "uucp",
    "v", "value", "various", "very", "via", "viz", "vs",
    "w", "want", "wants", "was", "wasn't", "way", "we", "we'd", "we'll", "we're",
    "we've", "welcome", "well", "went", "were", "weren't", "what", "what's",
    "whatever", "when", "whence", "whenever", "where", "where's", "whereafter",
    "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which",
    "while", "whither", "who", "who's", "whoever", "whole", "whom", "whose", "why",
    "will", "willing", "wish", "with", "within", "without", "won't", "wonder",
    "would", "wouldn't",
    "x",
    "y", "yes", "yet", "you", "you'd", "you'll", "you're", "you've", "your",
    "yours", "yourself", "yourselves",
    "z", "zero",
})

def remove_stopwords(tokens):
    """Return ``tokens`` with every stop word dropped.

    Each token is compared in lower case against ``STOP_WORDS``; the
    tokens that survive keep their original casing and relative order.
    """
    kept = []
    for token in tokens:
        if token.lower() in STOP_WORDS:
            continue
        kept.append(token)
    return kept

### Remove Capitalization

In [None]:
 def decapitalize(tokens):
     """Return a new list holding the lower-case form of every token."""
     lowered = []
     for token in tokens:
         lowered.append(token.lower())
     return lowered

### Remove Salutation

In [None]:
# Titles and honorifics (lower case) that are stripped from the token stream.
SALUTATIONS = ('mr','mrs','mss','dr','phd','prof','rev','professor')

def remove_salutations(tokens):
    """Return ``tokens`` without salutations.

    A token is dropped when its lower-case form is one of ``SALUTATIONS``;
    all other tokens are returned unchanged and in their original order.
    """
    kept = []
    for token in tokens:
        is_salutation = token.lower() in SALUTATIONS
        if not is_salutation:
            kept.append(token)
    return kept

### Remove Numbers

In [None]:
def remove_numbers(tokens):
    """Return ``tokens`` without the tokens that consist only of digits.

    Tokens that merely contain digits (for example ``"b2"``) are kept.
    """
    return list(filter(lambda token: not token.isdigit(), tokens))

### Lemmatization

In [None]:
def transfer_tag(treebank_tag):
    """Map a Penn Treebank POS tag to the WordNet POS letter for lemmatization.

    Only the first character of the tag is inspected and the match is
    case-insensitive, so both ``'JJ'`` and ``'j'`` map to ``'a'``.

    Args:
        treebank_tag: Tag string such as ``'JJ'``, ``'VBD'``, ``'NN'`` or ``'RB'``.

    Returns:
        ``'a'`` (adjective), ``'v'`` (verb), ``'n'`` (noun) or ``'r'`` (adverb).
        Any other tag, including the empty string, yields ``'n'``.
    """
    # The previous `startswith('j' or 'J')` only ever tested the lower-case
    # letter, because `'j' or 'J'` evaluates to 'j'. Upper-case Treebank tags
    # therefore always fell through to the noun default.
    wordnet_pos_by_initial = {'j': 'a', 'v': 'v', 'n': 'n', 'r': 'r'}
    initial = treebank_tag[:1].lower()
    # As default pos in lemmatization is Noun
    return wordnet_pos_by_initial.get(initial, 'n')

In [None]:
def lemmatize(tokens):
    """Return the lemma of every token, using its part of speech as a hint.

    The tokens are POS-tagged together, each tag is reduced to a WordNet
    POS letter via ``transfer_tag``, and the WordNet lemmatizer is applied.
    Tokens of one or two characters are passed through unchanged.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, treebank_tag in nltk.pos_tag(tokens):
        # Only the lower-cased first letter of the tag is handed to transfer_tag.
        wordnet_pos = transfer_tag(treebank_tag[0].lower())
        if len(token) > 2:
            lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
        else:
            lemmas.append(token)
    return lemmas

In [None]:
def tokenize(rawtext):
    """Run the full pre-processing pipeline on ``rawtext`` and return the tokens.

    The steps run in this order: token extraction, lower-casing, stop-word
    removal, salutation removal, number removal, lemmatization.
    """
    tokens = extract_tokens(rawtext)
    tokens = decapitalize(tokens)
    tokens = remove_stopwords(tokens)
    tokens = remove_salutations(tokens)
    tokens = remove_numbers(tokens)
    return lemmatize(tokens)

## Testing the Tokenizer
We'll test the tokenizer on a short sample of text to check for any issues:

In [None]:
# Take the first 201 characters of the first document as a small sample.
sample_text = df_handle.content[0][:201]
# Show the raw sample followed by the tokens produced from it, for a visual check.
print(sample_text)
print(tokenize(sample_text))

The results look fine except for the second token 'bos', which should be 'boss'. This is because the lemmatizer thinks that 'boss' is a plural, and so converts it to the singular form 'bos':

In [None]:
# Lemmatize the single word 'boss' on its own to inspect the lemmatizer's output.
lemmatize(['boss'])

This is a bug in the lemmatizer, so we cannot fix it. Despite this minor issue, we will continue to use lemmatization as it is very useful even if it sometimes makes mistakes.

### Applying tokenization

In [None]:
# Tokenize every document's raw text and store the token list in a new 'tokens' column.
df_handle['tokens'] = df_handle['content'].apply(tokenize)
# Preview the result as the cell output.
df_handle.head()

# TF-IDF

In [None]:
# Build one TF-IDF vector per document: a statistical weighting of how
# important each word is to a document relative to the rest of the corpus.
from sklearn.feature_extraction.text import TfidfVectorizer

# norm='l1' asks the vectorizer to L1-normalise each document vector.
tfidf_vectorizer = TfidfVectorizer(norm='l1')
# The vectorizer is fed strings, so each document's tokens are re-joined with spaces.
document_token_strings = [' '.join(tokens) for tokens in df_handle.tokens]
# .toarray() turns the fit_transform result into a dense array.
tfidf_matrix = tfidf_vectorizer.fit_transform(document_token_strings).toarray()
# list(...) splits the matrix into its rows so that each document gets its own vector.
df_handle['tfidf'] = list(tfidf_matrix)

In [None]:
# Check the Python type of the value stored in the 'tfidf' column (first row).
type(df_handle.head().iloc[0].tfidf)

In [None]:
# Keep only these four columns, in this order; any other column is dropped.
df_handle = df_handle[['content', 'tokens', 'tfidf', 'category']]

In [None]:
# Display the data frame as the cell output.
df_handle

In [None]:
# Write the data frame to disk without the index column.
# NOTE(review): 'tokens' holds lists and 'tfidf' holds arrays, which CSV can
# only store in string form - confirm the file can be read back as intended.
df_handle.to_csv('./tfidf_data.csv', encoding='utf-8', index=False)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing. No random_state is passed, so the
# split is not pinned to a seed here.
train, test = train_test_split(df_handle, test_size=0.33)

In [None]:
# Report how many rows ended up in each split.
print('Train: ' + repr(len(train)))
print('Test: ' + repr(len(test)))

In [None]:
def euclideanDistance(value1, value2):
    """Return the Euclidean (L2) distance between two vectors.

    The operands must support ``value1 - value2`` (for example NumPy arrays
    of equal shape); the norm of that difference is returned.
    """
    difference = value1 - value2
    return np.linalg.norm(difference)

In [None]:
def get_nearest_neighbours(vector):
    """Return ``[category, distance]`` for every row of the global ``train`` frame.

    ``distance`` is the Euclidean distance between the row's 'tfidf' vector
    and ``vector``. The list is in ``train`` row order; it is neither sorted
    nor truncated, so picking the nearest entries is left to the caller.
    """
    return [
        [train_row.category, euclideanDistance(train_row['tfidf'], vector)]
        for _, train_row in train.iterrows()
    ]

In [None]:
# Display the first row of the test split as the cell output.
test.iloc[0]

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
encoded_Tokens = []
# NOTE(review): the loop length comes from `train`, but the rows are read by
# position from `df_handle` - confirm which frame was intended.
for index in range(len(train)):
    # fit_transform re-fits the encoder on this one document's tokens each time.
    # NOTE(review): the integer codes are therefore presumably not comparable
    # between documents - confirm.
    encoded_Tokens.append(le.fit_transform(df_handle.iloc[index]['tokens']))

In [None]:
# Re-fit the same encoder on the category labels and encode them as integers.
encoded_Labels = le.fit_transform(df_handle['category'].tolist())
# Display the encoded labels as the cell output.
encoded_Labels

In [None]:
# Split the encoded tokens and encoded labels with the same 67/33 proportions.
# NOTE(review): each element of df_handle['tokens'] is a list of tokens rather
# than a single label - confirm that LabelEncoder accepts this input.
X_train, X_test, y_train, y_test = train_test_split(le.fit_transform(df_handle['tokens']), encoded_Labels, test_size=0.33)


In [None]:
from sklearn.neighbors import KNeighborsClassifier

#Create KNN Classifier
knn = KNeighborsClassifier(n_neighbors=5)

# The fit and predict calls below are commented out, so `knn` is created but
# never trained or used in this cell.

#Train the model using the training sets
# knn.fit(X_train, y_train)

#Predict the response for test dataset
# y_pred = knn.predict(X_test)

In [None]:
# for index in range(len(encoded_Tokens)):
#     modelknn = KNeighborsClassifier(n_neighbors=5)
#     modelknn.fit(index, encoded_Labels)

# pred_KNN = modelknn.predict(test)

In [None]:
from collections import Counter, defaultdict
from operator import itemgetter

# Hand-rolled k-nearest-neighbour classification of every test document,
# followed by the resulting accuracy.

tp = 0           # number of test documents whose predicted category is correct
sample_size = 5  # not used by this cell; kept in case other cells read it
k = 10           # number of nearest training documents that take part in the vote

for i in range(len(test)):
    test_row = test.iloc[i]
    nearest_neighbours = get_nearest_neighbours(test_row['tfidf'])

    # Sort by ascending distance so that the k *closest* training documents
    # come first. Sorting in descending order would let the k farthest vote.
    sort_NN = sorted(nearest_neighbours, key=itemgetter(1))

    # Tally one vote per neighbour category. The slice copes with k > len(train).
    votes = Counter(category for category, _ in sort_NN[:k])
    final_vote = votes.most_common(1)[0][0]

    # Compare against the label of the document being classified (index i).
    tp += int(final_vote == test_row['category'])

# Fraction of test documents that were classified correctly.
accuracy = tp / len(test)

In [None]:
# Accuracy of the manual KNN over the whole test split.
print(accuracy)
# `final_vote` still holds the prediction made in the last loop iteration,
# i.e. for the last test document only.
print(final_vote)

## Model and Cross-Validation Using Sklearn

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Re-split the data with the default proportions; this replaces the earlier
# `train` and `test` frames.
train, test = train_test_split(df_handle)
# Fit a fresh TF-IDF vectorizer on the raw `content` text of the training
# rows (not on the pre-processed `tokens` column).
tfidf_transformer = TfidfVectorizer()
train_input_vectors = tfidf_transformer.fit_transform(train.content)
train_labels = train.category
# 10-fold cross-validation of a Multinomial Naive Bayes model on the training split.
# NOTE(review): the vectorizer is fitted on all training rows before the folds
# are made, so each fold's TF-IDF weights have seen its validation rows - confirm
# this is acceptable.
scores = cross_val_score(MultinomialNB(), train_input_vectors, train_labels, cv=10)
# Per-fold scores, then their mean.
print(scores)
print(np.mean(scores))

In [None]:
# Train a Multinomial Naive Bayes model (smoothing parameter alpha=1.0) on the
# full training split.
model = MultinomialNB(alpha=1.0)
model.fit(train_input_vectors, train_labels)

In [None]:
# Show the first sentence of the first ten training articles next to the
# model's prediction for each of them.
# Row i of train_input_vectors was built from train.content, so the article
# text must be taken from `train` by position. Reading df_handle.content[i]
# would print a different article from the one being predicted, because
# train_test_split shuffles the rows.
for i, article in enumerate(train.content.iloc[:10]):
    print("Article %s:" % i)
    print(article.split('.')[0])
    print("Model prediction: %s" % model.predict(train_input_vectors[i])[0])
    print()

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Display names of the candidate models. This list is paired by position with
# `classifiers` below, so the two must stay in the same order.
names = [
    "Multinomial Naive Bayes",
    "Nearest Neighbors",
    "AdaBoost",
    "Linear SVM", 
    "RBF SVM",
    "Decision Tree",
    "Random Forest",  
#     "Naive Bayes",
#     "Neural Net",
#     "Gaussian Process",
         ]

# Candidate estimators, in the same order as `names`. The commented-out
# entries are disabled for the reasons noted beside them.
classifiers = [
    MultinomialNB(alpha=1.0),
    KNeighborsClassifier(3),
    AdaBoostClassifier(),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
#     MLPClassifier(alpha=1), # took too long to run
#     GaussianProcessClassifier(1.0 * RBF(1.0)), # needs a 'dense matrix'?
#     GaussianNB(), # also needs a 'dense matrix'?
]

# Cross-validate every candidate on the training vectors and report its mean
# accuracy. zip pairs each display name with the estimator at the same position.
for name, classifier in zip(names, classifiers):
    k_folds = 10
    scores = cross_val_score(classifier, train_input_vectors, train_labels, cv=k_folds)
    average_accuracy = np.mean(scores)

    print("%s average accuracy (%d-fold x-val): %f" 
          % (name, k_folds, average_accuracy))

Results:
```
Multinomial Naive Bayes average accuracy (10-fold x-val): 0.950307
Nearest Neighbors average accuracy (10-fold x-val): 0.892863
AdaBoost average accuracy (10-fold x-val): 0.712065
Linear SVM average accuracy (10-fold x-val): 0.224502
RBF SVM average accuracy (10-fold x-val): 0.944214
Decision Tree average accuracy (10-fold x-val): 0.653080
Random Forest average accuracy (10-fold x-val): 0.323166
```

The Naive Bayes and RBF SVM are by far the most-promising. We should look into tweaking these models further to see if we can improve on the results.

The Nearest Neighbour model also performs well. We check to see which value of k provides the best results.

In [None]:
# Search k = 1..99 for the KNN classifier, keeping the k with the highest
# mean cross-validation accuracy.
best_k = 0
best_k_accuracy = 0

for k in range(1,100):
    k_folds = 10
    scores = cross_val_score(KNeighborsClassifier(k), train_input_vectors, train_labels, cv=k_folds)
    average_accuracy = np.mean(scores)
    
    # Strict '>' means that, on a tie, the smallest k seen so far is kept.
    if average_accuracy > best_k_accuracy:
        best_k_accuracy = average_accuracy
        best_k = k

    print("KNN (k=%d) average accuracy (%d-fold x-val): %f" 
          % (k, k_folds, average_accuracy))
    

print("Best k value is %d" % best_k)

The best-performing KNN uses k=6, accuracy = 0.916898. Still not as good as the NB or RBF SVM.

## Ensembles

#### Voting ensemble of the best three classifiers from before:

In [None]:
import sklearn.ensemble

# (name, estimator) pairs for the three models that go into the ensemble.
best_classifiers = [
    ('MultinomialNB', MultinomialNB(alpha=1.0)),
    ('KNN', KNeighborsClassifier(6)),
    ('SVM', SVC(gamma=2, C=1)),
]

# No voting mode is passed, so VotingClassifier uses its default one.
voting_ensemble = sklearn.ensemble.VotingClassifier(best_classifiers)
# Mean accuracy over 10 cross-validation folds of the training split.
scores = cross_val_score(voting_ensemble, train_input_vectors, train_labels, cv=10)
print("Average 10-fold cross-validation accuracy: %f" % np.mean(scores))

Results:

Average 10-fold cross-validation accuracy: 0.9486095170000001

This is slightly worse than MultinomialNB from our previous experiment. We have run these experiments several times and sometimes MultinomialNB is worse (~93% accuracy), but in general it seems that MultinomialNB is about as good as or better than the ensemble.

#### Bagging ensemble of the best classifier:

In [None]:
# Bagging ensemble built on Multinomial Naive Bayes; all other
# BaggingClassifier settings are left at their defaults.
base_classifier = MultinomialNB(alpha=1.0)
bagging_ensemble = sklearn.ensemble.BaggingClassifier(base_classifier)
# Mean accuracy over 10 cross-validation folds of the training split.
scores = cross_val_score(bagging_ensemble, train_input_vectors, train_labels, cv=10)
print("Average 10-fold cross-validation accuracy: %f" % np.mean(scores))

Results:

Average 10-fold cross-validation accuracy: 0.926422

Again this is slightly worse than the standard MultinomialNB.

#### Boosting ensemble of the best classifier:

In [None]:
# AdaBoost ensemble built on Multinomial Naive Bayes; all other
# AdaBoostClassifier settings are left at their defaults.
base_classifier = MultinomialNB(alpha=1.0)
boosting_classifier = sklearn.ensemble.AdaBoostClassifier(base_classifier)
# Mean accuracy over 10 cross-validation folds of the training split.
scores = cross_val_score(boosting_classifier, train_input_vectors, train_labels, cv=10)
print("Average 10-fold cross-validation accuracy: %f" % np.mean(scores))

Results:

Average 10-fold cross-validation accuracy: 0.666899

This is significantly worse than standard MultinomialNB.

### Conclusion
A voting ensemble, bagging ensemble, and boosting ensemble all achieved worse results than a standard MultinomialNB model. For this reason we will not use an ensemble for our final model.

# Data Analysis

Now we turn our attention to exploring and analysing the data that we have cleaned, transformed, tokenized and saved into our data frame. We will look at a number of different features of the data, including:
- Class-Label Analysis
    - Class Frequency and Balance
    - Class Statistics
    - Class Similarity
- Word Clouds for Tokens and Nouns
- Most-Common Words, Bigrams and Trigrams

## Most-Common N-Grams

We explore our corpus by looking at which words and phrases are most common. We break this down by document category in order to get a feel for which words are most indicative of a particular class, and therefore the words that should be considered most important by the classifier. 

In [None]:
def freq_dist_to_df(freq_dist, top_n):
    """
    Convert a frequency distribution to a Data Frame, with columns 'term' and 'count'.

    Args:
        freq_dist: Object with a ``most_common(n)`` method that returns
            ``(term, count)`` pairs, where each term is a tuple of strings
            (an n-gram).
        top_n: Number of most frequent terms to keep.

    Returns:
        A DataFrame with one row per term, sorted by ascending 'count'. The
        'term' column holds the n-gram's words joined by single spaces. An
        empty distribution yields an empty frame that still has both columns.
    """
    most_common = freq_dist.most_common(top_n)

    # Both columns are built explicitly so that they exist even when there are
    # no terms; otherwise sorting an empty frame by 'count' raises a KeyError.
    # Term labels are changed from (x, y, z) to "x y z" while building.
    most_common_df = pd.DataFrame({
        'term': [" ".join(term) for term, _ in most_common],
        'count': [count for _, count in most_common],
    })
    most_common_df.sort_values(by='count', ascending=True, inplace=True)

    return most_common_df


def plot_frequency_df(df, title, xlabel="Frequency", ylabel="", fileName=None):
    """
    Draw a horizontal bar chart from a dataframe with columns 'term' and
    'count': one bar per row, labelled with the term and sized by the count.

    When ``fileName`` is given, the chart is also written to the "images/"
    directory under that name before it is shown.
    """
    _, ax = plt.subplots()
    df.plot.barh(x='term', y='count', ax=ax)

    ax.set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.tight_layout()

    # The figure has to be written before plt.show() is called,
    # otherwise the image saved will be blank.
    save_requested = fileName is not None
    if save_requested:
        plt.savefig("images/" + fileName)

    plt.show()

In [None]:
def plot_ngrams(ngrams_data, n=1, file_name=None, top_n=5):
    """
    Plot the frequency of the most common n-grams in a Series, for a given n.

    Args:
        ngrams_data: Iterable of token lists, one list per document.
        n: Size of the n-grams (1 = words, 2 = bigrams, 3 = trigrams, ...).
        file_name: Optional image file name, passed on to plot_frequency_df.
        top_n: Number of most common n-grams to plot.
    """
    # Build the n-grams one document at a time, so that no n-gram is made of
    # the last tokens of one document and the first tokens of the next.
    ngrams = (
        ngram
        for document_tokens in ngrams_data
        for ngram in nltk.ngrams(document_tokens, n)
    )

    # Get the frequency of each term in the corpus and create a DataFrame from it.
    ngram_frequency = nltk.FreqDist(ngrams)
    ngrams_df = freq_dist_to_df(ngram_frequency, top_n)

    # Human-readable name for the chart; sizes without a special name fall
    # back to "<n>-gram".
    ngram_names = {1: "Word", 2: "Bigram", 3: "Trigram"}
    ngram_name = ngram_names.get(n, "%d-gram" % n)

    # The title reports the number of terms actually requested.
    chart_title = "Top-%d Most-Common %s for Entire Corpus" % (top_n, ngram_name)
    plot_frequency_df(ngrams_df, title=chart_title,
                      xlabel="Frequency", ylabel=ngram_name, fileName=file_name)

In [None]:
# Most common single words across all documents; the chart is also saved as an image.
plot_ngrams(df_handle.tokens, n=1, file_name="most-common-unigram.png")

In [None]:
# Most common two-word sequences across all documents; the chart is also saved as an image.
plot_ngrams(df_handle.tokens, n=2, file_name="most-common-bigram.png")

In [None]:
# Most common three-word sequences across all documents; the chart is also saved as an image.
plot_ngrams(df_handle.tokens, n=3, file_name="most-common-trigram.png")

![Most Common Unigrams](images/most-common-unigram.png)

The most-common words for the entire corpus don't really give us a huge amount of insight about the data. The most-common word is "year" but this could be used in any of the contexts of the document categories. The same applies to the words "make", "people" and "time". However, one word that does stand out is "game", which we would primarily associate with sport. It is possible, though, that this word is applicable to other categories too. We will investigate the most-common words per category later to see if the game-sport association is supported.

![Most Common Bigrams](images/most-common-bigram.png)
![Most Common Trigrams](images/most-common-trigram.png)

We get more information from analysing the bigrams and trigrams than the single words (also known as *unigrams*). For example, we see that the top bigram is "tell bbc" and the three most common trigrams are "tell bbc news", "bbc news website" and "bbc news radio". This would seem to suggest that the source of our news articles is BBC News. In fact, this is also supported by the reference to former-British Prime Minister, Tony Blair, and former-British Conservative Party Leader, Michael Howard, which are referenced by the fifth most common bigram and fourth most common trigram, respectively. 

Furthermore, we see that there are two bigrams that stand out for their association with particular document categories. "Prime Minister" is definitely associated with politics and "Chief Executive" is similarly associated with the business topic. We will investigate the importance of these n-grams later when discussing the important terms and phrases for classifying each topic. 

The fifth most common trigram is "million dollar baby". This could be a reference to the boxing film of the same name that won multiple Oscars in 2005. This trigram could definitely be associated with the entertainment category. However, seeing as it's a film that is related to sports, it could also be associated in some way to the sports category. We will investigate the overlap between categories and their similarities later on in our data analysis.

## Class-Label Analysis

Let's look at the labels that we have been provided with for each of the documents in our corpus. There are five possible classes that the label can be: sport, entertainment, politics, technology and business. We also refer to our class labels as the *target label* of each document.

### Class Frequency and Balance
To start with, we look at how many documents of each class appear in our dataset.

In [None]:
# Count how many documents carry each category label.
class_distribution = df_handle.groupby(df_handle.category).category.count()
# Display the counts as the cell output.
class_distribution

We normalise these absolute numbers to get a breakdown of the percentage frequency of each target class. This makes it easy to compare the figures.

In [None]:
# Convert the absolute counts into percentages of the whole corpus.
class_distribution_normalised = (class_distribution / sum(class_distribution)) * 100
# Plot a line representing the mean frequency of all classes.
# This is the frequency each class would have if uniformly distributed.
mean_freq = 100 / len(class_distribution)

fig, axis = plt.subplots()
axis.axhline(y=mean_freq, color='orange')
# One bar per class; rot=0 keeps the class names horizontal.
class_distribution_normalised.plot.bar(ax=axis, rot=0)
axis.set_xlabel("Target Class (Document Category)")
axis.set_ylabel("Frequency (% of Corpus)")
axis.set_title("Target Class Frequencies Across Corpus")

# Save before plt.show() so that the written image is not blank.
plt.savefig('images/target-class-frequencies.png')
plt.show()

![Target Class Frequencies Image](images/target-class-frequencies.png)

We see that we have a slightly unbalanced target class distribution. This means that not all target labels appear as often as each other. For example, the sport category is most common, whereas the entertainment category is the least common. When building our model, we need to take this into account, to make sure that we are not overly biasing our model based on the distribution of target classes in the training data. In extreme cases, our model could underfit the data and end up predicting the most common label (sport) each time. 

While this would be easy to spot and fix, it is possible that the class distribution will affect the model in more subtle ways. We will investigate and evaluate this later, when we have made our predictions for the test dataset.

### Class Statistics

Now, we look at some meta-statistics about each document. We compare the length of each document as well as the number of unique words in each document. We break this analysis down per target class.

#### Document Length

In [None]:
# Length of each document's raw text, in characters.
df_handle['content_length'] = df_handle.content.apply(len)

# Two histograms side by side, sharing the y axis so that they are comparable.
fig, axis = plt.subplots(1, 2, sharey=True, figsize=(10, 4))

# Plot the 'content_length' column itself. In DataFrame.plot.hist, `by` names
# a column to *group* by, not the column to plot, so passing
# by='content_length' does not select this column.
df_handle['content_length'].plot.hist(bins=5, ax=axis[0])
axis[0].set_xlabel("Document Length")
axis[0].set_title("Document Length Distribution")

# Keep only the documents whose length lies strictly between 1000 and 5000
# characters, to look more closely at where most documents fall.
clipped_content_length_data = df_handle[(df_handle.content_length > 1000) & (df_handle.content_length < 5000)]
clipped_content_length_data['content_length'].plot.hist(bins=5, ax=axis[1])
axis[1].set_xlabel("Document Length")
axis[1].set_title("Document Length Distribution (clipped)")

plt.tight_layout()
# Save before plt.show() so that the written image is not blank.
plt.savefig("images/document-length-distribution.png")
plt.show()

![Document Length Distributions](images/document-length-distribution.png)

From the first histogram on the left, we see that the overwhelming majority of documents have a length between 1000 and 5000 characters. We don't get a huge amount of information from this plot so we replot the data, except that we filter the dataset so that we only include documents whose length is within this interval. This process is known as *clipping*. The clipped data is plotted on the right histogram.

From the second histogram, we get more useful data. We see most of the documents have a length between 1000 and 2500 characters. It is important that we are aware that the distribution of the lengths of each document is not uniform. We also break this down per-target label.

In [None]:
# Mean document length (in characters) for each category.
content_length_per_class = df_handle.groupby(df_handle.category).content_length.mean()

fig, axis = plt.subplots()

# One bar per class; rot=0 keeps the class names horizontal.
content_length_per_class.plot.bar(ax=axis, rot=0)

# We plot the average document length for reference.
mean_doc_length = df_handle['content_length'].mean()
axis.axhline(y=mean_doc_length, color='orange')

axis.set_xlabel("Target Class (Document Category)")
axis.set_ylabel("Mean Document Length")
axis.set_title("Mean Document Length per Target Class")

# Save before plt.show() so that the written image is not blank.
plt.savefig("images/document-length-per-class.png")
plt.show()

![Mean Document Length per Target Class Bar Chart](images/document-length-per-class.png)

Again, we see that we don't have uniform distribution of document length. This is important to recall, since, when we are using methods to vectorise the documents, if we don't take into account the documents' length, then we will be unfairly biasing longer or shorter documents (depending on the vectorising method). 

For example, if we are calculating the frequency of a term, comparing this across documents that have different lengths is unfair. A word that appears 10 times in a 500 character document is far more prominent than a word that appears 20 times in a 5000 character document. So we need to make sure that we normalise our frequency calculation (e.g. during TFIDF vectorisation) by the length of the document the term appears in. We will return to this point later when we are doing the document vectorisation.

### Class Similarity

TODO (mm): Calculate the overlap/cosine similarity between each of the classes. Mention how if we have classes that are often similar, our classifier model could become confused by them. E.g An incorrect 51-49-0-0-0 is better than a correct 21-20-20-20-19 prediction if the document is solely about a single topic with no overlap with others.

In [None]:
def createAndPlotWordCloud(tokens):
    """
    Build a word cloud from a list of tokens, display it and return it.

    The returned object is the result of WordCloud.generate, so the caller
    can, for example, save it to a file.
    """
    # The word cloud is generated from one string rather than a list,
    # so the tokens are joined with single spaces first.
    text = " ".join(tokens)

    generator = wc.WordCloud(width=1600, height=800)
    wordcloud = generator.generate(text)

    # Show the cloud on a large black-backed figure with no axes or padding.
    plt.figure(figsize=(20,10), facecolor='k')
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

    return wordcloud


def getAllTokens(df):
    """
    Flatten the 'tokens' column of a dataframe into one list that holds
    every token of every row, in row order.
    """
    flattened = []
    for document_tokens in df['tokens']:
        flattened.extend(document_tokens)
    return flattened

## Word Clouds

Create the wordcloud image and show it. We also save it to a file called *corpus_wordcloud.png*.

In [None]:
# Get all the tokens from our data.
all_tokens_list = getAllTokens(df_handle)

wordcloud = createAndPlotWordCloud(all_tokens_list)
wordcloud.to_file("images/corpus_wordcloud.png")

# Delete this since it's a variable that will be used later.
del all_tokens_list

TODO(mm): Analyse.

![Wordcloud for Tokens of the Entire Corpus](images/corpus_wordcloud.png)