# COMP30810 Intro to Text Analytics 2018
# Homework 2

In [None]:
import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

# Pre-Processing:

### Load data into Dataframe

In [None]:
# Load the training articles. The file is '^'-separated and its first row
# (header=0) supplies the column names.
raw_trainset = pd.read_csv('trainingset.csv',sep='^',header=0)
# Bare expression: shows the frame when this is run as a notebook cell.
raw_trainset

# Tokenization
### Extract Tokens from Raw Text

In [None]:
def extract_tokens(rawtext):
    """Split raw text into a list of word tokens.

    The text is tokenized with NLTK's ``RegexpTokenizer`` using the regular
    expression below; only the substrings matching that pattern are returned,
    in the order they occur in ``rawtext``.
    """
    return RegexpTokenizer(r'\w+').tokenize(rawtext)

### Remove Stop Words

In [None]:
# NLTK's built-in English stop-word list.
stopwords_nltk_en = set(stopwords.words('english'))

# Final stop-word set: the NLTK list combined with an additional hand-picked
# list. The literal is wrapped over several lines so that no string is broken
# across a physical line (a split literal is a syntax error).
STOP_WORDS = stopwords_nltk_en.union({
    "a", "a's", "able", "about", "above", "according", "accordingly", "across",
    "actually", "after", "afterwards", "again", "against", "ain't", "all",
    "allow", "allows", "almost", "alone", "along", "already", "also",
    "although", "always", "am", "among", "amongst", "an", "and", "another",
    "any", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways",
    "anywhere", "apart", "appear", "appreciate", "appropriate", "are",
    "aren't", "around", "as", "aside", "ask", "asking", "associated", "at",
    "available", "away", "awfully", "b", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "believe", "below", "beside", "besides", "best", "better", "between",
    "beyond", "both", "brief", "but", "by", "c", "c'mon", "c's", "came",
    "can", "can't", "cannot", "cant", "cause", "causes", "certain",
    "certainly", "changes", "clearly", "co", "com", "come", "comes",
    "concerning", "consequently", "consider", "considering", "contain",
    "containing", "contains", "corresponding", "could", "couldn't", "course",
    "currently", "d", "definitely", "described", "despite", "did", "didn't",
    "different", "do", "does", "doesn't", "doing", "don't", "done", "down",
    "downwards", "during", "e", "each", "edu", "eg", "eight", "either",
    "else", "elsewhere", "enough", "entirely", "especially", "et", "etc",
    "even", "ever", "every", "everybody", "everyone", "everything",
    "everywhere", "ex", "exactly", "example", "except", "f", "far", "few",
    "fifth", "first", "five", "followed", "following", "follows", "for",
    "former", "formerly", "forth", "four", "from", "further", "furthermore",
    "g", "get", "gets", "getting", "given", "gives", "go", "goes", "going",
    "gone", "got", "gotten", "greetings", "h", "had", "hadn't", "happens",
    "hardly", "has", "hasn't", "have", "haven't", "having", "he", "he's",
    "hello", "help", "hence", "her", "here", "here's", "hereafter", "hereby",
    "herein", "hereupon", "hers", "herself", "hi", "him", "himself", "his",
    "hither", "hopefully", "how", "howbeit", "however", "i", "i'd", "i'll",
    "i'm", "i've", "ie", "if", "ignored", "immediate", "in", "inasmuch",
    "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar",
    "instead", "into", "inward", "is", "isn't", "it", "it'd", "it'll", "it's",
    "its", "itself", "j", "just", "k", "keep", "keeps", "kept", "know",
    "known", "knows", "l", "last", "lately", "later", "latter", "latterly",
    "least", "less", "lest", "let", "let's", "like", "liked", "likely",
    "little", "look", "looking", "looks", "ltd", "m", "mainly", "many", "may",
    "maybe", "me", "mean", "meanwhile", "merely", "might", "more", "moreover",
    "most", "mostly", "much", "must", "my", "myself", "n", "name", "namely",
    "nd", "near", "nearly", "necessary", "need", "needs", "neither", "never",
    "nevertheless", "new", "next", "nine", "no", "nobody", "non", "none",
    "noone", "nor", "normally", "not", "nothing", "novel", "now", "nowhere",
    "o", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "on",
    "once", "one", "ones", "only", "onto", "or", "other", "others",
    "otherwise", "ought", "our", "ours", "ourselves", "out", "outside",
    "over", "overall", "own", "p", "particular", "particularly", "per",
    "perhaps", "placed", "please", "plus", "possible", "presumably",
    "probably", "provides", "q", "que", "quite", "qv", "r", "rather", "rd",
    "re", "really", "reasonably", "regarding", "regardless", "regards",
    "relatively", "respectively", "right", "s", "said", "same", "saw", "say",
    "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed",
    "seeming", "seems", "seen", "self", "selves", "sensible", "sent",
    "serious", "seriously", "seven", "several", "shall", "she", "should",
    "shouldn't", "since", "six", "so", "some", "somebody", "somehow",
    "someone", "something", "sometime", "sometimes", "somewhat", "somewhere",
    "soon", "sorry", "specified", "specify", "specifying", "still", "sub",
    "such", "sup", "sure", "t", "t's", "take", "taken", "tell", "tends", "th",
    "than", "thank", "thanks", "thanx", "that", "that's", "thats", "the",
    "their", "theirs", "them", "themselves", "then", "thence", "there",
    "there's", "thereafter", "thereby", "therefore", "therein", "theres",
    "thereupon", "these", "they", "they'd", "they'll", "they're", "they've",
    "think", "third", "this", "thorough", "thoroughly", "those", "though",
    "three", "through", "throughout", "thru", "thus", "to", "together", "too",
    "took", "toward", "towards", "tried", "tries", "truly", "try", "trying",
    "twice", "two", "u", "un", "under", "unfortunately", "unless", "unlikely",
    "until", "unto", "up", "upon", "us", "use", "used", "useful", "uses",
    "using", "usually", "uucp", "v", "value", "various", "very", "via", "viz",
    "vs", "w", "want", "wants", "was", "wasn't", "way", "we", "we'd", "we'll",
    "we're", "we've", "welcome", "well", "went", "were", "weren't", "what",
    "what's", "whatever", "when", "whence", "whenever", "where", "where's",
    "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever",
    "whether", "which", "while", "whither", "who", "who's", "whoever",
    "whole", "whom", "whose", "why", "will", "willing", "wish", "with",
    "within", "without", "won't", "wonder", "would", "wouldn't", "x", "y",
    "yes", "yet", "you", "you'd", "you'll", "you're", "you've", "your",
    "yours", "yourself", "yourselves", "z", "zero",
})

def remove_stopwords(token_words):
    """Return the tokens that are not stop words.

    A token is dropped when its lower-cased form is a member of the
    module-level ``STOP_WORDS`` set. Kept tokens are returned unchanged and in
    their original order.
    """
    kept_tokens = []
    for token in token_words:
        if token.lower() in STOP_WORDS:
            continue
        kept_tokens.append(token)
    return kept_tokens

### Remove Capitalization

In [None]:
 def decapitalize(rm_stop_words_tokens):
     """Return a new list holding the lower-cased form of every token."""
     # NOTE(review): the `def` line of this cell starts with one stray space.
     # That appears to be tolerated when run as a notebook cell, but a plain
     # .py module would reject it with an IndentationError - consider
     # dedenting the whole function.
     lowered_tokens = []
     for token in rm_stop_words_tokens:
         lowered_tokens.append(token.lower())
     return lowered_tokens

### Remove Salutation

In [None]:
def remove_salutation(rmcap_token_words):
    """Return the tokens with titles/salutations removed.

    A token is dropped when its lower-cased form appears in the local
    ``salutation`` list; every other token is kept as-is, in order.
    """
    salutation = ['mr','mrs','mss','dr','phd','prof','rev', 'professor']

    remaining_tokens = []
    for token in rmcap_token_words:
        if token.lower() in salutation:
            continue
        remaining_tokens.append(token)
    return remaining_tokens

### Remove Numbers

In [None]:
def remove_numbers(rmsalu_token_words):
    """Return the tokens that are not made up purely of digits.

    Tokens for which ``str.isdigit()`` is true are dropped; mixed tokens such
    as ``'b3'`` are kept.
    """
    non_numeric_tokens = []
    for token in rmsalu_token_words:
        if token.isdigit():
            continue
        non_numeric_tokens.append(token)
    return non_numeric_tokens
         

### Lemmatization

In [None]:
def transfer_tag(treebank_tag):
    """Map a part-of-speech tag to the single-letter POS code for lemmatizing.

    Only the first character of ``treebank_tag`` is inspected, and it is
    compared case-insensitively:

    * ``j`` -> ``'a'`` (adjective)
    * ``v`` -> ``'v'`` (verb)
    * ``n`` -> ``'n'`` (noun)
    * ``r`` -> ``'r'`` (adverb)

    Any other tag, including the empty string, falls back to ``'n'`` because
    noun is the lemmatizer's default part of speech.

    The previous implementation wrote ``startswith('j' or 'J')``; that
    expression evaluates to ``startswith('j')``, so upper-case tags such as
    ``'JJ'`` or ``'VBD'`` were always treated as nouns.
    """
    pos_by_initial = {'j': 'a', 'v': 'v', 'n': 'n', 'r': 'r'}
    # Slicing (rather than indexing) keeps the empty tag from raising.
    initial = treebank_tag[:1].lower()
    return pos_by_initial.get(initial, 'n')

In [None]:
def lemmatize(rm_num_token_words, verbose=False):
    """Return the WordNet lemma of every token.

    The tokens are POS-tagged with ``nltk.pos_tag``; the lower-cased first
    letter of each tag is converted by ``transfer_tag`` into the POS code that
    is handed to ``WordNetLemmatizer.lemmatize`` together with the token.

    When ``verbose`` is true, the first ten lemmas are printed.
    """
    lemmatizer = WordNetLemmatizer()
    tagged_tokens = nltk.pos_tag(rm_num_token_words)

    lemma_words = [
        lemmatizer.lemmatize(token, transfer_tag(pos_tag_name[0].lower()))
        for token, pos_tag_name in tagged_tokens
    ]

    if verbose:
        print('Lemmas : ' + str(lemma_words[:10]))

    return lemma_words

In [None]:
def tokenize(rawtext):
    """Run the full pre-processing pipeline on one raw document.

    Stages, in order: token extraction, lower-casing, stop-word removal,
    salutation removal, number removal, lemmatization. Returns the list of
    lemmas produced by the last stage.
    """
    tokens = extract_tokens(rawtext)
    tokens = decapitalize(tokens)
    tokens = remove_stopwords(tokens)
    tokens = remove_salutation(tokens)
    tokens = remove_numbers(tokens)
    return lemmatize(tokens)
    
    
    

In [None]:
# Build the working frame: a copy of the raw data plus a 'tokens' column that
# holds the pre-processed token list of each article's 'content'.
df_handle = raw_trainset.assign(
    tokens=lambda frame: frame['content'].apply(tokenize)
)

In [None]:
# Preview the first ten rows, including the new 'tokens' column.
df_handle.head(10)

# TF-IDF

In [None]:
# Represent every article as a TF-IDF vector so that a term's weight reflects
# how characteristic it is of that article relative to the whole collection.
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer is fed strings, so each token list is re-joined with spaces.
document_token_strings = list(map(' '.join, df_handle.tokens))

tfidf_vectorizer = TfidfVectorizer(norm='l1')
tfidf_matrix = tfidf_vectorizer.fit_transform(document_token_strings).toarray()

# Store one dense row vector per article.
df_handle['tfidf'] = [document_vector for document_vector in tfidf_matrix]

In [None]:
# Sanity check: show the Python type stored in the 'tfidf' cell of the first row.
type(df_handle.head().iloc[0].tfidf)

In [None]:
# Keep only these four columns, in this order; any other column is dropped.
df_handle = df_handle[['content', 'tokens', 'tfidf', 'category']]

In [None]:
# Bare expression: shows the reduced frame when run as a notebook cell.
df_handle

In [None]:
# Persist the processed frame as UTF-8 CSV without the index column.
# NOTE(review): the 'tokens' and 'tfidf' columns hold Python lists / NumPy
# arrays, so they are presumably written as their string form; NumPy
# abbreviates long arrays with '...' when converting to text, in which case the
# vectors could not be recovered from this file - confirm before relying on it.
df_handle.to_csv('./tfidf_data.csv', encoding='utf-8', index=False)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing. No random_state is passed, so the
# split (and every number derived from it) differs from run to run.
train, test = train_test_split(df_handle, test_size=0.33)

In [None]:
# Report how many rows ended up in each split.
for split_label, split_frame in (('Train: ', train), ('Test: ', test)):
    print(split_label + repr(len(split_frame)))

In [None]:
def euclideanDistance(value1, value2):
    """Return the Euclidean (L2) distance between two vectors.

    Both arguments may be NumPy arrays or any array-like (list, tuple) of the
    same shape; they are converted with ``np.asarray`` before subtracting, so
    plain Python sequences work as well as arrays. Arrays are passed through
    without copying.
    """
    difference = np.asarray(value1) - np.asarray(value2)
    return np.linalg.norm(difference)

In [None]:
def get_nearest_neighbours(vector):
    """Return ``[category, distance]`` for every row of the global ``train``.

    ``distance`` is the Euclidean distance between the row's 'tfidf' vector
    and ``vector``. The result is in ``train`` row order - it is NOT sorted and
    NOT truncated, so the caller has to pick the nearest entries itself.
    """
    return [
        [train_row.category, euclideanDistance(train_row['tfidf'], vector)]
        for _, train_row in train.iterrows()
    ]

In [None]:
# Inspect the first row of the test split.
test.iloc[0]

In [None]:
from collections import defaultdict
from operator import itemgetter

# Hand-rolled k-nearest-neighbour evaluation on the held-out test split.
tp = 0           # number of test articles whose category was predicted correctly
sample_size = 5  # not used by this cell; kept in case another cell reads it
k = 10           # number of nearest training articles that take part in the vote

for i in range(len(test)):
    test_row = test.iloc[i]
    nearest_neighbours = get_nearest_neighbours(test_row['tfidf'])

    # Ascending distance: the closest training articles come first.
    # (Sorting in descending order would make the vote use the k farthest.)
    sort_NN = sorted(nearest_neighbours, key=itemgetter(1))

    # Tally one vote per neighbour category. Slicing instead of indexing keeps
    # this safe when there are fewer than k training rows.
    votes = defaultdict(int)
    for neighbour_category, _distance in sort_NN[:k]:
        votes[neighbour_category] += 1

    # Category with the most votes; on a tie the first one tallied wins.
    final_vote = max(votes.items(), key=itemgetter(1))[0]

    # Compare with the label of the article being classified (row i).
    tp += int(final_vote == test_row['category'])

# Fraction of test articles that were classified correctly.
accuracy = tp / len(test)

In [None]:
# Show the overall accuracy, then the category voted for the last test row
# (final_vote is overwritten on every iteration of the evaluation loop).
print(accuracy, final_vote, sep='\n')

## Model and Cross-Validation Using Sklearn

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Start again from the raw articles. This rebinds the global `train` / `test`
# used by the cells above.
train, test = train_test_split(raw_trainset)
train_labels = train.category

# Vectorize the raw article text directly with scikit-learn's TF-IDF.
# NOTE(review): the vectorizer is fitted on all of `train` before the folds
# are made, so every fold's features are built from statistics of the full
# training split - confirm this is acceptable for the reported scores.
tfidf_transformer = TfidfVectorizer()
train_input_vectors = tfidf_transformer.fit_transform(train.content)

# 10-fold cross-validation of a default Multinomial Naive Bayes model.
scores = cross_val_score(MultinomialNB(), train_input_vectors, train_labels, cv=10)
print(scores)
print(scores.mean())

In [None]:
# Fit the Naive Bayes model on the TF-IDF vectors and labels of the training
# split built in the cross-validation cell. The names previously used here
# (X_train_tfidf, y_train) are not defined anywhere in this notebook.
model = MultinomialNB(alpha=1.0)
model.fit(train_input_vectors, train_labels)

In [None]:
# Show the first sentence and the model's prediction for up to ten training
# articles. Rows are addressed by position (.iloc / matrix row i) because the
# split is shuffled, so index label i need not exist in `train`.
for i in range(min(10, len(train))):
    print("Article %s:" % i)
    print(train.content.iloc[i].split('.')[0])
    print("Model prediction: %s" % model.predict(train_input_vectors[i])[0])
    print()

## More Sklearn Models

Try out more models from Sklearn and report their accuracy.

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Candidate models as (display name, estimator) pairs.
candidate_models = [
    ("Multinomial Naive Bayes", MultinomialNB(alpha=1.0)),
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    # Disabled candidates:
    #   "Naive Bayes"      -> GaussianNB()                             # needs a 'dense matrix'?
    #   "Neural Net"       -> MLPClassifier(alpha=1)                   # took too long to run
    #   "Gaussian Process" -> GaussianProcessClassifier(1.0 * RBF(1.0))  # needs a 'dense matrix'?
]

# Parallel lists, kept under their usual names.
names = [model_name for model_name, _ in candidate_models]
classifiers = [estimator for _, estimator in candidate_models]

# Cross-validate every candidate on the same TF-IDF training vectors and
# print its mean accuracy over the folds.
k_folds = 10
for name, classifier in candidate_models:
    scores = cross_val_score(classifier, train_input_vectors, train_labels, cv=k_folds)
    average_accuracy = np.mean(scores)

    print("%s average accuracy (%d-fold x-val): %f"
          % (name, k_folds, average_accuracy))

Results:
```
Multinomial Naive Bayes average accuracy (10-fold x-val): 0.950307
Nearest Neighbors average accuracy (10-fold x-val): 0.892863
AdaBoost average accuracy (10-fold x-val): 0.712065
Linear SVM average accuracy (10-fold x-val): 0.224502
RBF SVM average accuracy (10-fold x-val): 0.944214
Decision Tree average accuracy (10-fold x-val): 0.653080
Random Forest average accuracy (10-fold x-val): 0.323166
```

The Naive Bayes and RBF SVM are by far the most-promising. We should look into tweaking these models further to see if we can improve on the results.

The Nearest Neighbour model also performs well. We check to see which value of k provides the best results.

In [None]:
# Search k = 1..99 for the KNN classifier with the highest mean
# cross-validation accuracy. On ties the smallest k is kept, because only a
# strictly better score replaces the current best.
best_k, best_k_accuracy = 0, 0
k_folds = 10

for k in range(1, 100):
    knn_candidate = KNeighborsClassifier(k)
    scores = cross_val_score(knn_candidate, train_input_vectors, train_labels, cv=k_folds)
    average_accuracy = np.mean(scores)

    print("KNN (k=%d) average accuracy (%d-fold x-val): %f"
          % (k, k_folds, average_accuracy))

    if average_accuracy > best_k_accuracy:
        best_k, best_k_accuracy = k, average_accuracy

print("Best k value is %d" % best_k)

The best-performing KNN uses k=6, accuracy = 0.916898. Still not as good as the NB or RBF SVM.

## Ensembles

Voting ensemble of the best three classifiers from before:

In [None]:
import sklearn.ensemble

# The three best individual models, as (name, estimator) pairs for the
# voting ensemble.
best_classifiers = [
    ('MultinomialNB', MultinomialNB(alpha=1.0)),
    ('KNN', KNeighborsClassifier(6)),
    ('SVM', SVC(gamma=2, C=1)),
]

# Combine them with VotingClassifier (default voting settings) and score the
# ensemble with 10-fold cross-validation.
voting_ensemble = sklearn.ensemble.VotingClassifier(estimators=best_classifiers)
scores = cross_val_score(voting_ensemble, train_input_vectors, train_labels, cv=10)
mean_accuracy = np.mean(scores)
print("Average 10-fold cross-validation accuracy: %f" % mean_accuracy)

Results:

Average 10-fold cross-validation accuracy: 0.9486095170000001

This is slightly worse than MultinomialNB from our previous experiment. We have run these experiments several times and sometimes MultinomialNB is worse (~93% accuracy), but in general it seems that MultinomialNB is about as good as or better than the ensemble.

Bagging ensemble of the best classifier:

In [None]:
# Bagging ensemble built on Multinomial Naive Bayes (BaggingClassifier
# defaults otherwise), scored with 10-fold cross-validation.
base_classifier = MultinomialNB(alpha=1.0)
bagging_ensemble = sklearn.ensemble.BaggingClassifier(base_classifier)

scores = cross_val_score(bagging_ensemble, train_input_vectors, train_labels, cv=10)
mean_accuracy = np.mean(scores)
print("Average 10-fold cross-validation accuracy: %f" % mean_accuracy)

Results:

Average 10-fold cross-validation accuracy: 0.926422

Again this is slightly worse than the standard MultinomialNB.

In [None]:
# Boosting ensemble: AdaBoost with Multinomial Naive Bayes as its base
# estimator (AdaBoostClassifier defaults otherwise), scored with 10-fold
# cross-validation.
base_classifier = MultinomialNB(alpha=1.0)
boosting_classifier = sklearn.ensemble.AdaBoostClassifier(base_classifier)

scores = cross_val_score(boosting_classifier, train_input_vectors, train_labels, cv=10)
mean_accuracy = np.mean(scores)
print("Average 10-fold cross-validation accuracy: %f" % mean_accuracy)

Results:

Average 10-fold cross-validation accuracy: 0.666899

This is significantly worse than standard MultinomialNB.

### Conclusion
A voting ensemble, bagging ensemble, and boosting ensemble all achieved worse results than a standard MultinomialNB model. For this reason we will not use an ensemble for our final model.