# Load Data

In [None]:
"""Notebook for the classification of newsgroups into different categories"""


#import relevant packages
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

In [None]:
# load data: restrict the corpus to four newsgroups
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# fetch the dataset, removing metadata (headers, footers, quoted replies)
# that is useless for text analysis
dataset = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=('headers', 'footers', 'quotes'),
)

# x is the data (one entry per post), y holds the class labels
x, y = dataset.data, dataset.target
len(x), len(y)

In [None]:
# wrap the posts in a single-column frame and display the first post
df_text = pd.DataFrame(x)
df_text.iat[0, 0]

In [None]:
# one class label per post, followed by the number of posts in each class
df_topic = pd.DataFrame({'class_number': y})
df_topic['class_number'].value_counts()

The x dataset has been converted to the data frame df_text; each value (sample) is a single post from a single user.

The y (df_topic) dataset contains the topic label for each post in the x dataset. The labels are represented by numbers. Also note that the data seems relatively balanced, which makes classification less complicated.

# Preprocessing

There are many potential steps that can be taken to clean text data. In this notebook, I will do the following:

1. Remove unnecessary characters and text sections
    - characters such as '\n', '@', '/', '\', '<', '>', '#', all digits, etc. do not have any bearing on the subject and can create noise when identifying words
    - certain sections, like usernames, email addresses, and subject lines, are also not really relevant to the subject - most of these were removed in the cell above in which I retrieved the data
    - remove punctuation like '.', ';', ':' and ',' as these create noise in identifying words. Apostrophes (') are included in written words in English so they are worth keeping for now; however, for the future it may be worth examining whether it helps to split contractions into their component words

2. Tokenize text into separate words
    - this involves removing stop words including but not limited to: 'the', 'and', pronouns like 'it', 'he' and 'she' and prepositions. These words add no information about the topic of the document
    - it is also important to remove capital letters to reduce noise

3. Leverage word lemmatization or stemming to remove prefixes, suffixes and generally reduce words to their dictionary form
    - lemmatization also requires part-of-speech tagging; stemming is simpler and faster but less accurate. I will use stemming for now.


In [None]:
# using nltk library for quick text preprocessing
import string
from nltk.corpus import stopwords as sw
from nltk import wordpunct_tokenize as wt
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
# packages below are used for lemmazation
# from nltk.corpus import wordnet as wn
# from nltk import WordNetLemmatizer as lem
# from nltk import pos_tag as pos

In [None]:
# constants
# Header-style words passed to clean(); the filtering logic that would use
# them is currently commented out there.
FORMAT_WORDS = ['From', 'Subject', 'Lines', 'Organization', 'Date', 'writes']
# Lower-case pronouns that clean() drops in addition to the stop words.
PRONOUNS = ['i', 'he', 'she', 'it', 'they', 'you']
# NLTK's English stop-word list, stored as a set for fast membership tests.
STOPWORDS = set(sw.words('english'))


    
    
# create function to process each document
def clean(text, format_words, stop_words, pronouns, bank, doc):
    """Tokenize, filter and stem one document, updating ``bank`` and ``doc`` in place.

    Each line of ``text`` is tokenized with ``wordpunct_tokenize``; every token
    is stripped of leading/trailing underscores and lower-cased. A token is
    dropped when it is a stop word or pronoun, is a single character, consists
    only of punctuation (this includes tokens left empty by the strip), or
    contains an ASCII digit. Surviving tokens are stemmed with the English
    Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw document text.
    format_words : list of str
        Currently unused; kept in the signature so existing callers still work.
    stop_words : container of str
        Lower-case tokens to drop.
    pronouns : container of str
        Additional lower-case tokens to drop.
    bank : dict
        Maps stem -> running count; incremented in place for every kept token.
    doc : list
        Receives the stems of this document, in order of appearance.

    Returns
    -------
    None
        Results are delivered through ``bank`` and ``doc``.
    """
    # Built once per call instead of once per token (or once per character).
    stemmer = SnowballStemmer('english')
    punctuation = set(string.punctuation)
    digits = set(string.digits)

    # split into lines to remove newline characters
    for line in text.split('\n'):
        for token in wt(line):
            word = token.strip('_').lower()
            # drop stop words, pronouns and stray single letters
            # (the latter could be middle initials)
            if word in stop_words or word in pronouns or len(word) == 1:
                continue
            # drop pure punctuation; all() is True for '' so emptied tokens go too
            if all(char in punctuation for char in word):
                continue
            # drop anything containing a digit
            if any(char in digits for char in word):
                continue
            stem = stemmer.stem(word)
            bank[stem] = bank.get(stem, 0) + 1
            doc.append(stem)
                
def create_dicts(dataset):
    """Clean every document and build the corpus-wide word bank.

    Parameters
    ----------
    dataset : iterable of str
        Raw documents; each one is passed through ``clean``.

    Returns
    -------
    wordbank : dict
        Maps stem -> total number of occurrences over all documents.
    doc_dict : dict
        Maps document position -> cleaned document, i.e. its stems joined by
        single spaces.
    """
    wordbank = {}
    doc_dict = {}
    for index, text in enumerate(dataset):
        stems = []
        clean(text, FORMAT_WORDS, STOPWORDS, PRONOUNS, wordbank, stems)
        doc_dict[index] = " ".join(stems)
    return wordbank, doc_dict

# below function used for lemmazation
#def convert_pos(tag):
#     """converts parts of speech from penn tree format to wordnet format"""
#     poskey = {'N': wn.NOUN, 'V': wn.VERB, 'J': wn.ADJ, 'R': wn.ADV}
#     print(poskey.get(tag[0]))
#     return poskey.get(tag[0])

In [None]:
# create the dictionary of word counts among all documents (wordbank)
# together with the cleaned text of each document keyed by its index (doc_dict)
wordbank, doc_dict = create_dicts(x)

In [None]:
# number of distinct stems, followed by the stems in sorted order
print(len(wordbank))
sorted(wordbank)

In [None]:
# rank the (stem, count) pairs from most to least frequent
sort_bank = sorted(wordbank.items(), key=lambda pair: pair[1], reverse=True)
# top 1000 words and counts
sort_bank[:1000]

In [None]:
# create word document matrix
# doc_series holds one cleaned, space-joined document per entry of doc_dict
doc_series = pd.Series(doc_dict)
vect = CountVectorizer()
# fit the vocabulary on the cleaned documents and count term occurrences
matrix = vect.fit_transform(doc_series)

In [None]:
#view sparse matrix
# pd.SparseDataFrame was removed in pandas 1.0 and get_feature_names() in
# scikit-learn 1.2. The sparse accessor builds the same document-term frame
# without densifying it; unstored entries already read as 0, so no fillna(0)
# is needed.
df_matrix = pd.DataFrame.sparse.from_spmatrix(matrix, columns=vect.get_feature_names_out())
df_matrix

In [None]:
# summary statistics for every column of the document-term frame
df_matrix.describe()

# Dimensionality Reduction and Clustering

In [None]:
# Using dimensionality reduction with LSA (svd)
# this requires the use of a tfidf vectorizer rather than the count vectorizer, 
# which accounts for cosine similarity and normalizes the data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans

In [None]:
#Implement algorithm, use dimensionality reduction to reduce to 10s or 100s of dimensions
CLUSTERS = np.unique(y).shape[0]  # one cluster per distinct class label
COMPONENTS = 80  # number of SVD components to keep

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(doc_series)
# TruncatedSVD uses a randomized solver by default, so it is seeded like the
# other estimators in this notebook to make the reduced matrix reproducible
svd = TruncatedSVD(n_components=COMPONENTS, random_state=42)
#output of svd needs to be normalized to improve k-means performance
normal = Normalizer(copy=False)
lsa = make_pipeline(svd, normal)
matrix_reduced = lsa.fit_transform(tfidf_matrix)
matrix_reduced

In [None]:
# cluster the reduced documents; fit() returns the estimator itself
cluster = KMeans(n_clusters=CLUSTERS, random_state=42).fit(matrix_reduced)
cluster.labels_

In [None]:
#evaluate performance
#we should avoid using accuracy as the cluster labels may not correspond exactly to the class labels, instead use the following measures:
#homogeneity examines whether all members of the same cluster have the same class
#completeness measures whether all members of the same class are in the same cluster
#v-measure is an entropy-based measure designed to evaluate whether both homogeneity and completeness have been achieved
#these measures should evaluate whether the clustering fit the data well
from sklearn import metrics

homogeniety, completeness, v_measure = metrics.homogeneity_completeness_v_measure(y, cluster.labels_)
print('homogeneity', homogeniety)
# label previously printed as the misspelled 'completness'
print('completeness', completeness)
print('v_measure', v_measure)

# Classification

1. To generate features, this example uses TF-IDF, which creates a matrix of documents and word tokens and assigns weights based on both the term frequency, which is used to determine how common a word is in a specific document, and the inverse document frequency, which is used to determine how common a word is amongst all documents. The use of both these elements in determining the value of each feature removes noise from commonly used words and from increased word counts in longer documents. This makes the data more relevant to the topic. It is also important to note that there is no dimensionality reduction done here in order to keep as much information as possible.

2.  Given that this is a decent-sized dataset with a lot of features, the Stochastic Gradient Descent Classifier was chosen due to its ability to deal with lots of data quickly. Hyperparameter optimization was done with a package to make it go more quickly.

In [None]:

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold, cross_validate
%matplotlib inline

TEST = 500/matrix_reduced.shape[0]  # test fraction chosen to hold out about 500 posts
SPLITS = 10  # number of cross-validation folds used during tuning


tfidf2 = TfidfVectorizer(stop_words='english')
#initialize estimator
sgd = SGDClassifier(random_state=42)
#separate test set BEFORE vectorizing: the vocabulary and IDF weights are then
#learned from the training documents only, so nothing leaks from the test set
docs_tv, docs_test, y_tv, y_test = train_test_split(doc_series, y, test_size=TEST, random_state=42)
X_tv = tfidf2.fit_transform(docs_tv)
X_test = tfidf2.transform(docs_test)

sgd.fit(X_tv, y_tv)
y_hat = sgd.predict(X_test)
print('Test F1:', metrics.f1_score(y_test, y_hat, average='macro'))
print('Test Accuracy:', metrics.accuracy_score(y_test, y_hat))

In [None]:
import hyperopt as hp
#specify boundaries for hyperparameter optimization
L1L = 0.0
L1U = 1.0
# loguniform draws exp(uniform(low, high)), so alpha ranges over [1e-9, 1e-4]
ALPHAL =  -9 * np.log(10)
ALPHAU =  -4 * np.log(10)
# randint(..., 12) yields 0..11, so the iteration budget is one of 20, 25, ..., 75
NITER = 20 + 5 * hp.hp.randint('clf__n_iter', 12)
EVALUATION = 10  # number of hyperopt evaluations

hyp_dict = {}
# NOTE(review): per the scikit-learn docs l1_ratio is only used when
# penalty='elasticnet'; sgd is built with the default penalty, so confirm
# that tuning this dimension is intended
hyp_dict['l1_ratio'] = hp.hp.uniform('clf__l1_ratio', L1L, L1U)
hyp_dict['alpha'] = hp.hp.loguniform('clf__alpha', ALPHAL, ALPHAU)
# SGDClassifier's n_iter was deprecated in scikit-learn 0.19 and removed in
# 0.21; max_iter is its replacement (the hyperopt label above is only an id)
hyp_dict['max_iter'] = NITER

#create variable for use with multiclass analysis
score_var = ['f1_macro']

def hyper(parameters):
    """Hyperopt objective: 1 minus the mean cross-validated macro-F1 of ``sgd``.

    Parameters
    ----------
    parameters : dict
        Keyword arguments applied to the notebook-level ``sgd`` estimator via
        ``set_params`` before it is cross-validated on ``X_tv`` / ``y_tv``.

    Returns
    -------
    float
        Loss to minimise (lower means a higher mean macro-F1 over the folds).
    """
    sgd.set_params(**parameters)
    # Fixed seed: every candidate is scored on the same folds, so differences in
    # loss come from the hyperparameters rather than from the fold shuffle.
    kfold = KFold(n_splits=SPLITS, shuffle=True, random_state=42)
    score = cross_validate(sgd, X_tv, y_tv, scoring=score_var, cv=kfold)
    return 1 - score['test_f1_macro'].mean()

# run the TPE search, then translate the best point back into parameter values
opt = hp.fmin(hyper, hyp_dict, algo=hp.tpe.suggest, max_evals=EVALUATION)
params = hp.space_eval(hyp_dict, opt)

# refit on the training portion with the tuned parameters and score the held-out set
sgd.set_params(**params)
sgd.fit(X_tv, y_tv)
y_hat = sgd.predict(X_test)
print('Test F1:', metrics.f1_score(y_test, y_hat, average='macro'))
print('Test Accuracy:', metrics.accuracy_score(y_test, y_hat))