In [39]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import math
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import defaultdict
from collections import Counter

## PART I

In [40]:
df = pd.read_csv("English_Dataset.csv")

In [41]:
sport_df = df[df['Category'] == "sport"]
business_df = df[df['Category'] == "business"]
politics_df = df[df['Category'] == "politics"]
entertainment_df = df[df['Category'] == "entertainment"]
tech_df = df[df['Category'] == "tech"]

In [42]:
sport_df['Text'] = sport_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() ]))
business_df['Text'] = business_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split()] ))
politics_df['Text'] = politics_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split()] ))
entertainment_df['Text'] = entertainment_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split()] ))
tech_df['Text'] = tech_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() ]))

In [72]:
print("100 Most Commonly Used Words For Each Category","\n")
print("Sport: ",Counter(" ".join(sport_df["Text"]).split()).most_common(100),"\n")
print("Business: ",Counter(" ".join(business_df["Text"]).split()).most_common(100),"\n")
print("Politics: ",Counter(" ".join(politics_df["Text"]).split()).most_common(100),"\n")
print("Entertainment: ",Counter(" ".join(entertainment_df["Text"]).split()).most_common(100),"\n")
print("Tech: ",Counter(" ".join(tech_df["Text"]).split()).most_common(100),"\n")

100 Most Commonly Used Words For Each Category 

Sport:  [('the', 6620), ('to', 3189), ('a', 2651), ('and', 2532), ('in', 2510), ('of', 1826), ('s', 1440), ('i', 1304), ('for', 1127), ('he', 1105), ('on', 1014), ('but', 992), ('is', 985), ('it', 974), ('was', 943), ('that', 863), ('have', 812), ('with', 803), ('at', 794), ('his', 762), ('we', 660), ('has', 650), ('said', 636), ('be', 614), ('will', 575), ('as', 547), ('not', 490), ('from', 481), ('after', 477), ('by', 430), ('they', 414), ('had', 414), ('their', 381), ('been', 363), ('are', 356), ('game', 356), ('an', 353), ('this', 353), ('out', 351), ('first', 350), ('year', 331), ('england', 329), ('who', 324), ('t', 322), ('against', 312), ('time', 296), ('win', 295), ('when', 295), ('up', 294), ('two', 290), ('world', 269), ('all', 268), ('over', 267), ('there', 264), ('back', 263), ('last', 262), ('one', 261), ('if', 255), ('6', 252), ('she', 244), ('you', 244), ('would', 233), ('can', 230), ('before', 225), ('were', 225), ('her'

In this part, we divided the texts that we pulled from the dataset into words one by one. Then, we have listed the 100 most commonly used words in their categories above. Below we have listed 3 words that are most common for one category, but not so common for other categories.

Sports ('game', 356) ('win', 295) ('cup', 206)
<br>
Business ('market', 284) ('firm', 261) ('growth', 257)
<br>
Politics ('labour', 494) ('government', 464) ('election', 424)
<br>
Entertainment ('film', 583) ('music', 255) ('awards', 184)
<br>
Tech ('mobile', 343) ('technology', 303) ('users', 268)

## PART II

In [44]:
def read_and_edit_dataset(dataset):
    df = pd.read_csv(dataset).drop(["ArticleId"],axis=1).sample(frac=1)
    df['Text'] = df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z]", " ", word.lower()) for word in x.split() ]))
    texts = df['Text'].values
    categories = df['Category'].values
    labels = list(set(categories))
    return texts,categories,labels

In [45]:
def split_data(texts,categories):
    X_train, X_test, y_train, y_test = train_test_split(texts, categories, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test

##### Bag Of Words
A bag-of-words model, or BoW for short, is a way of extracting features from text for use in modeling, such as with machine learning algorithms. The approach is very simple and flexible, and can be used in a myriad of ways for extracting features from documents.

A bag-of-words is a representation of text that describes the occurrence of words within a document. It involves two things:

1. A vocabulary of known words.
2. A measure of the presence of known words.

It is called a “bag” of words, because any information about the order or structure of words in the document is discarded. The model is only concerned with whether known words occur in the document, not where in the document.

In [46]:
def execute_BoW(X_train, y_train, labels, slice_percent=70, tf_idf_transformer=False, stopwords=False, option="unigram"):
    
    if(stopwords):
        if (option=="unigram"):
            vectorizer = CountVectorizer(ngram_range=(1,1),stop_words=ENGLISH_STOP_WORDS)
        else:
            vectorizer = CountVectorizer(ngram_range=(2,2),stop_words=ENGLISH_STOP_WORDS)
            
    else:
        if (option=="unigram"):
            vectorizer = CountVectorizer(ngram_range=(1,1))
        else:
            vectorizer = CountVectorizer(ngram_range=(2,2))
        
    X = vectorizer.fit_transform(X_train)
        
    if (tf_idf_transformer):
        tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
        tfidf_transformer.fit(X)
        tfidf_dict = dict(zip(vectorizer.get_feature_names(), tfidf_transformer.idf_))
        sorted_tfidf_dict = dict(sorted(tfidf_dict.items(), key=lambda item: item[1],reverse=True))
        vocabulary = list(sorted_tfidf_dict.keys())[:int(len(sorted_tfidf_dict.keys()) * slice_percent/100)]         
        vocabulary = np.array([item for item in vectorizer.get_feature_names() if item in vocabulary])
        
        X_train_copy=X_train.copy()
        for i in range(len(X_train_copy)):
            X_train_copy[i] = ' '.join([word for word in X_train_copy[i].split() if word in vocabulary])
               
        X = vectorizer.fit_transform(X_train_copy)
    else:
        vocabulary = vectorizer.get_feature_names()
        
    X = X.toarray()
    words_and_counts = {}
    for l in labels:
        words_and_counts[l] = defaultdict(lambda: 0)
    for i in range(X.shape[0]):
        l = y_train[i]
        for j in range(len(vocabulary)):
            words_and_counts[l][vocabulary[j]] += X[i][j]
    return words_and_counts,vocabulary

#### Naive Bayes Classifier

A Naive Bayes classifier is a probabilistic machine learning model that’s used for classification task. The crux of the classifier is based on the Bayes theorem.

Using Bayes theorem, we can find the probability of A happening, given that B has occurred. Here, B is the evidence and A is the hypothesis. The assumption made here is that the predictors/features are independent. That is presence of one particular feature does not affect the other. Hence it is called naive.

In [47]:
class NaiveBayes:
    
    def __init__(self,labels):
        self.labels=labels
    
    def laplace_smoothing(self,n_label_items, vocabulary, word_counts, word, text_label):
        a = word_counts[text_label][word] + 1
        b = n_label_items[text_label] + len(vocabulary)
        return math.log(a/b)

    def group_by_label(self,x, y):
        data = {}
        for l in self.labels:
            data[l] = x[np.where(y == l)]
        return data
    
    def fit(self,x, y):
        n_label_items = {}
        log_label_priors = {}
        n = len(x)
        grouped_data = self.group_by_label(x, y)
        for l, data in grouped_data.items():
            n_label_items[l] = len(data)
            log_label_priors[l] = math.log(n_label_items[l] / n)
        return n_label_items, log_label_priors

    def predict(self,n_label_items, vocabulary, word_counts, log_label_priors, x_test, option="unigram"):
        result = []
        for text in x_test:
            label_scores = {l: log_label_priors[l] for l in self.labels}
            if (option == "unigram"):
                words = set(text.split())
            else:
                words = text.split()
                words = [words[n:n+2] for n in range(0, len(words), 2)]
                new_words = list()
                for i in words:
                    new_words.append(' '.join(map(str, i)))
                    
                words=new_words
                
            for word in words:
                if word not in vocabulary: continue
                for l in self.labels:
                    log_w_given_l = self.laplace_smoothing(n_label_items, vocabulary, word_counts, word, l)
                    label_scores[l] += log_w_given_l

            result.append(max(label_scores, key=label_scores.get))
        return result

In [52]:
texts, categories, labels = read_and_edit_dataset("English_Dataset.csv")
X_train, X_test, y_train, y_test = split_data(texts, categories)

The accuracy value when we use BoW with "unigram" while not using TF-IDF or stop words for our word list:

In [53]:
unigram_words_and_counts, vocabulary = execute_BoW(X_train,y_train,labels,option="unigram")
naive_bayes = NaiveBayes(labels)
label_items, log_label_priors = naive_bayes.fit(X_train,y_train)
unigram_y_pred = naive_bayes.predict(label_items, vocabulary, unigram_words_and_counts, log_label_priors, X_test,option="unigram")
print("Unigram Accuracy : ", accuracy_score(y_test,unigram_y_pred))

Unigram Accuracy :  0.9060402684563759


The accuracy value when we use BoW with "bigram" while not using TF-IDF or stop words for our word list:

In [54]:
bigram_words_and_counts, vocabulary = execute_BoW(X_train,y_train,labels,option="bigram")
naive_bayes = NaiveBayes(labels)
label_items, log_label_priors = naive_bayes.fit(X_train,y_train)
bigram_y_pred = naive_bayes.predict(label_items, vocabulary, bigram_words_and_counts, log_label_priors, X_test,option="biagram")
print("Bigram Accuracy : ", accuracy_score(y_test,bigram_y_pred))

Bigram Accuracy :  0.8993288590604027


## PART III

   ####  Part A

Term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. The tf–idf value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general.

We calculated the weights of the words in the texts using TF-IDF and retrained our model by removing the 10% with the lowest weight from our word list. In this way, the terms that allowed us to determine better the categories of the articles remained in the word list that we used to train the model.

Increasing or decreasing this percentage gave us worse accuracy results than we have now. For example, if we removed the 90% part with less weight, this time we would have very few words to train the model and our accuracy value would decrease. Or, on the contrary, if we removed the less weighted 1%, this time, the words common in each category would remain in our word list and our accuracy value would decrease again.

By using TF-IDF in the most optimal way, we have increased the accuracy value we obtained without using it at all.

The accuracy value when we use TF-IDF but not stop words for our word list:

In [69]:
words_and_counts, vocabulary= execute_BoW(X_train, y_train, labels, slice_percent=90, tf_idf_transformer=True)
naive_bayes = NaiveBayes(labels)
label_items, log_label_priors = naive_bayes.fit(X_train,y_train)
y_pred = naive_bayes.predict(label_items, vocabulary, words_and_counts, log_label_priors, X_test)
print("TF-IDF Accuracy : ", accuracy_score(y_test,y_pred))

TF-IDF Accuracy :  0.930648769574944


In [56]:
sport_df_voc = df[df['Category'] == "sport"]
business_df_voc = df[df['Category'] == "business"]
politics_df_voc = df[df['Category'] == "politics"]
entertainment_df_voc = df[df['Category'] == "entertainment"]
tech_df_voc = df[df['Category'] == "tech"]

In [57]:
sport_df_voc['Text'] = sport_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() if word in vocabulary ]))
business_df_voc['Text'] = business_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() if word in vocabulary]))
politics_df_voc['Text'] = politics_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() if word in vocabulary]))
entertainment_df_voc['Text'] = entertainment_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() if word in vocabulary]))
tech_df_voc['Text'] = tech_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() if word in vocabulary]))

In [58]:
print("10 Words Whose Presence Most Strongly Sredicts","\n")
print("Sport: ",Counter(" ".join(sport_df_voc["Text"]).split()).most_common(10),"\n")
print("Business: ",Counter(" ".join(business_df_voc["Text"]).split()).most_common(10),"\n")
print("Politics: ",Counter(" ".join(politics_df_voc["Text"]).split()).most_common(10),"\n")
print("Entertainment: ",Counter(" ".join(entertainment_df_voc["Text"]).split()).most_common(10),"\n")
print("Tech: ",Counter(" ".join(tech_df_voc["Text"]).split()).most_common(10),"\n")

10 Words Whose Presence Most Strongly Sredicts 

Sport:  [('roddick', 97), ('ferguson', 62), ('nadal', 59), ('mourinho', 58), ('holmes', 57), ('wenger', 55), ('newcastle', 53), ('gara', 48), ('athletics', 47), ('hewitt', 46)] 

Business:  [('yukos', 122), ('airline', 57), ('gm', 55), ('worldcom', 54), ('mci', 53), ('deutsche', 53), ('glazer', 51), ('lse', 51), ('imf', 50), ('ebbers', 45)] 

Politics:  [('ukip', 101), ('kilroy', 88), ('asylum', 80), ('silk', 79), ('wage', 56), ('minimum', 55), ('clarke', 54), ('blunkett', 49), ('advice', 46), ('hunting', 45)] 

Entertainment:  [('foxx', 48), ('vera', 40), ('sideways', 40), ('drake', 39), ('lee', 39), ('elvis', 38), ('staunton', 35), ('novel', 34), ('swank', 34), ('concert', 33)] 

Tech:  [('spam', 99), ('gadget', 87), ('bt', 79), ('spyware', 67), ('mac', 63), ('blogs', 62), ('definition', 59), ('nintendo', 58), ('ipod', 53), ('xbox', 52)] 



In [59]:
sport_df_not_voc = df[df['Category'] == "sport"]
business_df_not_voc = df[df['Category'] == "business"]
politics_df_not_voc = df[df['Category'] == "politics"]
entertainment_df_not_voc = df[df['Category'] == "entertainment"]
tech_df_not_voc = df[df['Category'] == "tech"]

In [60]:
sport_df_not_voc['Text'] = sport_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() if word in vocabulary and word not in ENGLISH_STOP_WORDS]))
business_df_not_voc['Text'] = business_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() if (word in vocabulary and word not in ENGLISH_STOP_WORDS)]))
politics_df_not_voc['Text'] = politics_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() if (word in vocabulary and word not in ENGLISH_STOP_WORDS)]))
entertainment_df_not_voc['Text'] = entertainment_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() if word in vocabulary and word not in ENGLISH_STOP_WORDS]))
tech_df_not_voc['Text'] = tech_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() if (word in vocabulary and word not in ENGLISH_STOP_WORDS)]))

In [61]:
print("10 Words Whose Absence Most Strongly Sredicts","\n")
print("Sport: ",Counter(" ".join(sport_df_not_voc["Text"]).split()).most_common()[-10:],"\n")
print("Business: ",Counter(" ".join(business_df_not_voc["Text"]).split()).most_common()[-10:],"\n")
print("Politics: ",Counter(" ".join(politics_df_not_voc["Text"]).split()).most_common()[-10:],"\n")
print("Entertainment: ",Counter(" ".join(entertainment_df_not_voc["Text"]).split()).most_common()[-10:],"\n")
print("Tech: ",Counter(" ".join(tech_df_not_voc["Text"]).split()).most_common()[-10:],"\n")

10 Words Whose Absence Most Strongly Sredicts 

Sport:  [('cent', 1), ('harbours', 1), ('telegraph', 1), ('prolonged', 1), ('shelf', 1), ('pledge', 1), ('scotch', 1), ('spectacular', 1), ('ambitious', 1), ('satisfy', 1)] 

Business:  [('patterns', 1), ('eat', 1), ('gregg', 1), ('virgin', 1), ('chennai', 1), ('nadu', 1), ('desperately', 1), ('chung', 1), ('mong', 1), ('koo', 1)] 

Politics:  [('astonishment', 1), ('beggared', 1), ('principally', 1), ('spanning', 1), ('csa', 1), ('knighthoods', 1), ('undertook', 1), ('respectively', 1), ('natwest', 1), ('exams', 1)] 

Entertainment:  [('motor', 1), ('motors', 1), ('competitions', 1), ('click', 1), ('reliance', 1), ('compact', 1), ('formats', 1), ('tastes', 1), ('outgoing', 1), ('cleaner', 1)] 

Tech:  [('expanded', 1), ('sealing', 1), ('unwelcome', 1), ('replaces', 1), ('randomly', 1), ('tailed', 1), ('sharply', 1), ('reviewing', 1), ('procedures', 1), ('defacement', 1)] 



   ####  Part B

Stop words are the words in a stop list (or stoplist or negative dictionary) which are filtered out (i.e. stopped) before or after processing of natural language data (text) because they are insignificant.[1] There is no single universal list of stop words used by all natural language processing tools, nor any agreed upon rules for identifying stop words, and indeed not all tools even use such a list. Therefore, any group of words can be chosen as the stop words for a given purpose.

In this model, we used Scikit Learn's stop words as stop words. Using these words, we have removed the words that are commonly used to make sentences in English and that does not give us any information to separate the categories from our word list. In this way, we trained our model with a word list of words that are less in number and more informative to distinguish between categories.

When we trained our model with the word list from which stop words were removed, we saw that it gave better accuracy than both the model trained with the word list we created without any elimination and the model trained with the word list created with TF-IDF.

The accuracy value when we use stop words but not TF-IDF for our word list:

In [68]:
words_and_counts, vocabulary= execute_BoW(X_train, y_train, labels, stopwords=True)
naive_bayes = NaiveBayes(labels)
label_items, log_label_priors = naive_bayes.fit(X_train,y_train)
y_pred = naive_bayes.predict(label_items, vocabulary, words_and_counts, log_label_priors, X_test)
print("Stop Words Accuracy : ", accuracy_score(y_test,y_pred))

Stop Words Accuracy :  0.9351230425055929


In [64]:
sport_df_sw = df[df['Category'] == "sport"]
business_df_sw = df[df['Category'] == "business"]
politics_df_sw = df[df['Category'] == "politics"]
entertainment_df_sw = df[df['Category'] == "entertainment"]
tech_df_sw = df[df['Category'] == "tech"]

In [65]:
sport_df_sw['Text'] = sport_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() if word in vocabulary]))
business_df_sw['Text'] = business_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() if word in vocabulary]))
politics_df_sw['Text'] = politics_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() if word in vocabulary]))
entertainment_df_sw['Text'] = entertainment_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() if word in vocabulary]))
tech_df_sw['Text'] = tech_df['Text'].apply(lambda x: ' '.join([re.sub(r"[^a-zA-Z0-9]", " ", word.lower()) for word in x.split() if word in vocabulary]))

In [66]:
print("10 Non-Stopwords That Most Strongly Predict","\n")
print("Sport: ",Counter(" ".join(sport_df_sw["Text"]).split()).most_common(10),"\n")
print("Business: ",Counter(" ".join(business_df_sw["Text"]).split()).most_common(10),"\n")
print("Politics: ",Counter(" ".join(politics_df_sw["Text"]).split()).most_common(10),"\n")
print("Entertainment: ",Counter(" ".join(entertainment_df_sw["Text"]).split()).most_common(10),"\n")
print("Tech: ",Counter(" ".join(tech_df_sw["Text"]).split()).most_common(10),"\n")

10 Non-Stopwords That Most Strongly Predict 

Sport:  [('said', 636), ('game', 356), ('year', 331), ('england', 329), ('time', 296), ('win', 295), ('world', 269), ('players', 209), ('cup', 206), ('team', 205)] 

Business:  [('said', 1100), ('year', 456), ('mr', 393), ('market', 284), ('new', 273), ('firm', 261), ('growth', 257), ('company', 253), ('economy', 233), ('government', 215)] 

Politics:  [('said', 1445), ('mr', 1073), ('labour', 494), ('government', 464), ('election', 424), ('blair', 395), ('party', 376), ('people', 372), ('minister', 286), ('new', 280)] 

Entertainment:  [('said', 594), ('film', 583), ('best', 430), ('year', 315), ('music', 255), ('new', 234), ('awards', 184), ('uk', 171), ('actor', 169), ('number', 165)] 

Tech:  [('said', 1064), ('people', 647), ('new', 349), ('mr', 349), ('mobile', 343), ('technology', 303), ('users', 268), ('software', 265), ('use', 260), ('net', 256)] 



## PART IV

In this part, we trained our model with the word list that we created using both TF-IDF and stop words. Our model trained with our word list created with these two features gave higher accuracy than the models trained with other word lists. In this way, we have shown with numerical data that we have narrowed our word list in the most optimal way for our model.

The accuracy value when we use stop words and TF-IDF for our word list:

In [67]:
words_and_counts, vocabulary= execute_BoW(X_train, y_train, labels,slice_percent=90, tf_idf_transformer=True, stopwords=True)
naive_bayes = NaiveBayes(labels)
label_items, log_label_priors = naive_bayes.fit(X_train,y_train)
y_pred = naive_bayes.predict(label_items, vocabulary, words_and_counts, log_label_priors, X_test)
print("Accuracy : ", accuracy_score(y_test,y_pred))

Accuracy :  0.9463087248322147
