# Machine Learning Lab 11 - Arailym Kaiyrova
### Exercise 0: Preprocessing Text Data

Firstly, news groups data was loaded from sklearn library, and only for categories 'comp.graphics', 'sci.med'

In [16]:
from sklearn.datasets import fetch_20newsgroups
categories = ['comp.graphics', 'sci.med']
newsgroups2 = fetch_20newsgroups(categories=categories)

Next part is data cleaning, removing punctuation, stopwords from nltk library, and tokenize data, and store the list of words for each document in list documents.\
'all_words' is a set of all unique words from all documents, which will be neccessary in following tasks

In [515]:
import nltk
from nltk.corpus import stopwords
import string

all_words = set([])
documents = []

for i, doc in enumerate(newsgroups2.data):
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    new_words = tokenizer.tokenize(doc.lower())

    stop_words = set(stopwords.words('english'))
    
    #add the word to the list if its not stopword and not numeric value, and length larger than 1
    new_words = [w for w in new_words if (w not in stop_words and w.isalpha() and len(w)>1)]
    documents.append(new_words)
    
    all_words.update(new_words)

Next task is to create bag of words for each documents, and the function 'calculateBOW' creates dictionary of word frequencies for a given document

In [122]:
def calculateBOW(wordset,l_doc):
    tf_diz = dict.fromkeys(wordset,0)
    for word in l_doc:
        tf_diz[word]=l_doc.count(word)
    tf_diz['total_count'] = float(len(l_doc))
    return tf_diz

And collection of all word frequencies for all documents is stored in a pandas dataframe 'df'

In [123]:
import pandas as pd
df = pd.DataFrame([])

for i, doc in enumerate(documents):
    wordDict = calculateBOW(all_words,doc)
    df = df.append(pd.DataFrame(wordDict, index = ['Document'+str(i)]))

df

Unnamed: 0,heightfield,interleaf,oppose,clut,installworldobjects,freehand,experimentation,unreal,grave,netcom,...,gnv,orders,regional,aga,mirroring,debunking,idealist,moreillo,systematic,total_count
Document0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,115.0
Document1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,77.0
Document2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,287.0
Document3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,110.0
Document4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Document1173,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,204.0
Document1174,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,59.0
Document1175,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,128.0
Document1176,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,69.0


Next step is to create term frequencies, or normalized word frequencies, and result is stored in dataframe 'tf'

In [340]:
tf = pd.DataFrame([], columns = list(all_words))
for word in all_words:
    tf[word] = df[word]/df['total_count']

tf['total_count'] = df['total_count']

In [516]:
tf.head()

Unnamed: 0,heightfield,interleaf,oppose,clut,installworldobjects,freehand,experimentation,unreal,grave,netcom,...,orders,regional,aga,mirroring,debunking,idealist,moreillo,systematic,total_count,data_target
Document0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,115.0,0
Document1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77.0,1
Document2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,287.0,0
Document3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,110.0,1
Document4,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0


Then idf value was calculated for all words in the dataset according to the formula idf(word) = log(number of documents/number of documents containing the word)

In [126]:
import numpy as np
IDF = pd.DataFrame([])
N = len(df)
for i, word in enumerate(all_words):
    idf = np.log(N/df[df[word]!=0][word].count())
    IDF = IDF.append(pd.DataFrame([[word,idf]], index = [i]))
    
IDF.head()

Unnamed: 0,0,1
0,heightfield,6.378426
1,interleaf,5.972961
2,oppose,6.378426
3,clut,7.071573
4,installworldobjects,7.071573
...,...,...
20694,mirroring,7.071573
20695,debunking,6.378426
20696,idealist,6.378426
20697,moreillo,7.071573


After that for tf_idf all the term frequencies were multipled by the idf as can be seen below

In [349]:
tf_idf = pd.DataFrame([], columns = list(all_words))
for word in all_words:
    idf = IDF[IDF[0]==word].values[0][1]
    tf_idf[word] = tf[word]*idf
    
tf_idf['total_count'] = df['total_count']

7.071573364211532


In [350]:
tf_idf.head()

Unnamed: 0,heightfield,interleaf,oppose,clut,installworldobjects,freehand,experimentation,unreal,grave,netcom,...,gnv,orders,regional,aga,mirroring,debunking,idealist,moreillo,systematic,total_count
Document0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,115.0
Document1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77.0
Document2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,287.0
Document3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,110.0
Document4,0.0,0.0,0.0,0.0,0.0,0.0,0.127569,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0


Next step is to divide data into train/test/validation sets according to the ratio 0.8/0.1/0.1.\
Also, to have the equal value of for each category in all datasets, data was separated into two categories and splited into train/test/validation sets.\
Also, for implementation of Naive Bayes, we need two separately consider data for each category, that's why train data for each category was stored separately

In [498]:
def split_train_test(df, frac):
    train=df.sample(frac=frac,random_state=200) #random state is a seed value
    test=df.drop(train.index)
    return train, test

def split_data(df):
    df['data_target'] = newsgroups2.target

    category_1 = df[df['data_target']==1]
    category_2 = df[df['data_target']==0]

    cat1_train, cat1_test = split_train_test(category_1, 0.8)
    cat1_test, cat1_val = split_train_test(cat1_test, 0.5) 

    cat2_train, cat2_test = split_train_test(category_2, 0.8)
    cat2_test, cat2_val = split_train_test(cat2_test, 0.5) 

    test = pd.concat([cat1_test, cat2_test])
    val = pd.concat([cat1_val, cat2_val])
    
    return cat1_train, cat2_train, val, test

### Exercise 1: Implementing Naive Bayes Classifier for Text Data

For Naive Bayes Classifier, we need to calculate contional probablilites of all existing words for each category, which is done by 'calculateTF' method below. \

Condional probabilities were calculated based of regular frequency values, which is stored in table tf.

In [499]:
cat1_train, cat2_train, val, test = split_data(tf)

def calculateTF(wordset, dataset):
    tf = {}
    length = dataset['total_count'].sum()+len(wordset)
    for word in wordset:
        #Normalized frequency
        tf[word]=(dataset[word].sum()+1)/length
    return tf

p_cat1 = len(cat1_train)/(len(cat1_train)+len(cat2_train))
p_cat2 = len(cat2_train)/(len(cat1_train)+len(cat2_train))
tf_cat1, tf_cat2 = calculateTF(all_words, cat1_train), calculateTF(all_words, cat2_train)

Based on condional probabilities in calculated previously, now we can calculate the probability of belonging to specific category of unseen data. So, we calculate its probability of belonging to each category using method 'check_if_cat', and assign the predicted output to the category with higher probability.

In [517]:
import math
def check_if_cat(p_cat, tf_cat, word_list):
    p = p_cat
    for word, freq in tf_cat.items():
        x = pow(freq, word_list[word])
        p*=x
    return p

Then compare the predicted target values with actual values, and calculate the accuracy, and it can be seen that for validation data accuracy was 62% and for test data it is 69%.

In [501]:
def calculate_acccuracy(p_cat1,p_cat2,tf_cat1,tf_cat2,val):
    correct = 0
    for i in range(len(val)):
        word_list = val.iloc[i].to_dict()
        p1 = check_if_cat(p_cat1, tf_cat1, word_list)
        p2 = check_if_cat(p_cat2, tf_cat2, word_list)
        pred = 1 if p1>p2 else 0
        actual = val.iloc[i].values[-1]
        if pred==actual:correct+=1   
    return correct/len(val)
val_acc = calculate_acccuracy(p_cat1,p_cat2,tf_cat1,tf_cat2,val)
test_acc = calculate_acccuracy(p_cat1,p_cat2,tf_cat1,tf_cat2,test)

print('Accuracy of validation set using after bag of words: ', val_acc)
print('Accuracy of test set using after bag of words: ', test_acc)

Accuracy of validation set using after bag of words:  0.6271186440677966
Accuracy of test set using after bag of words:  0.6949152542372882


Next task is to complete all the previous steps for predicting the output using Naive Bayes Classifies, but instead of using regular frequencies, tf-idf data was used as can bee seen below.\
\
And this approach gives much higher accuracy for both validation and test sets which is above 92%.

In [518]:
cat1_train, cat2_train, val, test = split_data(tf_idf)

p_cat1 = len(cat1_train)/(len(cat1_train)+len(cat2_train))
p_cat2 = len(cat2_train)/(len(cat1_train)+len(cat2_train))
tf_cat1, tf_cat2 = calculateTF(all_words, cat1_train), calculateTF(all_words, cat2_train)

val_acc = calculate_acccuracy(p_cat1,p_cat2,tf_cat1,tf_cat2,val)
test_acc = calculate_acccuracy(p_cat1,p_cat2,tf_cat1,tf_cat2,test)

print('Accuracy of validation set using after TF-IDF: ', val_acc)
print('Accuracy of test set using after TF-IDF: ', test_acc)

Accuracy of validation set using after TF-IDF:  0.923728813559322
Accuracy of test set using after TF-IDF:  0.9322033898305084


### Exercise 2: Implementing SVM Classifier via Scikit-Learn

In this task we should use SVM for classifying the data. Since we don't need to consider each category separately as for Naive Bayes, training data from each category were combined back into one dataset.

In [510]:
cat1_train, cat2_train, val, test = split_data(df)
train = pd.concat([cat1_train, cat2_train])
X, y = train.iloc[:,range(20699)], train['data_target']

In [511]:
def calculate_accuracy(pred, test):
    correct = 0
    for i in range(len(pred)):
        if pred[i]==test['data_target'].values[i]:
            correct+=1
    return correct/len(pred)

To identify the best hyperparameters, space of different hyperparameter combinations were created, and 10 of the were randomly selected for validation, trained using SVM model built in sklearn library, and the best parameters were selected that gives higher validation accuracy.

In [512]:
import random
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
C = [0.1, 1, 100]
gammas = [0.1, 5, 10]
kernels = ['linear','poly','rbf']
space = []
for c in C:
    for g in gammas:
        for k in kernels:
            space.append((c,g,k))
params = random.sample(range(0, len(space)), 10)

best_acc, best_params = -float('inf'), None
for p in params:
    c,g,k = space[p]
    clf = make_pipeline(StandardScaler(), SVC(gamma=g,kernel=k,C=c))
    clf.fit(X, y)
    pred = clf.predict(val.iloc[:,range(20699)].values)
    accuracy = calculate_accuracy(pred, val)
    
    print('For C, gamma, kernel type', c, g, k, 'validation accuracy equals: ', accuracy)
    if accuracy>best_acc:
        best_acc, best_params = accuracy, space[p]

For C, gamma, kernel type 1 5 poly validation accuracy equals:  0.6271186440677966
For C, gamma, kernel type 0.1 0.1 linear validation accuracy equals:  0.9491525423728814
For C, gamma, kernel type 0.1 0.1 rbf validation accuracy equals:  0.5
For C, gamma, kernel type 100 5 rbf validation accuracy equals:  0.5
For C, gamma, kernel type 0.1 10 rbf validation accuracy equals:  0.5
For C, gamma, kernel type 100 5 poly validation accuracy equals:  0.6271186440677966
For C, gamma, kernel type 1 10 poly validation accuracy equals:  0.6271186440677966
For C, gamma, kernel type 1 0.1 linear validation accuracy equals:  0.9491525423728814
For C, gamma, kernel type 100 0.1 linear validation accuracy equals:  0.9491525423728814
For C, gamma, kernel type 100 10 linear validation accuracy equals:  0.9491525423728814


And it can be seen that for C = 0.1, gamma = 0.1, and 'linear' kernel type, model gives higher accuracy.

In [519]:
best_params

(0.1, 0.1, 'linear')

Below test accuracy was calculated for model with selected best hyperparameters, and it gives high accuracy, which is above 95 percent.

In [514]:
clf = make_pipeline(StandardScaler(), SVC(gamma=best_params[1],kernel=best_params[2],C=best_params[0]))
clf.fit(X, y)
pred = clf.predict(test.iloc[:,range(20699)].values)
accuracy = calculate_accuracy(pred, test)
    
print('With selected best parameters C, gamma, kernel type', c, g, k, 'test accuracy equals: ', accuracy)

With selected best parameters C, gamma, kernel type 100 10 linear test accuracy equals:  0.9576271186440678
