Assignment 2 - COMPSCI762 - Asif Cheena (615115260) - Naives Bayes Implementation

In [61]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import scipy
import nltk
from nltk.corpus import stopwords
import string
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from builtins import breakpoint
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_selection import SelectKBest, mutual_info_classif

In [11]:
#Label Imbalance Check
label_imbalance = train['category'].value_counts()
total_count = label_imbalance.sum()
percentage_all = (label_imbalance / total_count) * 100
print("Proportion of frequency of unique values in all columns as percentage:")
print(percentage_all)

Proportion of frequency of unique values in all columns as percentage:
Restaurants    62.191684
Shopping       26.427061
Nightlife      11.381254
Name: category, dtype: float64


NB Model, Feature Engineering and Preprocessing

In [120]:
###Preprocessing Review Data###
def process_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    return text

def split_words(text):
    words = text.split()
    return words

def RemoveStopWords(review):
    # Set of commonly occuring stopwords
    StopWords = list(stopwords.words("english"))
    StopWords.extend(['soooo'])
    review = [stopword for stopword in review if stopword not in StopWords]
    return review

def StemWords(review):
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in review]
    return stemmed_words

def ApplyPreprocessingSteps (X_train, X_test):
    ###Reformatting feature before fitting model###
    X_train['review'] = X_train['review'].apply(process_text)
    X_train['review'] = X_train['review'].apply(split_words)
    X_train['review'] = X_train['review'].apply(RemoveStopWords)
    X_train['review'] = X_train['review'].apply(StemWords)

    X_test['review'] = X_test['review'].apply(process_text)
    X_test['review'] = X_test['review'].apply(split_words)
    X_test['review'] = X_test['review'].apply(RemoveStopWords)
    X_test['review'] = X_test['review'].apply(StemWords)

    ID_train = X_train['ID']
    ID_test = X_test['ID']

    X_train['review'] = X_train['review'].apply(lambda x: ' '.join(x))
    X_test['review'] = X_test['review'].apply(lambda x: ' '.join(x))

    X_train_all = X_train
    X_test_all = X_test

    X_train = X_train['review']
    X_test = X_test['review']

    return X_train, X_test, ID_train, ID_test, X_train_all, X_test_all
    


###Bag of Words Feature Implementation###
def BoW_Feature(X_train, X_test):
# Create a CountVectorizer to convert the text into word count features
    vectorizer = CountVectorizer()

    vectorizer.fit(X_train)
    X_train_counts = vectorizer.fit_transform(X_train)

    vocabulary = vectorizer.vocabulary_
    X_test_counts = CountVectorizer(vocabulary=vocabulary).fit_transform(X_test)

    return X_train_counts, X_test_counts
     


###Bigrams Feature Implementation###
def Bigrams_Feature(X_train, X_test):
    vectorizer = CountVectorizer(ngram_range=(2, 2))

    vectorizer.fit(X_train)
    X_train_counts = vectorizer.fit_transform(X_train)
    
    vocabulary = vectorizer.vocabulary_
    X_test_counts = CountVectorizer(vocabulary=vocabulary).fit_transform(X_test)

    return X_train_counts, X_test_counts

### BoW Feature for Name Attribute ###
# Preprocess the text data
def BoW_Names_Feature(X_train_all, X_test_all, X_train_counts, X_test_counts):
    #Training set
    corpus = X_train_all['name'].tolist()
    vocabulary = create_vocabulary(corpus)
    X = word_count_corpus(corpus, vocabulary)
    name_BoW_train = pd.DataFrame(data = X, columns = vocabulary)
    X_train_counts = pd.concat([X_train_counts, name_BoW_train], axis = 1)

    #Test set
    corpus = X_test_all['name'].tolist()
    vocabulary = create_vocabulary(corpus)
    X = word_count_corpus(corpus, vocabulary)
    name_BoW_test = pd.DataFrame(data = X, columns = vocabulary)
    X_test_counts = pd.concat([X_test_counts, name_BoW_test], axis = 1)

    return X_train_counts, X_test_counts

def preprocess(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()
    words = nltk.word_tokenize(text.lower())
    words = [stemmer.stem(word) for word in words if word.isalpha() and word not in stopwords]
    return words

# Create a vocabulary of unique words in the corpus
def create_vocabulary(corpus, min_df):
    vocabulary = set()
    doc_freq = Counter()
    for text in corpus:
        words = preprocess(text)
        doc_freq.update(set(words))
        vocabulary.update(words)
    vocab_size = len(vocabulary)
    vocab_freq = {word: freq / vocab_size for word, freq in doc_freq.items()}
    vocabulary = {word for word in vocabulary if vocab_freq[word] >= min_df}
    return vocabulary

# Create a word count frequency feature for a single document
def word_count(text, vocabulary):
    word_counts = Counter(preprocess(text))
    feature_vector = [word_counts[word] for word in vocabulary]
    return feature_vector

# Create a word count frequency feature for a corpus of documents
def word_count_corpus(corpus, vocabulary):
    X = [word_count(text, vocabulary) for text in corpus]
    return np.array(X)

def BoW_Name_Feature(X_train_all, X_test_all, X_train_counts, X_test_counts, min_df):

    corpus = X_train_all['name'].tolist()
    vocabulary = create_vocabulary(corpus, min_df)
    X = word_count_corpus(corpus, vocabulary)
    X_sparse = csr_matrix(X)
    X_train_counts = hstack((X_sparse, X_train_counts))

    corpus = X_test_all['name'].tolist()
    X = word_count_corpus(corpus, vocabulary)
    X_sparse = csr_matrix(X)
    X_test_counts = hstack((X_sparse, X_test_counts))

    return X_train_counts, X_test_counts

##############################################################

def FeatureSelection(X_train_counts, X_test_counts, nfeatures):

    df = pd.DataFrame(X_train_counts.toarray())
    means = df.mean(axis=0)
    sorted_cols = means.sort_values(ascending=False).index
    # Reorder columns based on sorted column order
    df_sorted = df[sorted_cols]
    df_sorted_selected_X_train = df_sorted.iloc[:,0:nfeatures]

    df = pd.DataFrame(X_test_counts.toarray())
    # Reorder columns based on sorted column order
    df_sorted = df[sorted_cols]
    df_sorted_selected_X_test = df_sorted.iloc[:,0:nfeatures]

    return df_sorted_selected_X_train, df_sorted_selected_X_test


###Cross Validation###
def CrossValidation(X_train_counts, X_test_counts, y_train):
    param_grid = {
        'alpha': [0.1, 1.0, 10.0]
    }
    # Perform k-fold cross-validation to tune hyperparameters
    k = 5 # number of folds for cross-validation
    naive_bayes = MultinomialNB()
    params, scores = [], []

    for i in range (5):
        grid_search = GridSearchCV(estimator=naive_bayes, param_grid=param_grid, cv=k)
        grid_search.fit(X_train_counts, y_train)

        # Get the best hyperparameters and their corresponding scores
        params.append(grid_search.best_params_)
        scores.append(grid_search.best_score_)

    best_score = max(scores)
    best_score_index = scores.index(best_score)
    best_params = params[best_score_index]

    print('Best Cross Validation Score: ', best_score)
    print('Best Hyperparameter(alpha) value: ', best_params)
    print('Approximation of Test Error: ', np.mean(scores))

    return best_params


###Training Naive Bayes Classifier###
def TrainNB(X_train_counts, X_test_counts, best_params, y_train):
    clf = MultinomialNB(**best_params)
    clf.fit(X_train_counts, y_train)
    y_pred = clf.predict(X_test_counts)
    return clf, y_pred 


###Reformatting for submission on Kaggle###
def ReformatKaggleTask1(y_pred, ID_test):
    data = {'ID': ID_test,
            'category': y_pred}
    labels_preds = pd.DataFrame(data = data)
    labels_preds.to_csv('submission_task1.csv', index = False)
    return labels_preds
###Reformatting for submission on Kaggle###
def ReformatKaggleTask2(y_pred, ID_test):
    data = {'ID': ID_test,
            'category': y_pred}
    labels_preds = pd.DataFrame(data = data)
    labels_preds.to_csv('submission_task2.csv', index = False)
    return labels_preds

Train-Test Split, Data Imports and Assignment Task Main Functions

In [128]:
###Import Datasets###
train = pd.read_csv("train.csv/train.csv")
test = pd.read_csv("test.csv")
major = pd.read_csv("major.csv")
##############################

###Train-test split###
X_train = train.drop('category', axis = 1)
X_test = test
y_train = train['category']
##############################

RunTask1 = False
RunTask2 = True

# Task 1
if (RunTask1 == True):
    X_train, X_test, ID_train, ID_test, X_train_all, X_test_all = ApplyPreprocessingSteps (X_train, X_test)
    X_train_counts, X_test_counts = BoW_Feature(X_train, X_test)
    best_params = CrossValidation(X_train_counts, X_test_counts, y_train)
    clf, y_pred = TrainNB(X_train_counts, X_test_counts, best_params, y_train)
    labels_preds = ReformatKaggleTask1(y_pred, ID_test)
    labels_preds1 = labels_preds.groupby('category').count()
    print(labels_preds1)


#Task 2
if (RunTask2 == True):
    X_train, X_test, ID_train, ID_test, X_train_all, X_test_all = ApplyPreprocessingSteps (X_train, X_test)
    X_train_counts, X_test_counts = BoW_Feature(X_train, X_test)
    X_train_counts, X_test_counts = BoW_Name_Feature(X_train_all, X_test_all, X_train_counts, X_test_counts, min_df=0.005)
    # X_train_counts, X_test_counts = Bigrams_Feature(X_train, X_test)
    # X_train_counts = FeatureSelection(X_train_counts)
    X_train_counts, X_test_counts = FeatureSelection(X_train_counts, X_test_counts, nfeatures = 10663)
    best_params = CrossValidation(X_train_counts, X_test_counts, y_train)
    clf, y_pred = TrainNB(X_train_counts, X_test_counts, best_params, y_train)
    labels_preds = ReformatKaggleTask2(y_pred, ID_test)
    labels_preds2 = labels_preds.groupby('category').count()
    print(labels_preds2)

Best Cross Validation Score:  0.8865389870084706
Best Hyperparameter(alpha) value:  {'alpha': 0.1}
Approximation of Test Error:  0.8865389870084706
              ID
category        
Nightlife     77
Restaurants  451
Shopping     200


In [94]:
X_test_counts.shape

(728, 10663)

In [95]:
a = X_train_counts.tocsc()
a.shape

(2838, 10663)

In [96]:
X_train_counts.shape

(2838, 10663)

In [80]:
# means = np.array(X_train_counts.mean(axis=1)).flatten() # Calculate mean of each row
# sorted_indices = np.argsort(means)[::-1] # Get the indices of rows sorted in descending order of means
# sorted_csr_mat = X_train_counts[sorted_indices]
# # Print the top 10 rows with highest means
# for i in range(10):
#     row_index = sorted_indices[i]
#     mean_value = means[row_index]
#     print(f"Row {row_index}: Mean = {mean_value}")

# # top_k_values, top_k_indices = sorted_csr_mat.topk(k=100, axis=1, sorted=True)

# Compute column means
column_means = np.array(X_train_counts.mean(axis=0)).squeeze()
# Get indices of the top 100 columns with highest means
top_indices = np.argsort(column_means)[-100:]
# Select only the top 100 columns
sparse_matrix_top = X_train_counts[:, top_indices]

In [117]:

df = pd.DataFrame(X_train_counts.toarray())
means = df.mean(axis=0)
sorted_cols = means.sort_values(ascending=False).index

# Reorder columns based on sorted column order
df_sorted = df[sorted_cols]

df_sorted_selected = df_sorted.iloc[:,0:250]

In [118]:
np.mean(hehe.iloc[:,249])

0.03770260747004933