# IMPORT PACKAGES

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import scipy
from scipy.linalg import logm,expm
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import MultinomialNB

#for logistic regression
from sklearn import linear_model
from scipy.special import expit
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# import statistics
# from scipy import stats

# import itertools

%matplotlib inline
import matplotlib.pyplot as plt


from IPython.core.debugger import set_trace         #for debugging 

np.random.seed(123) 

# Naive Bayes Classifier

##**Multinomial Naive Bayes Class**


In [None]:
class MultinomialNaiveBayes:
    """Multinomial Naive Bayes classifier with additive (Laplace/Lidstone) smoothing.

    ``fit`` stores the smoothed class prior and the smoothed per-class feature
    totals; prediction works in the log domain from those statistics.
    """

    def __init__(self):
        return

    def fit(self, x, y, alpha):
        """Estimate the class prior and the per-class feature distribution.

        Args:
            x: (N, D) feature matrix (dense ndarray or scipy sparse matrix),
                e.g. word counts or TF-IDF weights.
            y: length-N array of integer class labels; the number of classes
                is taken to be ``max(y) + 1``.
            alpha: additive smoothing constant applied to both the class
                counts and the per-class feature totals.

        Returns:
            self, with these attributes set:
              * ``pi``        -- (C,) smoothed class prior, sums to 1.
              * ``theta_one`` -- (C, D) smoothed per-class feature totals.
              * ``theta_two`` -- (C, 1) per-class normaliser, so that
                ``theta_one / theta_two`` has rows summing to 1.
        """
        N, D = x.shape
        C = np.max(y) + 1
        Nc = np.zeros(C)                      # number of instances in class c
        count_word = np.zeros((C, D))         # total feature mass per class

        for c in range(C):
            x_c = x[y == c]                   # rows belonging to class c
            Nc[c] = x_c.shape[0]
            # .sum(axis=0) returns a (1, D) matrix for sparse input; flatten it.
            count_word[c, :] = np.asarray(x_c.sum(axis=0)).ravel()

        # Smoothed prior: adding alpha to each of the C class counts adds
        # alpha * C to the total, which keeps the prior normalised.
        self.pi = (Nc + alpha) / (N + alpha * C)
        self.theta_one = count_word + alpha
        # The normaliser must be per class (total mass of that class plus
        # alpha for each of the D features); a class-independent constant
        # would cancel out of the posterior and leave theta unnormalised.
        self.theta_two = count_word.sum(axis=1, keepdims=True) + alpha * D
        return self

In [None]:
def logsumexp(Z):
    """Column-wise log(sum(exp(Z))) computed stably.

    Z has the classes along axis 0 (C x N). The per-column maximum is
    subtracted before exponentiating and added back afterwards, so large
    entries do not overflow. The result has shape 1 x N.
    """
    col_max = np.max(Z, axis=0)[np.newaxis, :]      # 1 x N, max over classes
    shifted = np.exp(Z - col_max)
    return col_max + np.log(np.sum(shifted, axis=0))

def predict(self, xt):
    """Return the class posterior for every row of ``xt``.

    Args:
        xt: (Nt, D) test feature matrix, dense or scipy sparse, with the same
            D columns that were used in ``fit``.

    Returns:
        (Nt, C) ndarray; row n holds p(y = c | x_n) and sums to 1.
    """
    # Convert once so that dense input is handled the same way as sparse input.
    x_sparse = csr_matrix(xt)

    # For numerical stability everything is done in the log domain.
    log_prior = np.log(self.pi)[:, None]                           # C x 1
    log_theta = np.log(self.theta_one) - np.log(self.theta_two)    # C x D

    # Multinomial log-likelihood sum_d x[n, d] * log_theta[c, d] for every
    # (n, c) pair as one sparse-dense product, transposed to C x N.
    log_likelihood = np.asarray(x_sparse @ log_theta.T).T

    # Posterior: normalise over classes with log-sum-exp.
    log_posterior = log_prior + log_likelihood
    posterior = np.exp(log_posterior - logsumexp(log_posterior))

    return posterior.T                                             # N x C

# Attach the module-level predict function to the class as its predict method.
MultinomialNaiveBayes.predict = predict

# Logistic Regression Classifier

In [None]:
# Create a scikit-learn logistic-regression classifier with default settings.
# The fit/predict/score code below sits inside a string literal, so it is not
# executed; it refers to TF-IDF matrices that are only built in later cells.
clf = linear_model.LogisticRegression()
"""clf.fit(X_newsgroup_train_tfidf, Y_train)

predictions = clf.predict(X_newsgroup_test_tfidf)
print(X_newsgroup_test_tfidf.shape, Y_test.shape)
score = clf.score(X_newsgroup_test_tfidf, Y_test)  

print(score)"""



# Data Pre-Process & Split Function

In [None]:
def cross_validation_split(data, n_train=15000):
  """Vectorize a text dataset and split it into shuffled train and test parts.

  Args:
    data: object with a ``data`` attribute (sequence of raw documents) and a
      ``target`` attribute (array of integer labels).
    n_train: number of leading documents used for training/cross-validation;
      the remaining documents form the held-out test set.

  Returns:
    ``(x_test, y_test, x_rest, y_rest)``: TF-IDF features and labels of the
    held-out test set, followed by those of the training ("rest") set, each
    independently shuffled.
  """
  # NOTE: the vocabulary is built from all documents, test ones included.
  vectorizer = CountVectorizer()
  X_all_counts = vectorizer.fit_transform(data.data)

  X_train = X_all_counts[:n_train, :]
  X_test = X_all_counts[n_train:, :]
  Y_train = data.target[:n_train]
  Y_test = data.target[n_train:]

  # Learn the IDF weights on the training rows only and reuse them for the
  # test rows, so both sets live in the same feature space and no test
  # statistics leak into the weighting.
  tfidf_transformer = TfidfTransformer()
  X_train_tfidf = tfidf_transformer.fit_transform(X_train)
  X_test_tfidf = tfidf_transformer.transform(X_test)

  # Shuffle each part (train permutation is drawn first, then test).
  rest_order = np.random.permutation(X_train.shape[0])
  test_order = np.random.permutation(X_test.shape[0])

  x_test, y_test = X_test_tfidf[test_order, :], Y_test[test_order]
  x_rest, y_rest = X_train_tfidf[rest_order, :], Y_train[rest_order]
  return x_test, y_test, x_rest, y_rest


# Cross-Validation Methods


In [None]:
def evaluate_acc(pred, label):
  """Return the fraction of entries in ``pred`` that equal ``label``.

  Args:
    pred: array-like of predicted class labels.
    label: array-like of true class labels, same length as ``pred``.

  Returns:
    Accuracy in [0, 1].
  """
  # Use the arguments themselves rather than same-named notebook globals.
  pred = np.asarray(pred)
  label = np.asarray(label)
  accuracy = np.sum(pred == label)/pred.shape[0]
  return accuracy

In [None]:
def cross_validate(n, n_folds=5):
    """Yield ``(train_indices, validation_indices)`` for each fold.

    The indices 0..n-1 are cut into ``n_folds`` contiguous validation blocks
    of ``n // n_folds`` samples each. Every index outside the current
    validation block is used for training, including the ``n % n_folds``
    trailing samples that belong to no validation block. Indices are not
    shuffled here; shuffle the data beforehand if needed.

    Args:
        n: total number of samples.
        n_folds: number of folds.

    Yields:
        Tuples ``(tr_inds, val_inds)`` of index lists, one per fold.
    """
    # number of data samples in each validation split
    n_val = n // n_folds
    for f in range(n_folds):
        val_start, val_stop = f * n_val, (f + 1) * n_val
        val_inds = list(range(val_start, val_stop))
        # everything before and after the validation block is training data
        tr_inds = list(range(val_start)) + list(range(val_stop, n))
        # yield suspends the generator and resumes here for the next fold
        yield tr_inds, val_inds

In [None]:
def _predicted_labels(model, x, is_naive_bayes=None):
  """Return hard class labels for ``x``, taking the argmax of probability output."""
  y_out = np.asarray(model.predict(x))
  if is_naive_bayes is None:
    # A 2-D result is a matrix of per-class probabilities; 1-D is labels.
    is_naive_bayes = y_out.ndim == 2
  return np.argmax(y_out, 1) if is_naive_bayes else y_out


def kfoldCV(x_test, y_test, x_rest, y_rest, model, is_naive_bayes=None):
  """Run 5-fold cross-validation over a fixed hyperparameter list and plot errors.

  For each value K in the list the model is fit on the training folds of
  ``x_rest`` and scored on the validation fold; it is then refit on all of
  ``x_rest`` and scored on ``x_test``. Test error and the mean and standard
  deviation of validation error are plotted against K.

  Make sure the correct train/test split has been chosen in the
  cross-validation split function before calling this.

  Args:
    x_test, y_test: held-out test features and labels.
    x_rest, y_rest: features and labels used for cross-validation.
    model: object with ``fit(x, y, K)`` and ``predict(x)``. ``predict`` may
      return class probabilities (N x C) or class labels (N,).
    is_naive_bayes: True if ``predict`` returns probabilities, False if it
      returns labels, None (default) to infer this from the output's shape.

  NOTE(review): for scikit-learn estimators the third positional argument of
  ``fit`` is ``sample_weight``, so K is not applied as the regularisation
  strength C for those models -- confirm this is intended.
  """
  num_folds = 5
  K_list = [0.0001, 0.001, 0.01,0.1,0.5,1]
  num_instances = x_rest.shape[0]
  err_test, err_valid = np.zeros(len(K_list)), np.zeros((len(K_list), num_folds))
  for i, K in enumerate(K_list):
      # validation error on each of the num_folds splits for this K
      for f, (tr, val) in enumerate(cross_validate(num_instances, num_folds)):
        model.fit(x_rest[tr], y_rest[tr], K)
        y_pred = _predicted_labels(model, x_rest[val], is_naive_bayes)
        accuracy = np.sum(y_pred == y_rest[val])/y_pred.shape[0]
        err_valid[i, f] = 1-accuracy

      # This is the part that is not done in a usual setup: normally the
      # test set is not touched until the very end.
      model.fit(x_rest, y_rest, K)
      y_pred = _predicted_labels(model, x_test, is_naive_bayes)
      accuracy = np.sum(y_pred == y_test)/y_pred.shape[0]
      err_test[i]= 1-accuracy

  plt.plot(K_list, err_test, label='test')
  plt.errorbar(K_list, np.mean(err_valid, axis=1), np.std(err_valid, axis=1), label='validation')
  plt.legend()
  plt.xlabel('C')
  plt.ylabel('error rate')
  plt.show()


# Dataset 1: 20 newsgroup

## Task 1: Acquiring and preprocessing

### IMPORT DATA

In [None]:
# Load the full 20 newsgroups corpus (subset='all') with headers, footers and
# quoted replies removed from every post.
newsgroup_all = fetch_20newsgroups(subset='all' ,remove=('headers','footers','quotes'))
#newsgroup_all

## Task 2: Implementing Cross-Validation on our classifiers

### Naive Bayes Cross Validation

In [None]:
# Vectorize and split the newsgroups corpus, then cross-validate the
# hand-written Multinomial Naive Bayes model over the values listed in kfoldCV.
x_test, y_test, x_rest, y_rest = cross_validation_split(newsgroup_all)
#Remember to set the is_naive_bayes boolean to true or false! Remember to reset test train splits!
kfoldCV(x_test, y_test, x_rest, y_rest, MultinomialNaiveBayes())


###Logistic Regression Cross Validation


In [None]:
# Re-split the newsgroups corpus (a fresh random shuffle) and run the same
# cross-validation routine with scikit-learn's logistic regression.
x_test, y_test, x_rest, y_rest = cross_validation_split(newsgroup_all)
#Remember to set the is_naive_bayes boolean to true or false! Remember to reset test train splits!
kfoldCV(x_test, y_test, x_rest, y_rest, linear_model.LogisticRegression())

###Train Best Model

In [None]:
# Fit Naive Bayes on the whole training part with smoothing alpha = 0.01 and
# measure accuracy on the held-out test part from the most recent split.
best_md = MultinomialNaiveBayes()
best_md = best_md.fit(x_rest,y_rest,0.01)

# predict returns per-class probabilities; argmax picks the predicted class
y_prob = best_md.predict(x_test)
y_pred = np.argmax(y_prob, 1)

best_accuracy = np.sum(y_pred == y_test)/y_pred.shape[0]
best_accuracy

## Task 3: Experimentation

### Find optimal hyperparameter for Logistic Regression

In [None]:
# Grid search over penalty/solver for logistic regression on a 50/50 split of
# the newsgroups corpus.

# get # train instances (from percent)
all_vectorizer = CountVectorizer()
X_newgroups_all_counts = all_vectorizer.fit_transform(newsgroup_all.data)
num_train = round(50/100 * X_newgroups_all_counts.shape[0])

# get data
X_newgroups_train = X_newgroups_all_counts[:num_train,:]  # around 50% training
X_newgroups_test = X_newgroups_all_counts[num_train:,:]   # around 50% testing
news_tfidf_transformer = TfidfTransformer()
X_newsgroup_train_tfidf = news_tfidf_transformer.fit_transform(X_newgroups_train)
# Reuse the IDF weights learned on the training rows; refitting on the test
# rows would put the two sets in differently weighted feature spaces.
X_newsgroup_test_tfidf = news_tfidf_transformer.transform(X_newgroups_test)
Y_train = newsgroup_all.target[:num_train]
Y_test = newsgroup_all.target[num_train:]

import warnings
with warnings.catch_warnings():
    warnings.simplefilter('ignore') #ignores warnings
    # l1 = lasso, l2 = ridge
    grid={"penalty": ["l1","l2", "elasticnet", "none"], "solver": ["sag", "saga"], "multi_class": ["multinomial"]}
    logreg=LogisticRegression()
    # 5-fold cross-validated search on the training half only
    logreg_cv=GridSearchCV(logreg,grid,cv=5)
    logreg_cv.fit(X_newsgroup_train_tfidf, Y_train)
    print("tuned hyperparameters :(best parameters) ",logreg_cv.best_params_)
    print("accuracy :",logreg_cv.best_score_)

### Different Train/Test splits

In [None]:
# Compare classifiers on the newsgroups corpus for several train/test splits.
from sklearn import svm                         # used for LinearSVC below
from sklearn.linear_model import SGDClassifier  # used for the SGD model below
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score 
import warnings 

def evaluate_acc(pred, label):
  """Return the fraction of entries in pred that equal label."""
  # Size the denominator from the argument, not from the global y_pred.
  pred = np.asarray(pred)
  accuracy = np.sum(pred == np.asarray(label))/pred.shape[0]
  return accuracy

with warnings.catch_warnings():
  warnings.simplefilter('ignore') #ignores warnings
  train_perc = [20, 40, 60, 80, 90, 95]
  # one accuracy per entry of train_perc, for each model
  NBlist = list()
  Bernlist = list()
  LRlist = list()
  LRoptlist = list()
  GDlist = list()
  SVClist = list()
  for perc in train_perc:
    # get # train instances (from percent)
    num_train = round(perc/100 * X_newgroups_all_counts.shape[0])

    # first perc% of the rows for training, the remainder for testing
    X_newgroups_train = X_newgroups_all_counts[:num_train,:]
    X_newgroups_test = X_newgroups_all_counts[num_train:,:]
    news_tfidf_transformer = TfidfTransformer()
    X_newsgroup_train_tfidf = news_tfidf_transformer.fit_transform(X_newgroups_train)
    # reuse the training IDF weights for the test rows (no refit on test data)
    X_newsgroup_test_tfidf = news_tfidf_transformer.transform(X_newgroups_test)
    Y_train = newsgroup_all.target[:num_train]
    Y_test = newsgroup_all.target[num_train:]

    # NB (hand-written implementation, alpha = 1)
    model = MultinomialNaiveBayes()
    model.fit(X_newsgroup_train_tfidf,Y_train, 1)
    y_prob = model.predict(X_newsgroup_test_tfidf)
    y_pred = np.argmax(y_prob, 1)
    acc =  evaluate_acc(y_pred, Y_test)
    print('NB ', perc, '=', acc)
    NBlist.append(acc)

    # LR standard
    lrmodel = LogisticRegression().fit(X_newsgroup_train_tfidf, Y_train)
    y_pred = lrmodel.predict(X_newsgroup_test_tfidf)
    acc =  evaluate_acc(y_pred, Y_test)
    print('LR ', perc, '=', acc)
    LRlist.append(acc)
    
    # LR with fixed penalty/solver settings
    lroptmodel = LogisticRegression(multi_class= 'multinomial', penalty= 'none', solver= 'saga')
    lroptmodel.fit(X_newsgroup_train_tfidf, Y_train)
    y_pred = lroptmodel.predict(X_newsgroup_test_tfidf)
    acc =  evaluate_acc(y_pred, Y_test)
    print('LR opt', perc, '=', acc)
    LRoptlist.append(acc)

    # stochastic gradient descent classifier
    gdmodel = SGDClassifier().fit(X_newsgroup_train_tfidf, Y_train)
    y_pred = gdmodel.predict(X_newsgroup_test_tfidf)
    acc =  evaluate_acc(y_pred, Y_test)
    print('GD ', perc, '=', acc)
    GDlist.append(acc)

    # linear SVC
    svcmodel = svm.LinearSVC().fit(X_newsgroup_train_tfidf, Y_train)
    y_pred = svcmodel.predict(X_newsgroup_test_tfidf)
    acc =  evaluate_acc(y_pred, Y_test)
    print('SVC ', perc, '=', acc)
    SVClist.append(acc)

  # accuracy of every model against the training percentage
  plt.plot(train_perc, NBlist, 'r-', label='Multinomial Naive Bayes',alpha=0.7)
  plt.plot(train_perc, LRlist, 'b-', label='Default Logistic Regression',alpha=0.7)
  plt.plot(train_perc, LRoptlist, 'k-', label='Optimized Logistic Regression',alpha=0.7)
  plt.plot(train_perc, GDlist, 'y-', label='Gradient Descent Classifier',alpha=0.7)
  plt.plot(train_perc, SVClist, 'g-', label='SVC',alpha=0.7)
  plt.xticks(np.arange(min(train_perc), max(train_perc)+1, 5.0))
  plt.xlabel('Percent Training Data')
  plt.ylabel('Accuracy')
  plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
  plt.show()

# Dataset 2: IMDB Reviews


## Task 1: Acquiring and preprocessing



### Import Data

Mounting drive

In [None]:
# Mount Google Drive at /content/drive (Colab only), forcing a remount if it
# is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Extract from Tar file

In [None]:
import tarfile
# Unpack the IMDB archive from the mounted Drive into the current working
# directory; the context manager closes the archive even if extraction fails.
with tarfile.open("/content/drive/MyDrive/aclImdb_v1.tar") as tf:
  tf.extractall()
print('done extracting')

Load data

In [None]:
testfolder = '/content/aclImdb/test/'
trainfolder = '/content/aclImdb/train/'

def _read_reviews(folder):
  """Return the text of every file in folder, in os.listdir order."""
  reviews = list()
  for filename in os.listdir(folder):
    # Context manager closes the file even if reading fails; the explicit
    # encoding keeps the result independent of the platform's default locale.
    with open(os.path.join(folder, filename), encoding='utf-8') as review_file:
      reviews.append(review_file.read())
  return reviews

labels = list()

# get train first
posData = _read_reviews(trainfolder+'pos')
negData = _read_reviews(trainfolder+'neg')

# negatives first, then positives, so the labels below line up with the data
trainData = negData + posData

# neg = 0, pos = 1
trainLabel = np.array([0]*len(negData) + [1]*len(posData))

# get test second
posData = _read_reviews(testfolder+'pos')
negData = _read_reviews(testfolder+'neg')

testData = negData + posData

# neg = 0, pos = 1
testLabel = np.array([0]*len(negData) + [1]*len(posData))

# train documents followed by test documents, with matching labels
allData = trainData+testData
allLabel = np.concatenate((trainLabel, testLabel), axis=0)

print(len(trainData), len(testData)) 
print(len(allData))

Create bundle (so matches format of dataset 1)

In [None]:
class bundle:
  """Plain container exposing data, target, target_names and labels attributes.

  ``labels`` starts out as None; the other three attributes hold the
  constructor arguments unchanged.
  """

  def __init__(self, data, target, target_names):
    (self.data,
     self.target,
     self.target_names,
     self.labels) = data, target, target_names, None

#IMDB_train = bundle(trainData, trainLabel, ['neg', 'pos'])
#IMDB_test = bundle(testData, testLabel, ['neg', 'pos'])

# Wrap the combined train+test reviews and labels so the IMDB data exposes the
# same .data / .target attributes as the newsgroups dataset object.
IMDB_all = bundle(allData,allLabel,['neg','pos'])

# have 4 attributes: data, target (0, 1), target_names (neg and pos), and predicted labels

## Task 2: Implementing Cross-Validation on our classifiers

###Naive Bayes Cross Validation

In [None]:
# Vectorize and split the IMDB reviews, then cross-validate the hand-written
# Multinomial Naive Bayes model over the values listed in kfoldCV.
x_test, y_test, x_rest, y_rest = cross_validation_split(IMDB_all)
#Remember to set the is_naive_bayes boolean to true or false! Remember to reset test train splits!
kfoldCV(x_test, y_test, x_rest, y_rest, MultinomialNaiveBayes())


### Logistic Regression Cross-Validation

In [None]:
# Re-split the IMDB reviews (a fresh random shuffle) and run the same
# cross-validation routine with scikit-learn's logistic regression.
x_test, y_test, x_rest, y_rest = cross_validation_split(IMDB_all)
kfoldCV(x_test, y_test, x_rest, y_rest, linear_model.LogisticRegression())

**Train Best Model**

In [None]:
# Fit Naive Bayes on the whole IMDB training part with smoothing alpha = 0.1
# and measure accuracy on the held-out test part from the most recent split.
best_md2 = MultinomialNaiveBayes()
best_md2 = best_md2.fit(x_rest,y_rest,0.1)
# predict returns per-class probabilities; argmax picks the predicted class
y_prob2 = best_md2.predict(x_test)
y_pred2 = np.argmax(y_prob2, 1)

accuracy2 = np.sum(y_pred2 == y_test)/y_pred2.shape[0]
accuracy2

## Task 3: Experimentation

###Find optimal hyperparameter for Logistic Regression

In [None]:
# Grid search over penalty/solver for logistic regression on a random 50/50
# split of the IMDB reviews.
import warnings

# Word-count matrix for all IMDB reviews. X_allData is not defined in any
# earlier cell of this notebook, so it is built here, mirroring how
# X_newgroups_all_counts is built for dataset 1.
IMDB_vectorizer = CountVectorizer()
X_allData = IMDB_vectorizer.fit_transform(IMDB_all.data)

N = X_allData.shape[0]
num_train = round(50/100 * N) # 50-50 split

inds = np.random.permutation(N) #random selection (otherwise, sometimes only get one class)

X_IMDB_train = X_allData[inds[:num_train],:]  
X_IMDB_test = X_allData[inds[num_train:],:]   
IMDB_tfidf_transformer = TfidfTransformer()
X_IMDB_train_tfidf = IMDB_tfidf_transformer.fit_transform(X_IMDB_train)
# Reuse the IDF weights learned on the training rows; refitting on the test
# rows would put the two sets in differently weighted feature spaces.
X_IMDB_test_tfidf = IMDB_tfidf_transformer.transform(X_IMDB_test)
Y_train = IMDB_all.target[inds[:num_train]]
Y_test = IMDB_all.target[inds[num_train:]]


with warnings.catch_warnings():
    warnings.simplefilter('ignore') #ignores warnings
    # l1 = lasso, l2 = ridge. "bernoulli" was dropped from multi_class: it is
    # not an option LogisticRegression accepts, so those candidates could
    # never produce a fitted model.
    grid={"penalty":["l1","l2", "elasticnet", "none"], "solver": ["sag", "saga"], "multi_class": ["multinomial"]}
    logreg=LogisticRegression()
    # 5-fold cross-validated search on the training half only
    logreg_cv=GridSearchCV(logreg,grid,cv=5)
    logreg_cv.fit(X_IMDB_train_tfidf, Y_train)
    print("tuned hyperparameters :(best parameters) ",logreg_cv.best_params_)
    print("accuracy :",logreg_cv.best_score_)

###Different Train/Test split

In [None]:
###Different Train/Test split
# Compare classifiers on the IMDB reviews for several train/test splits.
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB
import warnings

def evaluate_acc(pred, label):
  """Return the fraction of entries in pred that equal label."""
  # Size the denominator from the argument, not from the global y_pred.
  pred = np.asarray(pred)
  accuracy = np.sum(pred == np.asarray(label))/pred.shape[0]
  return accuracy

with warnings.catch_warnings():
  warnings.simplefilter('ignore') #ignores warnings
  train_perc = [20, 40, 60, 80, 90, 95]
  # one accuracy per entry of train_perc, for each model
  NBlist = list()
  Bernlist = list()
  LRoptlist = list()
  LRlist = list()
  GDlist = list()
  SVClist = list()
  for perc in train_perc:
    # get # train instances (from percent)
    N = X_allData.shape[0]
    num_train = round(perc/100 * N)

    inds = np.random.permutation(N) #random selection (otherwise, sometimes only get one class)

    # perc% of the shuffled rows for training, the remainder for testing
    X_IMDB_train = X_allData[inds[:num_train],:]
    X_IMDB_test = X_allData[inds[num_train:],:]
    IMDB_tfidf_transformer = TfidfTransformer()
    X_IMDB_train_tfidf = IMDB_tfidf_transformer.fit_transform(X_IMDB_train)
    # reuse the training IDF weights for the test rows (no refit on test data)
    X_IMDB_test_tfidf = IMDB_tfidf_transformer.transform(X_IMDB_test)
    Y_train = IMDB_all.target[inds[:num_train]]
    Y_test = IMDB_all.target[inds[num_train:]]

    # Multinomial NB (hand-written implementation, alpha = 1)
    model = MultinomialNaiveBayes()
    model.fit(X_IMDB_train_tfidf,Y_train, 1)
    y_prob = model.predict(X_IMDB_test_tfidf)
    y_pred = np.argmax(y_prob, 1)
    acc =  evaluate_acc(y_pred, Y_test)
    print('NB ', perc, '=', acc)
    NBlist.append(acc)

    # Bernoulli NB (scikit-learn)
    bern = BernoulliNB()
    bern.fit(X_IMDB_train_tfidf,Y_train)
    y_pred = bern.predict(X_IMDB_test_tfidf)
    acc =  evaluate_acc(y_pred, Y_test)
    print('Bernoulli ', perc, '=', acc)
    Bernlist.append(acc)

    # LR standard
    lrmodel = LogisticRegression().fit(X_IMDB_train_tfidf, Y_train)
    y_pred = lrmodel.predict(X_IMDB_test_tfidf)
    acc =  evaluate_acc(y_pred, Y_test)
    print('LR ', perc, '=', acc)
    LRlist.append(acc)

    # LR with fixed penalty/solver settings
    lroptmodel = LogisticRegression(multi_class='multinomial', penalty='l2', solver='sag')
    lroptmodel.fit(X_IMDB_train_tfidf, Y_train)
    y_pred = lroptmodel.predict(X_IMDB_test_tfidf)
    acc =  evaluate_acc(y_pred, Y_test)
    print('LR opt', perc, '=', acc)
    LRoptlist.append(acc)
    
    # stochastic gradient descent classifier
    gdmodel = SGDClassifier().fit(X_IMDB_train_tfidf, Y_train)
    y_pred = gdmodel.predict(X_IMDB_test_tfidf)
    acc =  evaluate_acc(y_pred, Y_test)
    print('GD ', perc, '=', acc)
    GDlist.append(acc)

    # linear SVC
    svcmodel = svm.LinearSVC().fit(X_IMDB_train_tfidf, Y_train)
    y_pred = svcmodel.predict(X_IMDB_test_tfidf)
    acc =  evaluate_acc(y_pred, Y_test)
    print('SVC ', perc, '=', acc)
    SVClist.append(acc)

  # accuracy of every model against the training percentage
  plt.plot(train_perc, NBlist, 'r-', label='Multinomial Naive Bayes',alpha=0.7)
  plt.plot(train_perc, Bernlist, 'p-', label='Bernoulli',alpha=0.7)
  plt.plot(train_perc, LRoptlist, 'k-', label='Optimized Logistic Regression',alpha=0.7)
  plt.plot(train_perc, LRlist, 'b-', label='Default Logistic Regression',alpha=0.7)
  plt.plot(train_perc, GDlist, 'y-', label='Gradient Descent Classifier',alpha=0.7)
  plt.plot(train_perc, SVClist, 'g-', label='SVC',alpha=0.7)
  plt.xticks(np.arange(min(train_perc), max(train_perc)+1, 5.0))
  plt.xlabel('Percent Training Data')
  plt.ylabel('Accuracy')
  plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
  plt.show()