In [1]:
import numpy as np
from numpy.linalg import inv
import csv
import math
import re
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import tree
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import f1_score

In [2]:
#Importing Data from Files
def import_dataset(file):
    """Load the raw train/valid/test splits of a labelled review corpus.

    Reads ``./Datasets/<file>-train.txt``, ``-valid.txt`` and ``-test.txt``.
    Every line is expected to look like ``<review text><TAB><label>``.  The
    line is lower-cased, the integer label is taken from the field after the
    last tab, and every character of the review text that is not an ASCII
    letter or whitespace is removed.

    Parameters
    ----------
    file : str
        Dataset prefix used to build the three file names.

    Returns
    -------
    tuple
        ``(raw_train, raw_valid, raw_test, Y_train, Y_valid, Y_test)``; the
        ``raw_*`` items are NumPy arrays of cleaned review strings and the
        ``Y_*`` items are NumPy integer arrays of labels.

    Raises
    ------
    ValueError
        If the label field of a line cannot be parsed as an integer.
    """
    def _load_split(path):
        # Parse one split file into (cleaned reviews, integer labels).
        reviews, labels = [], []
        # The context manager closes the file even if parsing fails.
        with open(path) as handle:
            for line in handle:
                # Split on the last tab rather than slicing a fixed number of
                # characters, so a final line without a trailing newline is
                # parsed the same way as every other line.
                text, _, label = line.lower().rstrip('\n').rpartition('\t')
                labels.append(int(label))
                reviews.append(re.sub(r'[^A-Za-z\s]+', '', text))
        return np.array(reviews), np.array(labels, dtype='int')

    prefix = "./Datasets/" + str(file)
    raw_train, Y_train = _load_split(prefix + "-train.txt")
    raw_valid, Y_valid = _load_split(prefix + "-valid.txt")
    raw_test, Y_test = _load_split(prefix + "-test.txt")

    return raw_train, raw_valid, raw_test, Y_train, Y_valid, Y_test

In [3]:
#Importing Data from files created by Bag of Words or Frequency Bag of Words
def import_created_dataset(file):
    """Reload previously exported feature and label matrices from CSV files.

    Reads ``./Created_Datasets/<file>_X_{train,valid,test}.csv`` and
    ``./Created_Datasets/<file>_Y_{train,valid,test}.csv``.

    Parameters
    ----------
    file : str
        Dataset prefix used to build the six file names.

    Returns
    -------
    tuple
        ``(X_train, X_valid, X_test, Y_train, Y_valid, Y_test)``.  Each item
        is ``np.array`` applied to the list of rows produced by
        ``csv.reader``, so the cells are strings exactly as they appear in
        the file; no numeric conversion is done here.
    """
    def _read_rows(path):
        # Read every comma-separated row of one file.
        # Text mode with newline='' is what the csv module requires on
        # Python 3; a binary handle makes csv.reader raise.
        with open(path, newline='') as handle:
            return list(csv.reader(handle, delimiter=','))

    prefix = "./Created_Datasets/" + str(file)
    splits = ("train", "valid", "test")

    # An empty file simply yields an empty array instead of failing.
    features = [np.array(_read_rows(prefix + "_X_" + split + ".csv")) for split in splits]
    labels = [np.array(_read_rows(prefix + "_Y_" + split + ".csv")) for split in splits]

    return tuple(features + labels)

In [4]:
def export_vocab_file(dataset, file_name):
    """Write the most frequent words of ``dataset`` to a vocabulary file.

    Reviews are split on single spaces, empty tokens are ignored, and at most
    the 10,000 most frequent remaining words are written, one per line, as
    ``<word> <TAB><rank><TAB><count>`` with ranks starting at 0.

    Parameters
    ----------
    dataset : iterable of str
        Review texts.
    file_name : str
        Path of the file to write.
    """
    tokens = []
    for row in dataset:
        tokens.extend(row.split(" "))

    # Drop empty tokens by value (not identity) *before* ranking so the cap
    # below always applies to real words.
    counts = Counter(token for token in tokens if token != '')
    ranked = counts.most_common(10000)

    lines = [
        str(word) + " \t" + str(rank) + "\t" + str(count)
        for rank, (word, count) in enumerate(ranked)
    ]
    np.savetxt(file_name, np.array(lines, dtype=str), fmt="%s")

In [5]:
# Method to find the 10,000 most frequent words
def get_most_frequent_words(dataset):
    """Return up to the 10,000 most frequent words of ``dataset``.

    Reviews are split on single spaces and empty tokens are ignored.

    Parameters
    ----------
    dataset : iterable of str
        Review texts.

    Returns
    -------
    list of str
        Words ordered from most to least frequent.
    """
    tokens = []
    for row in dataset:
        tokens.extend(row.split(" "))

    # Filter empty tokens by value before ranking so the result never exceeds
    # 10,000 entries, whether or not the corpus contains empty tokens.
    counts = Counter(token for token in tokens if token != '')
    MF_words = [word for word, word_count in counts.most_common(10000)]
    print("Returning 10,000 most frequent")
    return (MF_words)

In [6]:
def get_vocab(dataset, vocab_size=10000):
    """Build a word -> index vocabulary from the most frequent words.

    Reviews are split on whitespace.  The most frequent word gets index 0,
    the next one index 1, and so on.

    Parameters
    ----------
    dataset : iterable of str
        Review texts.
    vocab_size : int, optional
        Maximum number of words kept (default 10000).

    Returns
    -------
    dict
        Maps each kept word (``str``) to its integer rank.  The dict holds
        exactly one entry per kept word, so ``len()`` of the result is the
        real vocabulary size; an empty dataset gives an empty dict.
    """
    word_counter = Counter()
    for review in dataset:
        word_counter.update(review.split())

    # A plain dict keeps the keys as str (a fixed-width byte array would turn
    # them into bytes on Python 3 and truncate long words).
    return {
        word: rank
        for rank, (word, _) in enumerate(word_counter.most_common(vocab_size))
    }

In [7]:
def generate_dataset_review_indexes(dataset, file_name, vocab, y_values):
    """Write each review as a sequence of vocabulary ids followed by its label.

    Every output line has the form ``<id> <id> ... <TAB><label>``: each id is
    followed by one space, then a tab and ``str(y_values[i])``.  Words that
    are not in ``vocab`` are skipped.

    Parameters
    ----------
    dataset : sequence of str
        Review texts, split on whitespace.
    file_name : str
        Path of the file to write.
    vocab : mapping
        Maps a word to its id.
    y_values : sequence
        Label for each review, indexed like ``dataset``.
    """
    output_str = []
    for row in range(len(dataset)):
        id_tokens = []
        for word in dataset[row].split():
            # Only a missing word is expected here; any other error is a real
            # problem and is allowed to surface.
            try:
                word_id = vocab[word]
            except KeyError:
                continue
            id_tokens.append(str(word_id) + " ")

        output_str.append("".join(id_tokens) + "\t" + str(y_values[row]))

    np.savetxt(file_name, np.array(output_str), fmt="%s")

In [8]:
def binary_bag_of_words(dataset, size, vocab_size):
    """Return a presence/absence bag-of-words matrix.

    The matrix is built by ``frequency_bag_of_words`` with the same
    arguments; every non-zero entry of a copy of it is then set to 1.
    """
    counts = frequency_bag_of_words(dataset, size, vocab_size)
    presence = np.array(counts, copy=True)
    seen = presence != 0
    presence[seen] = 1
    print("Returning BBOW")
    return presence

In [9]:
def frequency_bag_of_words(dataset, size, vocab_size):
    """Build a count-based bag-of-words matrix from index-encoded reviews.

    Parameters
    ----------
    dataset : iterable of str
        Lines of the form ``<id> <id> ... <TAB><label>`` (the format written
        by ``generate_dataset_review_indexes``).  Only the part before the
        first tab is used.
    size : int
        Number of rows of the result.
    vocab_size : int
        Number of columns of the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(size, vocab_size)``; entry ``[r, id - 1]`` is
        how often ``id`` occurs in review ``r``.
    """
    fbow = np.zeros([size, vocab_size])
    for index, review in enumerate(dataset):
        # Take the id field by its tab separator instead of cutting a fixed
        # four characters, so lines without a trailing newline or with a
        # longer label are still parsed correctly.
        id_field = review.partition("\t")[0]
        word_ids = np.array(id_field.split(), dtype=int)
        ids, counts = np.unique(word_ids, return_counts=True)
        # Ids are shifted down by one, so an id of 0 lands in the last column
        # through negative indexing.
        # NOTE(review): confirm whether ids are meant to be 1-based here.
        fbow[index, ids - 1] = counts
    return fbow

In [10]:
#Writing Dataset to csv_file
def write_dataset_to_file(dataset, string):
    """Save ``dataset`` as ``./Created_Datasets/<string>.csv``.

    The file is comma-delimited and carries the ``.csv`` extension so that it
    is exactly what ``import_created_dataset`` opens and parses.  The target
    directory must already exist.

    Parameters
    ----------
    dataset : array_like
        1-D or 2-D numeric data accepted by ``np.savetxt``.
    string : str
        File name without directory or extension.
    """
    file_name_str = "./Created_Datasets/" + str(string) + ".csv"
    np.savetxt(file_name_str, dataset, delimiter=",")

In [11]:
#Creating random assignment predictions for YELP
def random_assignment_yelp(label_list):
    """Return one uniformly random Yelp rating per entry of ``label_list``.

    Ratings are drawn from 1..5 inclusive, matching the label set
    ``[1, 2, 3, 4, 5]`` used for Yelp scoring elsewhere in this notebook.

    Parameters
    ----------
    label_list : sized
        Only its length is used.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(label_list)``.
    """
    # randint's upper bound is exclusive, hence 6 to include rating 5.
    labels = np.random.randint(1, 6, size=len(label_list))
    return np.array(labels, dtype= float)

In [23]:
#Creating random assignment predictions for IMDB
def random_assignment_imdb(label_list):
    """Return one uniformly random 0/1 prediction per entry of ``label_list``.

    Only the length of ``label_list`` is used; the result is a float array.
    """
    n_predictions = len(label_list)
    coin_flips = np.random.randint(2, size=n_predictions)
    return coin_flips.astype(float)

In [13]:
#Creating majority class prediction for YELP
def majority_assignment_yelp(label_list):
    """Predict the most common label of ``label_list`` for every entry.

    The majority label is printed, then returned as a float array with one
    copy per entry of ``label_list``.
    """
    majority_label = Counter(label_list).most_common(1)[0][0]
    print(majority_label)
    constant_prediction = np.full(len(label_list), majority_label)
    return constant_prediction.astype(float)

In [None]:
###########################################################################################################################

In [None]:
# Load the Yelp splits, then wrap the three text splits in np.array.
(raw_train, raw_valid, raw_test,
 Y_train, Y_valid, Y_test) = import_dataset('yelp')
raw_train, raw_valid, raw_test = map(np.array, (raw_train, raw_valid, raw_test))

In [None]:
####### EXPORTING VOCABULARY FILE FOR YELP ###############
# One vocabulary file for all three splits together, then one per split.
all_reviews = np.append(raw_train, np.append(raw_valid , raw_test))
export_vocab_file(all_reviews, "./Vocab/yelp-vocab.txt")
for split_name, split_reviews in (("train", raw_train), ("valid", raw_valid), ("test", raw_test)):
    export_vocab_file(split_reviews, "./Vocab/yelp-" + split_name + "-vocab.txt")

In [None]:
######### Exporting review as Indexes FOR YELP #####################
# The vocabulary always comes from the training split, so build it once and
# reuse it for all three exports instead of recomputing it per call.
yelp_vocab = get_vocab(raw_train)
generate_dataset_review_indexes(raw_train, "./Generated_datasets/yelp-train.txt", yelp_vocab, Y_train)
generate_dataset_review_indexes(raw_valid, "./Generated_datasets/yelp-valid.txt", yelp_vocab, Y_valid)
generate_dataset_review_indexes(raw_test, "./Generated_datasets/yelp-test.txt", yelp_vocab, Y_test)

In [None]:
##################### GENERATING FREQUENCY BAG OF WORDS FOR YELP ##############################
# This cell builds count-based features (frequency_bag_of_words) from the
# index files in ./Generated_datasets.
# The vocabulary size is taken from the training split once and reused.
vocab_size = len(get_vocab(raw_train))

# Each index file is read inside a `with` block so the handle is closed.
with open("./Generated_datasets/yelp-train.txt","r") as raw_train_file:
    raw_train_data = raw_train_file.readlines()
X_train = frequency_bag_of_words(raw_train_data, len(raw_train), vocab_size)
X_train = np.array(X_train, dtype = 'float')

with open("./Generated_datasets/yelp-valid.txt","r") as raw_valid_file:
    raw_valid_data = raw_valid_file.readlines()
X_valid = frequency_bag_of_words(raw_valid_data, len(raw_valid), vocab_size)
X_valid = np.array(X_valid, dtype = 'float')

with open("./Generated_datasets/yelp-test.txt","r") as raw_test_file:
    raw_test_data = raw_test_file.readlines()
X_test = frequency_bag_of_words(raw_test_data, len(raw_test), vocab_size)
X_test = np.array(X_test, dtype = 'float')

# Labels are converted to float arrays alongside the features.
Y_train = np.array(Y_train, dtype = float)
Y_valid = np.array(Y_valid, dtype = float)
Y_test = np.array(Y_test, dtype = float)

In [None]:
# Persist the Yelp feature matrices and labels, split by split.
for file_stem, split_data in (
    ("yelp_X_train", X_train),
    ("yelp_Y_train", Y_train),
    ("yelp_X_valid", X_valid),
    ("yelp_Y_valid", Y_valid),
    ("yelp_X_test", X_test),
    ("yelp_Y_test", Y_test),
):
    write_dataset_to_file(split_data, file_stem)

In [None]:
# Replace the in-memory Yelp matrices with the ones stored in ./Created_Datasets.
(X_train, X_valid, X_test,
 Y_train, Y_valid, Y_test) = import_created_dataset("yelp")

In [None]:
#############################################################################################################################

In [14]:
# Load the IMDB splits, then wrap the three text splits in np.array.
(raw_train, raw_valid, raw_test,
 Y_train, Y_valid, Y_test) = import_dataset('IMDB')
raw_train, raw_valid, raw_test = map(np.array, (raw_train, raw_valid, raw_test))

In [15]:
####### EXPORTING VOCABULARY FILE FOR IMDB ###############
# One vocabulary file for all three splits together, then one per split.
all_reviews = np.append(raw_train, np.append(raw_valid , raw_test))
export_vocab_file(all_reviews, "./Vocab/IMDB-vocab.txt")
for split_name, split_reviews in (("train", raw_train), ("valid", raw_valid), ("test", raw_test)):
    export_vocab_file(split_reviews, "./Vocab/IMDB-" + split_name + "-vocab.txt")

In [16]:
######### Exporting review as Indexes FOR IMDB #####################
# The vocabulary always comes from the training split, so build it once and
# reuse it for all three exports instead of recomputing it per call.
imdb_vocab = get_vocab(raw_train)
generate_dataset_review_indexes(raw_train, "./Generated_datasets/IMDB-train.txt", imdb_vocab, Y_train)
generate_dataset_review_indexes(raw_valid, "./Generated_datasets/IMDB-valid.txt", imdb_vocab, Y_valid)
generate_dataset_review_indexes(raw_test, "./Generated_datasets/IMDB-test.txt", imdb_vocab, Y_test)

In [17]:
##################### GENERATING BINARY BAG OF WORDS FOR IMDB ##############################
# This cell builds presence/absence features (binary_bag_of_words) from the
# index files in ./Generated_datasets.
# The vocabulary size is taken from the training split once and reused.
vocab_size = len(get_vocab(raw_train))

# Each index file is read inside a `with` block so the handle is closed.
with open("./Generated_datasets/IMDB-train.txt","r") as raw_train_file:
    raw_train_data = raw_train_file.readlines()
X_train = binary_bag_of_words(raw_train_data, len(raw_train), vocab_size)
X_train = np.array(X_train, dtype = 'float')

with open("./Generated_datasets/IMDB-valid.txt","r") as raw_valid_file:
    raw_valid_data = raw_valid_file.readlines()
X_valid = binary_bag_of_words(raw_valid_data, len(raw_valid), vocab_size)
X_valid = np.array(X_valid, dtype = 'float')

with open("./Generated_datasets/IMDB-test.txt","r") as raw_test_file:
    raw_test_data = raw_test_file.readlines()
X_test = binary_bag_of_words(raw_test_data, len(raw_test), vocab_size)
X_test = np.array(X_test, dtype = 'float')

# Labels are converted to float arrays alongside the features.
Y_train = np.array(Y_train, dtype = float)
Y_valid = np.array(Y_valid, dtype = float)
Y_test = np.array(Y_test, dtype = float)

Returning BBOW
Returning BBOW
Returning BBOW


In [None]:
# Persist the IMDB feature matrices and labels, split by split.
for file_stem, split_data in (
    ("IMDB_X_train", X_train),
    ("IMDB_Y_train", Y_train),
    ("IMDB_X_valid", X_valid),
    ("IMDB_Y_valid", Y_valid),
    ("IMDB_X_test", X_test),
    ("IMDB_Y_test", Y_test),
):
    write_dataset_to_file(split_data, file_stem)

In [None]:
# Replace the in-memory IMDB matrices with the ones stored in ./Created_Datasets.
(X_train, X_valid, X_test,
 Y_train, Y_valid, Y_test) = import_created_dataset("IMDB")

In [None]:
###############################################################################################################################

In [None]:
# Decision Tree Classifier
# Sweep max_depth over 110, 120, ..., 190 with min_samples_leaf fixed at 8 and
# report the F-score on the validation split for each depth.
for i in range(1,10):
    max_depth = 100 + i*10
    tree_classifier = tree.DecisionTreeClassifier(max_depth= max_depth, min_samples_leaf= 8)
    tree_classifier = tree_classifier.fit(X_train, Y_train)
    tree_pred = tree_classifier.predict(X_valid)
    # Variant for the five Yelp rating classes:
    #tree_score = f1_score(Y_valid, tree_pred, average= 'micro', labels= [1,2,3,4,5])
    tree_score = f1_score(Y_valid, tree_pred)
    # The swept value is max_depth, so the label says so (min_samples_leaf is constant).
    score_str = "Max Depth = " + str(max_depth) + " F-Score: " + str(tree_score)
    print(score_str)

In [None]:
# SVM Classifier
# Fit one SVC per value of C in 1..9 and print its validation F-score.
for c_value in range(1, 10):
    svm_classifier = svm.SVC(C=c_value).fit(X_train, Y_train)
    svm_pred = svm_classifier.predict(X_valid)
    # Variant for the five Yelp rating classes:
    #svm_score = f1_score(Y_valid, svm_pred, average= 'micro', labels= [1,2,3,4,5])
    svm_score = f1_score(Y_valid, svm_pred)
    score_str = "C = " + str(c_value) + " F-Score: " + str(svm_score)
    print(score_str)

In [None]:
# Naive Bayes classifier (Bernoulli)
# Sweep the smoothing parameter alpha over 0..9 and report the F-score on the
# validation split for each value.
for i in range(0,10):
    nb_classifier = BernoulliNB(alpha= i)
    nb_classifier = nb_classifier.fit(X_train, Y_train)
    nb_pred = nb_classifier.predict(X_valid)
    # Variant for the five Yelp rating classes:
    #nb_score = f1_score(Y_valid, nb_pred, average= 'micro', labels= [1,2,3,4,5])
    nb_score = f1_score(Y_valid, nb_pred)
    # Print the alpha that was actually passed to the model (i, not i*10).
    score_str = "Alpha: " + str(i) + " F-Score: " + str(nb_score)
    print(score_str)

In [None]:
# Naive Bayes classifier (Gaussian)
# Single fit with default settings; prints the validation F-score.
nb_classifier = GaussianNB().fit(X_train, Y_train)
nb_pred = nb_classifier.predict(X_valid)
# Variant for the five Yelp rating classes: add average= 'micro', labels= [1,2,3,4,5]
nb_score = f1_score(Y_valid, nb_pred)
print(nb_score)

In [27]:
# Random-guess baseline: F-score of random 0/1 predictions on train, valid and test.
for split_labels in (Y_train, Y_valid, Y_test):
    random_predictions = random_assignment_imdb(split_labels)
    print(f1_score(split_labels, random_predictions))

0.50260173449
0.500747831289
0.501238118061
