In [None]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA as sklearnPCA
from sklearn.naive_bayes import BernoulliNB

In [3]:
#Read the input files and read every line
def loadData(trainingFile, testingFile, num_features=100000):
    """Load the training and test files as dense 0/1 feature matrices.

    A training line has the form ``<label><TAB><i1> <i2> ...`` and a test
    line has the form ``<i1> <i2> ...``; every ``i`` is the 1-based index of
    a feature that is set for that record.

    Parameters
    ----------
    trainingFile : str
        Path of the labelled training file.
    testingFile : str
        Path of the unlabelled test file.
    num_features : int, optional
        Width of the feature space (number of columns), 100000 by default.

    Returns
    -------
    train_data : pandas.DataFrame
        One row per training record, columns ``0 .. num_features-1``; a cell
        is 1 when the (1-based) feature index appeared in the record.
    test_data : pandas.DataFrame
        Same layout for the test records. Every line of the test file yields
        one row (a blank line yields an all-zero row) so that predictions
        stay aligned with the lines of the file.
    train_labels : numpy.ndarray
        The label text of every training record, in file order.

    Raises
    ------
    ValueError
        If a training line has no tab separator, a token is not an integer,
        or a feature index lies outside ``1 .. num_features``.
    """

    def convertDataframe(inputFile):
        """Convert a sequence of space-separated index strings to a DataFrame."""
        records = list(inputFile)

        # Fill one pre-allocated matrix instead of growing a DataFrame row by
        # row: enlarging with ``data.loc[i] = ...`` reallocates on every row.
        matrix = np.zeros((len(records), num_features), dtype=np.int8)

        for row, text in enumerate(records):
            indices = np.array([int(token) for token in text.split()], dtype=np.intp)
            if indices.size == 0:
                continue
            # Reject out-of-range indices explicitly; index 0 would otherwise
            # wrap around to the last column after the 1-based shift.
            if indices.min() < 1 or indices.max() > num_features:
                raise ValueError(
                    "record %d: feature index outside 1..%d" % (row, num_features)
                )
            matrix[row, indices - 1] = 1

        return pd.DataFrame(matrix, columns=range(num_features))

    # Split each training line into label and data
    train_data_list = []
    train_labels_list = []

    with open(trainingFile, "r") as fr1:
        for line_number, inputData in enumerate(fr1, start=1):
            # A blank line carries no label, so it cannot be a training record.
            if not inputData.strip():
                continue

            # Split on the first tab only, rather than deleting every
            # occurrence of "0<TAB>" / "1<TAB>" in the line.
            label, tab, features = inputData.partition("\t")
            if not tab:
                raise ValueError(
                    "training line %d: expected '<label><TAB><indices>'" % line_number
                )

            train_labels_list.append(label.strip())
            train_data_list.append(features)

    train_labels = np.asarray(train_labels_list)
    train_data = convertDataframe(train_data_list)

    with open(testingFile, "r") as fr2:
        test_data = convertDataframe(fr2.readlines())

    return train_data, test_data, train_labels

# Project data on a reduced dimensionality k using PCA
def pca(train_data, test_data, k, random_state=None):
    """Project the training and test data onto ``k`` principal components.

    The PCA model is fitted on ``train_data`` only; ``test_data`` is then
    transformed with that same fitted model.

    Parameters
    ----------
    train_data : array-like
        Training feature matrix used to fit the projection.
    test_data : array-like
        Test feature matrix, projected with the fitted model.
    k : int
        Number of components, passed to PCA as ``n_components``.
    random_state : int or None, optional
        Passed through to the PCA estimator so a run can be made
        reproducible. The default ``None`` leaves the estimator unseeded,
        as before.

    Returns
    -------
    tuple
        ``(PCA_projected_trainData, PCA_projected_testData)``.
    """
    # Named ``reducer`` so the local does not shadow this function's own name.
    reducer = sklearnPCA(n_components=k, random_state=random_state)
    PCA_projected_trainData = reducer.fit_transform(train_data)
    PCA_projected_testData = reducer.transform(test_data)

    return PCA_projected_trainData, PCA_projected_testData

#Perform Bernoulli's Naive Bayes Classification
def classifier(PCA_projected_trainData, PCA_projected_testData, train_labels):
    """Fit a Bernoulli Naive Bayes model and predict the test labels.

    The model is created with default settings, fitted on
    ``PCA_projected_trainData`` / ``train_labels``, and then asked to predict
    ``PCA_projected_testData``; the predictions are returned as produced by
    the model's ``predict`` method.

    NOTE(review): per the scikit-learn documentation BernoulliNB binarizes
    its inputs (default threshold 0.0), so the continuous PCA projections
    are presumably reduced to sign indicators here — confirm this is intended.
    """
    model = BernoulliNB()
    model.fit(PCA_projected_trainData, train_labels)
    return model.predict(PCA_projected_testData)

In [4]:
#Read the training and test data sets: two 0/1 feature DataFrames (train, test) plus the array of training labels
train_data, test_data, train_labels = loadData('train.dat', 'test.dat')

In [5]:
#Preview the first rows of the training feature matrix
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
#Reduce the number of dimensions from 100000 to 500 using PCA
PCA_projected_trainData, PCA_projected_testData = pca(train_data, test_data, 500)

In [7]:
#Fit a Bernoulli Naive Bayes classifier on the projected training data and predict labels for the projected test data
predictions = classifier(PCA_projected_trainData, PCA_projected_testData, train_labels )

In [None]:
#Write the result to a .dat file, one prediction per line
# NOTE(review): the file name says k=100 but the pca(...) call in this notebook passes 500 — confirm which is intended.
# The context manager closes the file even if writing raises part-way through.
with open('output-k-100-PCA-BNBC.dat', 'w') as output:
    output.writelines("%s\n" % prediction for prediction in predictions)