In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("IMDB Dataset.csv",delimiter = ",")
df = df.sample(frac=1).reset_index(drop=True)

In [3]:
### Some potential improvements: replace punctuation with an extra space, to split hyphenated words
### or e.g. hot/cold into two words. Shouldn't be too important.

def formatStrings(df):
    newdf = df.copy()
    for index, string in enumerate(df["review"]): 
        tempString = string.replace('<br /><br />', ' ')
        tempString = ''.join(c for c in tempString if c.isalpha() or (c == " ") or (c == "'"))
        tempString = tempString.lower()
        newdf["review"].iloc[index] = tempString
    return newdf

def splitStrings(df):
    newdf = df.copy()
    for index, string in enumerate(df["review"]): 
        newdf["review"].iloc[index] = string.split(" ")
        
    return newdf

In [4]:
formattedDF = formatStrings(df)
splitDF = splitStrings(formattedDF)

In [5]:
train = splitDF.sample(frac=0.8) 
test = splitDF.drop(train.index)

In [6]:
def createWordTotalsDct(df):
    newdf = df.copy()
    wordTotalsDct = {}
    for index, stringList in enumerate(df["review"]): 
        for string in stringList:
            try:
                wordTotalsDct[string] +=1
            except KeyError:
                wordTotalsDct.update({string:1})
                
    return wordTotalsDct

wordTotalsDct = createWordTotalsDct(train)

In [7]:
def genTheta(df, dct):
    
    keys = dct.keys()
    
    thetaDct = {key:[0,0] for key in keys}
    thetaNumeratorDct = {key:[0,0] for key in keys}
    
    d = len(dct)
    dctList = list(dct)
    c = 2

    thetaDenominatorSum = np.zeros((c))
    
    for index, row in df.iterrows(): 
        sentiment = row["sentiment"]
        stringArray = row["review"]
        
        if sentiment == "positive":
            y = 1
        else:
            y = 0
        
        thetaDenominatorSum[y] += len(stringArray)
        
        alpha = 0
        
        for string in stringArray:
            if string in keys:
                thetaNumeratorDct[string][y] += 1
                
    for key in keys:
        for y in range(0,c):
            thetaDct[key][y] = thetaNumeratorDct[key][y]/thetaDenominatorSum[y] # no smoothing
            
    return thetaDct
        

In [9]:
def getFeatureDct(row, thetaDct):

    keys = thetaDct.keys()
    
    featureDct = {key:0 for key in keys}
    stringArray = row["review"]
    
    for string in stringArray:
        if string in keys:
            featureDct[string] += 1
        
    return featureDct

def getMaxProb(featureDct, logThetaDct):
    
    logProbSums = [0,0]
    for key, x_alpha in featureDct.items():
        for y in range(0,c):     
            logProbSums[y] += x_alpha * logThetaDct[key][y]
    
    argMaxProb = np.argmax(logProbSums)
    
    return argMaxProb

def testNaiveBayes(test, thetaDct):
    
    correct = 0
    incorrect = 0
    
    correctIndices = []
    incorrectIndices = []
    
    logThetaDct = {key:np.log(value) for key, value in thetaDct.items()}
    
    for index, row in test.iterrows():
        
        stringSentiment = row["sentiment"]
        
        if stringSentiment == "positive":
            trueSentiment = 1
        else:
            trueSentiment = 0
        
        featureDct = getFeatureDct(row, thetaDct)
        predictedSentiment = getMaxProb(featureDct, logThetaDct)
        
        if trueSentiment == predictedSentiment:
            correct += 1
            correctIndices.append(index)
        else:
            incorrect += 1
            incorrectIndices.append(index)
    
    percScore = 100 * correct / (correct + incorrect)
    
    return percScore, correctIndices, incorrectIndices



### Naive Bayes - taking d most frequently occuring words 

In [11]:
c = 2
dList = [200, 500, 1000, 2000, 3000, 5000]

scoresList = []
correctIndicesList = []
incorrectIndicesList = []

dct = {key: wordTotalsDct[key] for key in sorted(wordTotalsDct, key=wordTotalsDct.get, reverse=True)[:max(dList)]}
thetaDct = genTheta(train, dct)

print("Finished calculating theta!")

for d in dList:
    
    truncThetaDct = {key: thetaDct[key] for key in sorted(wordTotalsDct, key=wordTotalsDct.get, reverse=True)[:d]}
    percScore, correctIndices, incorrectIndices = testNaiveBayes(test, truncThetaDct)
    
    scoresList.append(percScore)
    correctIndicesList.append(correctIndices)
    incorrectIndicesList.append(incorrectIndices)
    
    print("Bag-of-words dimension d = {} scored {}%".format(d, percScore))

Finished calculating theta!
Bag-of-words dimension d = 200 scored 67.0%
Bag-of-words dimension d = 500 scored 73.62%
Bag-of-words dimension d = 1000 scored 76.87%
Bag-of-words dimension d = 2000 scored 80.09%
Bag-of-words dimension d = 3000 scored 81.14%
Bag-of-words dimension d = 5000 scored 82.16%


### Naive Bayes - taking d "best" words

In [12]:
dList = [50,100,150,200,250,500]


def featureOptimiser(val):
        return max(val[1][1]/val[1][0],val[1][0]/val[1][1])

for d in dList:
    optimiseThetaDct = sorted(thetaDct.items(), key=featureOptimiser, reverse=True)[:d]
    optimisedFeaturesDct = {key: value for key, value in optimiseThetaDct}
    
    percScore, correctIndices, incorrectIndices = testNaiveBayes(test, optimisedFeaturesDct)
    print("Bag-of-words dimension d = {} scored {}%".format(d, percScore))


Bag-of-words dimension d = 50 scored 53.92%
Bag-of-words dimension d = 100 scored 60.1%
Bag-of-words dimension d = 150 scored 68.59%
Bag-of-words dimension d = 200 scored 70.86%
Bag-of-words dimension d = 250 scored 74.56%
Bag-of-words dimension d = 500 scored 81.26%


### Naive Bayes - taking d "best" words that surpass some frequency threshold

In [13]:
dList = [50,100,150,200,250,500]


def featureOptimiser(val):
    if max(val[1]) < 1*10**(-4):
        return -1
    else:
        return max(val[1][1]/val[1][0],val[1][0]/val[1][1])

for d in dList:
    optimiseThetaDct = sorted(thetaDct.items(), key=featureOptimiser, reverse=True)[:d]
    optimisedFeaturesDct = {key: value for key, value in optimiseThetaDct}
    
    percScore, correctIndices, incorrectIndices = testNaiveBayes(test, optimisedFeaturesDct)
    print("Bag-of-words dimension d = {} scored {}%".format(d, percScore))


Bag-of-words dimension d = 50 scored 67.27%
Bag-of-words dimension d = 100 scored 77.99%
Bag-of-words dimension d = 150 scored 81.19%
Bag-of-words dimension d = 200 scored 81.31%
Bag-of-words dimension d = 250 scored 81.06%
Bag-of-words dimension d = 500 scored 81.38%


In [14]:
print("The following reviews were classified incorrectly: \n")

for i in range(0,5):
    index = incorrectIndicesList[-1][i]
    print("Correct classification would have been: {}".format(df["sentiment"].iloc[index]))
    
    print(df["review"].iloc[index])
    print("\n")
    

The following reviews were classified incorrectly: 

Correct classification would have been: negative
There is not a single sympathetic character in this entire movie. Is it the lawyer played by Kenneth Branagh that we're supposed to be pulling for? Well, let's see -- we learn he's a sleazebag defense attorney who gets criminals off on technicalities. He treats his coworkers like cattle, gets them involved in his own personal crisis (in the process, getting one of them killed), jeopardizes the safety of his kids, threatens his ex-wife's new boyfriend, tries to strong-arm the police and school administrators -- and all this for what? Because he was THINKING WITH HIS LITTLE HEAD! I was really pulling for the father and his gang to beat the stuffing out of the lawyer and drown him in the swamp...it would have made for a far more satisfying ending.


Correct classification would have been: positive
This picture for me scores very highly as it is a hugely enjoyable and amusing spoof of Alie