In [1]:
import pandas as pd 
import os 
import nltk 
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/advaitmarathe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Top-level dataset directory (relative path) that holds the <fold>/<sentiment> sub-directories.
folder = 'aclImdb'
# Sentiment sub-directory name -> numeric class label.
labels = dict(pos=1, neg=0)

In [3]:
# Placeholder frame; the loading cell below populates `df` with one row per review file.
df = pd.DataFrame()

In [4]:
# Read every review file and build the frame in ONE step.
# Growing a DataFrame with df.append() inside the loop copies the whole frame on
# every iteration (quadratic for tens of thousands of files), and DataFrame.append
# was removed in pandas 2.x. Collecting plain records first avoids both problems
# and makes the cell idempotent (re-running it no longer duplicates rows).
records = []
for fold in ['test', 'train']:
    for sent in ['pos', 'neg']:
        path = os.path.join(folder, fold, sent)
        for fname in os.listdir(path):
            with open(os.path.join(path, fname), 'r', encoding='utf-8') as infile:
                # One record per file: [review text, numeric sentiment label].
                records.append([infile.read(), labels[sent]])

# Columns keep the default integer labels 0 (text) and 1 (label); they are named in a later cell.
df = pd.DataFrame(records)


In [13]:
# Inspect the column labels of the freshly loaded frame.
df.columns

Int64Index([0, 1], dtype='int64')

In [11]:
# The column labels are the integers 0 and 1, not the strings '0' and '1'.
# rename() ignores mapping keys that match no column by default, so the string
# keys silently renamed nothing. Use the integer labels, and the same names the
# rest of the notebook relies on ('Review', 'Sentiment').
df = df.rename(columns={0: 'Review', 1: 'Sentiment'})


In [14]:
# Name the two columns: the review text and its 0/1 sentiment label.
df.columns = ['Review','Sentiment']

In [15]:
# Preview the labelled frame.
df

Unnamed: 0,Review,Sentiment
0,"Based on an actual story, John Boorman shows t...",1
1,This is a gem. As a Film Four production - the...,1
2,"I really like this show. It has drama, romance...",1
3,This is the best 3-D experience Disney has at ...,1
4,"Of the Korean movies I've seen, only three had...",1
...,...,...
49995,"My comments may be a bit of a spoiler, for wha...",0
49996,"The ""saucy"" misadventures of four au pairs who...",0
49997,"Oh, those Italians! Assuming that movies about...",0
49998,Eight academy nominations? It's beyond belief....,0


In [16]:
# Persist the labelled reviews (row index omitted) so later cells can start from the CSV.
df.to_csv("movie_data.csv", encoding='utf-8', index=False)

The cells above read the raw review files from the `aclImdb` folder and save them to `movie_data.csv`. The cells below start from that CSV file.

In [2]:
# Reload the reviews from the CSV written earlier instead of re-reading the raw files.
df = pd.read_csv("movie_data.csv")

In [3]:
# Preview the frame loaded from the CSV.
df

Unnamed: 0,Review,Sentiment
0,"Based on an actual story, John Boorman shows t...",1
1,This is a gem. As a Film Four production - the...,1
2,"I really like this show. It has drama, romance...",1
3,This is the best 3-D experience Disney has at ...,1
4,"Of the Korean movies I've seen, only three had...",1
...,...,...
49995,"My comments may be a bit of a spoiler, for wha...",0
49996,"The ""saucy"" misadventures of four au pairs who...",0
49997,"Oh, those Italians! Assuming that movies about...",0
49998,Eight academy nominations? It's beyond belief....,0


In [4]:
# Join every review into one space-separated string; word splitting happens in a later cell.
reviews = df['Review'].str.cat(sep=' ')

In [10]:
# Peek at the first 1000 characters of the concatenated corpus.
reviews[:1000]

'Based on an actual story, John Boorman shows the struggle of an American doctor, whose husband and son were murdered and she was continually plagued with her loss. A holiday to Burma with her sister seemed like a good idea to get away from it all, but when her passport was stolen in Rangoon, she could not leave the country with her sister, and was forced to stay back until she could get I.D. papers from the American embassy. To fill in a day before she could fly out, she took a trip into the countryside with a tour guide. "I tried finding something in those stone statues, but nothing stirred in me. I was stone myself." <br /><br />Suddenly all hell broke loose and she was caught in a political revolt. Just when it looked like she had escaped and safely boarded a train, she saw her tour guide get beaten and shot. In a split second she decided to jump from the moving train and try to rescue him, with no thought of herself. Continually her life was in danger. <br /><br />Here is a woman 

In [8]:
def preprocess(sentence):
    r"""Lowercase a text, split it into word tokens, and drop English stop words.

    Requires the NLTK 'stopwords' corpus to be available (it is downloaded in
    another cell of this notebook).

    Parameters
    ----------
    sentence : str
        Text to tokenize.

    Returns
    -------
    list of str
        The lowercased ``\w+`` matches of ``sentence``, in their original order,
        with every token found in NLTK's English stop-word list removed.
    """
    # Build the stop-word set once per call. Previously stopwords.words('english')
    # was evaluated inside the comprehension, i.e. once per token, and each
    # membership test scanned the resulting list linearly.
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence.lower())
    return [w for w in tokens if w not in stop_words]

KeyboardInterrupt: 

In [12]:
# Tokenize the whole concatenated corpus; punctuation marks come back as tokens of their own.
tokens = word_tokenize(reviews)

In [13]:
# Distinct tokens in the corpus, and how many there are.
vocabulary = set(tokens)
vocab_size = len(vocabulary)
print(vocab_size)

198763


In [30]:
# Ten most frequent tokens in the raw token list, highest count first.
frequency_dist = nltk.FreqDist(tokens)
sorted(frequency_dist, key=lambda word: frequency_dist[word], reverse=True)[:10]

['the', ',', '.', 'a', 'and', 'of', 'to', 'is', '/', '>']

In [36]:
# Fetch NLTK's stop-word lists; stopwords.words('english') elsewhere in this notebook needs them.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/advaitmarathe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
from nltk.corpus import stopwords
import string  # NOTE(review): `string` is not referenced in the visible cells
# Keep only purely alphabetic tokens that are not English stop words
# (the stop-word comparison is case-insensitive).
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if w.isalpha() and w.lower() not in stop_words]

In [26]:
# Ten most frequent tokens after stop-word and non-alphabetic filtering, highest count first.
frequency_dist2 = nltk.FreqDist(tokens)
sorted(frequency_dist2, key=lambda word: frequency_dist2[word], reverse=True)[:10]

['movie',
 'film',
 'one',
 'like',
 'good',
 'would',
 'time',
 'really',
 'see',
 'even']

In [29]:
# Number of distinct tokens left after filtering.
len(set(tokens))

123451

In [32]:
# Split by row label: 0..24999 for training, 25000 onwards for testing
# (label-based .loc slicing includes both end points).
# NOTE(review): the CSV was written with the 'test' fold first and is never shuffled,
# so rows 0..24999 are the files read from aclImdb/test. Confirm this swap is intended.
train_rows = df.loc[:24999]
test_rows = df.loc[25000:]
X_train, y_train = train_rows['Review'].values, train_rows['Sentiment'].values
X_test, y_test = test_rows['Review'].values, test_rows['Sentiment'].values

In [33]:
#import TF-IDF vectorizer turning the sentences into word vectors based on TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
# Fit the TF-IDF vectorizer on the training reviews only, then apply the fitted
# vectorizer unchanged to the test reviews.
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
train_shape, test_shape = train_vectors.shape, test_vectors.shape
print(train_shape, test_shape)

(25000, 73822) (25000, 73822)


In [35]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the TF-IDF training vectors.
clf = MultinomialNB()
clf = clf.fit(train_vectors, y_train)

In [36]:
from sklearn.metrics import accuracy_score
# Score the classifier: compare predictions on the test vectors with the true labels.
predicted = clf.predict(test_vectors)
accuracy = accuracy_score(y_test, predicted)
print(accuracy)

0.83664
