In [15]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

filelist = ['yelp_labelled.txt','imdb_labelled.txt','amazon_cells_labelled.txt']
data = [pd.read_table(file, sep ='\t',header=None,names=['sentence','class']) for file in filelist]

big_data = pd.concat(data)

big_data['sentence'] = big_data['sentence'].apply(nltk.word_tokenize)          #performing word tokenization

In [None]:
#performing word normalization

big_data['sentence'] = big_data.sentence.map(lambda x: x.lower())            #converting all words to lowercase
big_data['sentence'] = big_data.sentence.str.replace('[^\w\s]', '')           #removing punctuations from sentences


big_data['sentence'] = big_data['sentence'].apply(nltk.word_tokenize)          #performing word tokenization

#performing word stemming
stemmer = PorterStemmer()                                                        
big_data['sentence'] = big_data['sentence'].apply(lambda x: [stemmer.stem(y) for y in x]) 

In [16]:
#This section is used for feature extraction
# This converts the list of words into space-separated strings
big_data['sentence'] = big_data['sentence'].apply(lambda x: ' '.join(x))

print(big_data['sentence'])

vectorizer = CountVectorizer()                                             #an algorithm for extracting the features
counts = vectorizer.fit_transform(big_data['sentence'])


#this is for assigning weights to words in terms of frequency and importance
transformer = TfidfTransformer().fit(counts)
counts = transformer.transform(counts) 


0                             Wow ... Loved this place .
1                                    Crust is not good .
2             Not tasty and the texture was just nasty .
3      Stopped by during the late May bank holiday of...
4      The selection on the menu was great and so wer...
5        Now I am getting angry and I want my damn pho .
6               Honeslty it did n't taste THAT fresh . )
7      The potatoes were like rubber and you could te...
8                             The fries were great too .
9                                        A great touch .
10                             Service was very prompt .
11                                   Would not go back .
12     The cashier had no care what so ever on what I...
13     I tried the Cape Cod ravoli , chicken , with c...
14     I was disgusted because I was pretty sure that...
15     I was shocked because no signs indicate cash o...
16                                  Highly recommended .
17               Waitress was a

In [13]:
#vectorizing the test file
testfile = 'test_sentences.txt'

test_data = pd.read_table(testfile, sep='\t',header=None,names=['sentence'])

test_data['sentence'] = test_data['sentence'].apply(nltk.word_tokenize)

test_data['sentence'] = test_data['sentence'].apply(lambda x: ' '.join(x))

#an algorithm for extracting the features
newcount = vectorizer.fit_transform(test_data['sentence'])


0                 This GPS tracker works like a charm .
1     When I opened the box the product was not in t...
2          Everyone should have one who owns a computer
3                                    Buy something else
4     Pure junk do not buy ever the greatest load of...
5        The DataVac was used and full of dust and dirt
6     Not so great ... bought to clean the bobbin ca...
7     It is a great size , I keep it in my desk draw...
8     I just bought this Vacuum . It 's just good fo...
9     This is just perfect for vacuuming out the lin...
10    I use it mostly to vacuum threads on the sewin...
11    I have found this mini vac . to be everything ...
12    I ordered the Pork Prime Rib Chop it was beaut...
13    A bastion of fine dining in The City for 20 ye...
14    Took my brand new bmw in for service . When I ...
15    I 'll never buy another car from this location...
16    Service Department , once the crown jewel of t...
17    Kevin is very friendly , accommodating and

In [19]:
#this section is used for training the naive bayes classifier

#the data is first split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(counts, big_data['class'], test_size=0.1, random_state=69)

#the training set is used to train the classifier
model = MultinomialNB().fit(X_train, y_train) 

#this section is used for evaluating the naive bayes classifier

predicted = model.predict(newcount)


ValueError: dimension mismatch

In [None]:
#this section is used for training the logistic regression classifier

#the data is first split into training and testing data
x_train, x_test, y_train, y_test = train_test_split(counts, big_data['class'], test_size=0.1, random_state=69)

logisticRegr = LogisticRegression()

#the actual training
model = logisticRegr.fit(x_train, y_train)


#this section is used for evaluating the logistic regression classifier

predicted = model.predict(x_test)
print(accuracy_score(y_test,predicted) * 100) 
print(confusion_matrix(y_test, predicted)) 


In [None]:
result = open("results.txt", "w")

for val in predicted:
    result.write(str(val) + '\n')
    
result.close()
print('Check current folder for your results file')