In [1]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier, accuracy
from random import shuffle
from sklearn.pipeline import Pipeline
import math
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, precision_score
import string

In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))
            preprocessedData.append((Id, preProcess(Text), Label))
        
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

In [3]:
# QUESTION 1
# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
#     print(reviewLine)
#     result = reviewLine.split('\t')
    if reviewLine[1] == '__label1__':
        label = 'fake'
    else:
        label = 'real'
    return (reviewLine[0], reviewLine[8], label)

# sample_line = "1	__label1__	4	N	PC	B00008NG7N	Targus PAUK10U Ultra Mini USB Keypad, Black	useful	When least you think so, this product will save the day. Just keep it around just in case you need it for something."
# parseReview(sample_line)

In [4]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')

# Input: a string of one review
def preProcess(text):
    # Should return a list of tokens
    tokens = word_tokenize(text)
    result = [stemmer.stem(t) for t in tokens]
    return result

# sample_line = "1	__label1__	4	N	PC	B00008NG7N	Targus PAUK10U Ultra Mini USB Keypad, Black	useful	When least you think so, this product will save the day. Just keep it around just in case you need it for something."
# (_, text, Label) = parseReview(sample_line)
# preProcess(text)

In [5]:
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []    

loadData('amazon_reviews.txt')

In [6]:
preprocessedData

[('DOC_ID', ['review_text'], 'real'),
 ('1',
  ['when',
   'least',
   'you',
   'think',
   'so',
   ',',
   'this',
   'product',
   'will',
   'save',
   'the',
   'day',
   '.',
   'just',
   'keep',
   'it',
   'around',
   'just',
   'in',
   'case',
   'you',
   'need',
   'it',
   'for',
   'someth',
   '.'],
  'fake'),
 ('2',
  ['lithium',
   'batteri',
   'are',
   'someth',
   'new',
   'introduc',
   'in',
   'the',
   'market',
   'there',
   'averag',
   'develop',
   'cost',
   'is',
   'relat',
   'high',
   'but',
   'stallion',
   'doe',
   "n't",
   'compromis',
   'on',
   'qualiti',
   'and',
   'provid',
   'us',
   'with',
   'the',
   'best',
   'at',
   'a',
   'low',
   'cost.',
   '<',
   'br',
   '/',
   '>',
   'there',
   'are',
   'so',
   'mani',
   'in',
   'built',
   'technic',
   'assist',
   'that',
   'act',
   'like',
   'a',
   'sensor',
   'in',
   'their',
   'particular',
   'forté',
   '.',
   'the',
   'batteri',
   'keep',
   'my',
   'phon

In [7]:
rawData

[('DOC_ID', 'REVIEW_TEXT', 'real'),
 ('1',
  'When least you think so, this product will save the day. Just keep it around just in case you need it for something.',
  'fake'),
 ('2',
  "Lithium batteries are something new introduced in the market there average developing cost is relatively high but Stallion doesn't compromise on quality and provides us with the best at a low cost.<br />There are so many in built technical assistants that act like a sensor in their particular forté. The battery keeps my phone charged up and it works at every voltage and a high voltage is never risked.",
  'fake'),
 ('3',
  "I purchased this swing for my baby. She is 6 months now and has pretty much out grown it. It is very loud and doesn't swing very well. It is beautiful though. I love the colors and it has a lot of settings, but I don't think it was worth the money.",
  'fake'),
 ('4',
  'I was looking for an inexpensive desk calcolatur and here it is. It works and does everything I need. Only issue i

In [8]:
# QUESTION 2

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()


featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    # Should return a dictionary containing features as keys, and weights as values
#     text_vec = vectorizer.fit_transform(tokens)
# text_vec.toarray()
    feature_vector = {}
    for token in tokens:
        if token in feature_vector:
            feature_vector[token] += 1
        else:
            feature_vector[token] = 1
        
        if token in featureDict:
            featureDict[token] += 1
        else:
            featureDict[token] = 1
            
    return feature_vector

In [9]:
toFeatureVector(preprocessedData[1][1])

{',': 1,
 '.': 2,
 'around': 1,
 'case': 1,
 'day': 1,
 'for': 1,
 'in': 1,
 'it': 2,
 'just': 2,
 'keep': 1,
 'least': 1,
 'need': 1,
 'product': 1,
 'save': 1,
 'so': 1,
 'someth': 1,
 'the': 1,
 'think': 1,
 'this': 1,
 'when': 1,
 'will': 1,
 'you': 2}

In [10]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [155]:
# QUESTION 3
from sklearn.model_selection import KFold


def crossValidate(dataset, folds):
    shuffle(dataset)
    cv_results = []
    foldSize = math.ceil(len(dataset)/folds)
    
    kf = KFold(n_splits=folds)
    
    scores = np.array([0,0,0,0])
    
    for train_index, test_index in kf.split(dataset):
#         print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = dataset[train_index], dataset[test_index]
        classifier = trainClassifier(X_train)
#         print(X_test[0])
        y_pred = predictLabels_cv(X_test, classifier)
        y_true = X_test[:, 1]
        
        acc = accuracy(classifier, X_test)
        print("MultinomialNB accuracy percent:", acc)
#         print('y true size', y_true.shape, 'y pred size', len(y_pred))
#         print(y_pred)
        prfs = precision_recall_fscore_support(y_true, y_pred, average='weighted')
        print(prfs)
#         print('precision_score',precision_score(y_true, y_pred))
        scores = scores + np.array([prfs[0], prfs[1], prfs[2], acc])
#         scores = {'precision': prfs[0], 'recall': prfs[1],
#                   'f1': prfs[2], 'accuracy': acc}
#         cv_results.append(scores)
    

    scores = scores / folds
    cv_results = {'precision': scores[0], 'recall': scores[1],
                  'f1': scores[2], 'accuracy': scores[3]}
    return cv_results

In [157]:
train_np = np.array(trainData[1:])
print(len(trainData))
crossValidate(train_np, 2)

16800
Training Classifier...
MultinomialNB accuracy percent: 0.8075
(0.80781218369718832, 0.8075, 0.80748288552983649, None)
Training Classifier...
MultinomialNB accuracy percent: 0.7396118585545899
(0.73992444222512321, 0.73961185855458988, 0.73957697441502623, None)


{'accuracy': 0.77355592927729488,
 'f1': 0.77352992997243142,
 'precision': 0.77386831296115577,
 'recall': 0.77355592927729488}

In [50]:
train_np

array([[ {'i': 2, 'thought': 1, 'the': 2, 'movi': 1, 'unfold': 1, 'beauti': 2, 'in': 1, 'a': 3, 'not': 1, 'such': 1, 'realiti': 1, 'it': 1, 'depict': 1, '.': 3, 'especi': 1, 'like': 1, 'demi': 1, 'moor': 1, 'and': 2, 'think': 1, 'she': 1, 'did': 1, 'pretti': 1, 'good': 1, 'job': 1, 'play': 1, 'lola': 1, "'s": 1, 'mom': 1, 'parallel': 1, 'drawn': 1, 'between': 1, '4': 1, 'generat': 1, 'of': 1, 'women': 1, 'is': 1, 'appar': 1, 'fun': 1, 'to': 1, 'watch': 1},
        'fake'],
       [ {'i': 2, 'thought': 1, 'the': 2, 'movi': 1, 'unfold': 1, 'beauti': 2, 'in': 1, 'a': 3, 'not': 1, 'such': 1, 'realiti': 1, 'it': 1, 'depict': 1, '.': 3, 'especi': 1, 'like': 1, 'demi': 1, 'moor': 1, 'and': 2, 'think': 1, 'she': 1, 'did': 1, 'pretti': 1, 'good': 1, 'job': 1, 'play': 1, 'lola': 1, "'s": 1, 'mom': 1, 'parallel': 1, 'drawn': 1, 'between': 1, '4': 1, 'generat': 1, 'of': 1, 'women': 1, 'is': 1, 'appar': 1, 'fun': 1, 'to': 1, 'watch': 1},
        'fake'],
       [ {'i': 2, 'thought': 1, 'the': 2, 'm

In [142]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))

def predictLabels_cv(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))


def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [158]:
# MAIN

# loading reviews
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
# We split the raw dataset into a set of training data and a set of test data (80/20)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21001 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21001 rawData, 16800 trainData, 4201 testData
Training Samples: 
16800
Features: 
33655


## question 4