## question 4

In [33]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier, accuracy
from random import shuffle
from sklearn.pipeline import Pipeline
import math
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.stem import SnowballStemmer
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import string

In [34]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))
            preprocessedData.append((Id, preProcess(Text), Label))
        
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

# QUESTION 1
# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
#     print(reviewLine)
#     result = reviewLine.split('\t')
    if reviewLine[1] == '__label1__':
        label = 'fake'
    else:
        label = 'real'
    return (reviewLine[0], reviewLine[8], label)

stemmer = SnowballStemmer('english')
stopWords = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

# Input: a string of one review
def preProcess(text):
    # Should return a list of tokens
#     tokens = word_tokenize(text)
    tokens = tokenizer.tokenize(text)
    result = [stemmer.stem(t) for t in tokens if t not in stopWords]
    return result

# QUESTION 2
vectorizer = CountVectorizer()

featureDict = {} # A global dictionary of features

# def toFeatureVector(tokens):
#     # Should return a dictionary containing features as keys, and weights as values
# #     text_vec = vectorizer.fit_transform(tokens)
# # text_vec.toarray()
#     feature_vector = {}
#     for token in tokens:
#         if token in feature_vector:
#             feature_vector[token] += 1
#         else:
#             feature_vector[token] = 1
        
#         if token in featureDict:
#             featureDict[token] += 1
#         else:
#             featureDict[token] = 1
            
#     return feature_vector

def toFeatureVector(tokens):
    feature_vector = {}
    for i in range(len(tokens)-1):
        token = (tokens[i], tokens[i+1])
        if token in feature_vector:
            feature_vector[token] += 1
        else:
            feature_vector[token] = 1
        
        if token in featureDict:
            featureDict[token] += 1
        else:
            featureDict[token] = 1
            
    return feature_vector

# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC(loss='hinge', max_iter=3000, C=1))])
    return SklearnClassifier(pipeline).train(trainData)

# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))

def predictLabels_cv(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

# QUESTION 3
def crossValidate(dataset, folds):
    shuffle(dataset)
    cv_results = []
    foldSize = math.ceil(len(dataset)/folds)
    
    kf = KFold(n_splits=folds)
    
    scores = np.array([0,0,0,0])
    
    for train_index, test_index in kf.split(dataset):
        X_train, X_test = dataset[train_index], dataset[test_index]
        classifier = trainClassifier(X_train)
        y_pred = predictLabels_cv(X_test, classifier)
        y_true = X_test[:, 1]
        
        acc = accuracy(classifier, X_test)
        prfs = precision_recall_fscore_support(y_true, y_pred, average='weighted')
        scores = scores + np.array([prfs[0], prfs[1], prfs[2], acc])

    scores = scores / folds
    cv_results = {'precision': scores[0], 'recall': scores[1],
                  'f1': scores[2], 'accuracy': scores[3]}
    return cv_results

In [35]:
# loading reviews
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# references to the data files
reviewPath = 'amazon_reviews.txt'

print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
rawData = rawData[1:]
preprocessedData = preprocessedData[1:]
# We split the raw dataset into a set of training data and a set of test data (80/20)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
426026


In [36]:
train_np = np.array(trainData)
print('train no',len(trainData))
crossValidate(train_np, 3)

train no 16800
Training Classifier...


  'recall', 'true', average, warn_for)


Training Classifier...
Training Classifier...


{'accuracy': 0.85351190476190475,
 'f1': 0.81643658745541003,
 'precision': 0.90290037659461564,
 'recall': 0.85351190476190475}

In [37]:
trainData

[({('around', 'case'): 1,
   ('case', 'need'): 1,
   ('day', 'just'): 1,
   ('just', 'keep'): 1,
   ('keep', 'around'): 1,
   ('least', 'think'): 1,
   ('need', 'someth'): 1,
   ('product', 'save'): 1,
   ('save', 'day'): 1,
   ('think', 'product'): 1,
   ('when', 'least'): 1},
  'fake'),
 ({('act', 'like'): 1,
   ('assist', 'act'): 1,
   ('averag', 'develop'): 1,
   ('batteri', 'keep'): 1,
   ('batteri', 'someth'): 1,
   ('best', 'low'): 1,
   ('br', 'there'): 1,
   ('built', 'technic'): 1,
   ('charg', 'work'): 1,
   ('compromis', 'qualiti'): 1,
   ('cost', 'br'): 1,
   ('cost', 'relat'): 1,
   ('develop', 'cost'): 1,
   ('everi', 'voltag'): 1,
   ('forté', 'the'): 1,
   ('high', 'stallion'): 1,
   ('high', 'voltag'): 1,
   ('introduc', 'market'): 1,
   ('keep', 'phone'): 1,
   ('like', 'sensor'): 1,
   ('lithium', 'batteri'): 1,
   ('low', 'cost'): 1,
   ('mani', 'built'): 1,
   ('market', 'averag'): 1,
   ('never', 'risk'): 1,
   ('new', 'introduc'): 1,
   ('particular', 'forté'): 