In [51]:
import unicodecsv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np


import csv                               # csv reader
#nltk.download()
import nltk
nltk.data.path.append("/Users/Shared/nltk_data")

In [32]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))
            preprocessedData.append((Id, preProcess(Text), Label))
        del preprocessedData[0]
        del rawData[0]

In [33]:
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

In [34]:
# QUESTION 1
# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    rid = reviewLine[0]
    rtext = reviewLine[8]
    rlabel = reviewLine[1]
    return (rid, rtext, rlabel)

In [35]:
# TEXT PREPROCESSING
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Text processing with Scikit-Learn, basics
# Creating a vectorizer that can be used to extract a bag of words
# representation from documents

stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()


#stemmer = SnowballStemmer("english")
stemmer = PorterStemmer()
# Input: a string of one review
def preProcess(text):
    # Should return a list of tokens
    text = word_tokenize(text)
    b = []
    for word in text:
        if word.isalpha(): # removing punctuation
            if word not in stop_words: # removing stopwords or "too common" words
                word = word.lower() # converting all letters to lower case
                word = wordnet_lemmatizer.lemmatize(word)
                word = stemmer.stem(word) # Using standart stemmer from the nltk
                b.append(word)
    return b

In [36]:
# QUESTION 2
featureDict = {} # A global dictionary of features
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)

def toFeatureVector(tokens):
# Should return a dictionary containing features as keys, and weights as values
    featureVector = {}
    for token in tokens:
        if token not in featureVector:
            featureVector[token] = 1.0
        else:
            featureVector[token] = float(featureVector[token] + 1)
            
        if token not in featureDict:
            featureDict[token] = 1.0
        else:
            featureDict[token] = float(featureDict[token] + 1)
    return featureVector

In [37]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [38]:
shuffle(trainData)
trainClassifier(trainData)

Training Classifier...


<SklearnClassifier(Pipeline(memory=None,
     steps=[('svc', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]))>

In [57]:
# QUESTION 3

def crossValidate(dataset, folds):
    shuffle(dataset)
    results = []
    foldSize = int(len(dataset)/folds)
    for i in range(0, len(dataset), foldSize):
        # insert code here that trains and tests on the 10 folds of data in the dataset
        print("fold start %d foldSize %d" % (i, foldSize))
        myTestData = dataset[i:i+foldSize]
        myTrainData = dataset[:i] + dataset[i+foldSize:]
        classifier = trainClassifier(myTrainData)
        y_true = list(map(lambda x: x[1], myTestData))
        y_pred = classifier.classify_many(map(lambda x: x[0], myTestData))
        results.append(precision_recall_fscore_support(y_true, y_pred, average='weighted'))
    return results

In [56]:
folds = 10
shuffle(trainData)
results = []
foldSize = int(len(trainData)/folds)
for i in range(0, len(trainData), foldSize):
    # insert code here that trains and tests on the 10 folds of data in the trainData
    print("fold start %d foldSize %d" % (i, foldSize))
    myTestData = trainData[i:i+foldSize]
    myTrainData = trainData[:i] + trainData[i+foldSize:]
    classifier = trainClassifier(myTrainData)
    y_true = list(map(lambda x: x[1], myTestData))
    y_pred = classifier.classify_many(map(lambda x: x[0], myTestData))
    results.append(precision_recall_fscore_support(y_true, y_pred, average='weighted'))
print(results)

fold start 0 foldSize 1680
Training Classifier...
fold start 1680 foldSize 1680
Training Classifier...
fold start 3360 foldSize 1680
Training Classifier...
fold start 5040 foldSize 1680
Training Classifier...
fold start 6720 foldSize 1680
Training Classifier...
fold start 8400 foldSize 1680
Training Classifier...
fold start 10080 foldSize 1680
Training Classifier...
fold start 11760 foldSize 1680
Training Classifier...
fold start 13440 foldSize 1680
Training Classifier...
fold start 15120 foldSize 1680
Training Classifier...
[(0.6042075023272598, 0.6035714285714285, 0.6033960950316696, None), (0.5933755386880387, 0.593452380952381, 0.5934120693591681, None), (0.5988973858274369, 0.5988095238095238, 0.598513578363463, None), (0.6184027413834805, 0.6178571428571429, 0.6177433937430394, None), (0.6092167173347504, 0.6089285714285714, 0.6084652684419097, None), (0.6215110516368766, 0.6214285714285714, 0.621117016090404, None), (0.6363330897038586, 0.6351190476190476, 0.6349937620994995, No

Make use of the given functions trainClassifier and predictLabels 
to do the cross-validation. Make sure that your program stores the (average) precision, 
recall, f1 score, and accuracy of your classifier in a variable cv_results.

In [42]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [43]:
# MAIN

# loading reviews
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))
print("Preparing the dataset...")

loadData(reviewPath)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))
# We split the raw dataset into a set of training data and a set of test data (80/20)
print("Preparing training and test data...")

splitData(0.8)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))
# We print the number of training samples and the number of features
print("Training Samples: ", len(trainData), "Features: ", len(featureDict))

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples:  16800 Features:  20924


In [44]:
print(trainData[-1:])

[({'even': 1.0, 'american': 1.0, 'may': 1.0, 'would': 1.0, 'back': 1.0, 'orient': 1.0, 'medium': 2.0, 'compani': 1.0, 'differ': 1.0, 'should': 1.0, 'though': 1.0, 'send': 1.0, 'fit': 1.0, 'larg': 1.0, 'bodi': 1.0, 'type': 1.0, 'come': 1.0, 'standard': 1.0, 'small': 2.0, 'ethnic': 1.0, 'realiz': 1.0, 'need': 1.0, 'order': 1.0, 'look': 1.0, 'i': 2.0, 'hen': 1.0}, '__label2__')]


In [58]:
cv_results = crossValidate(trainData, 10)
print(cv_results)

fold start 0 foldSize 1680
Training Classifier...
fold start 1680 foldSize 1680
Training Classifier...
fold start 3360 foldSize 1680
Training Classifier...
fold start 5040 foldSize 1680
Training Classifier...
fold start 6720 foldSize 1680
Training Classifier...
fold start 8400 foldSize 1680
Training Classifier...
fold start 10080 foldSize 1680
Training Classifier...
fold start 11760 foldSize 1680
Training Classifier...
fold start 13440 foldSize 1680
Training Classifier...
fold start 15120 foldSize 1680
Training Classifier...
[(0.6170947648264996, 0.6172619047619048, 0.6170993542151644, None), (0.6207733449158352, 0.6208333333333333, 0.6207062890573062, None), (0.5973646411685349, 0.5964285714285714, 0.5963244683942054, None), (0.6020615408785907, 0.6017857142857143, 0.6016622326119451, None), (0.6226769803785436, 0.6196428571428572, 0.6191689327665767, None), (0.6058702416279488, 0.6059523809523809, 0.605841730465693, None), (0.6016017407253402, 0.6, 0.5995689166193987, None), (0.60341

In [60]:
classifier = trainClassifier(trainData)

Training Classifier...


In [100]:
cv_results = np.asarray(cv_results)

In [101]:
print(cv_results)

[[0.6170947648264996 0.6172619047619048 0.6170993542151644 None]
 [0.6207733449158352 0.6208333333333333 0.6207062890573062 None]
 [0.5973646411685349 0.5964285714285714 0.5963244683942054 None]
 [0.6020615408785907 0.6017857142857143 0.6016622326119451 None]
 [0.6226769803785436 0.6196428571428572 0.6191689327665767 None]
 [0.6058702416279488 0.6059523809523809 0.605841730465693 None]
 [0.6016017407253402 0.6 0.5995689166193987 None]
 [0.6034181373990798 0.6029761904761904 0.6028719415119285 None]
 [0.6134073565323565 0.6136904761904762 0.6132822348437273 None]
 [0.6042558764577259 0.6041666666666666 0.6037375695263779 None]]


In [108]:
print("Current average precision is " + str(np.mean(cv_results[:,0], axis=0)))
print("Current average recall is " + str(np.mean(cv_results[:,1], axis=0)))
print("Current average fscore is " + str(np.mean(cv_results[:,2], axis=0)))
print("Current average fscore is None")

Current average precision is 0.6088524624910455
Current average recall is 0.6082738095238096
Current average fscore is 0.6080263670012324
Current average fscore is None
