In [1]:
import unicodecsv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np
import pandas as pd


import csv                               # csv reader
#nltk.download()
import nltk
nltk.data.path.append("/Users/Shared/nltk_data")

In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))
            preprocessedData.append((Id, preProcess(Text), Label))
        del preprocessedData[0]
        del rawData[0]
                
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)), Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)), Label))

In [3]:
# QUESTION 1
# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    rid = reviewLine[0]
    rtext = reviewLine[8]
    rlabel = reviewLine[1]
    return (rid, rtext, rlabel)

In [4]:
# TEXT PREPROCESSING AND FEATURE VECTORISATION
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Text processing with Scikit-Learn, basics
# Creating a vectorizer that can be used to extract a bag of words
# representation from documents

stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()


#stemmer = SnowballStemmer("english")
stemmer = PorterStemmer()
# Input: a string of one review
def preProcess(text):
    # Should return a list of tokens
    text = word_tokenize(text)
    b = []
    for word in text:
        if word.isalpha(): # removing punctuation
            if word not in stop_words: # removing stopwords or "too common" words
                word = word.lower() # converting all letters to lower case
                word = wordnet_lemmatizer.lemmatize(word)
                word = stemmer.stem(word) # Using standart stemmer from the nltk
                b.append(word)
    return b

In [5]:
# QUESTION 2
featureDict = {} # A global dictionary of features
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)

def toFeatureVector(tokens):
# Should return a dictionary containing features as keys, and weights as values
    featureVector = {}
    for token in tokens:
        if token not in featureVector:
            featureVector[token] = 1.0
        else:
            featureVector[token] = float(featureVector[token] + 1)
            
        if token not in featureDict:
            featureDict[token] = 1.0
        else:
            featureDict[token] = float(featureDict[token] + 1)
    return featureVector

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

def trainClassifier(trainData):
    print("Training Classifier...")
    #Playing with this parameters, increase average fscore from 61 to 64, optimal parameters set
    pipeline =  Pipeline([('svc', LinearSVC(C = 0.001, class_weight = "balanced"))])
    return SklearnClassifier(pipeline).train(trainData)

In [7]:
# QUESTION 3
def crossValidate(dataset, folds):
    shuffle(dataset)
    cv_results = []
    foldSize = int(len(dataset)/folds)
    for i in range(0, len(dataset), foldSize):
        # insert code here that trains and tests on the 10 folds of data in the dataset
        print("fold start %d foldSize %d" % (i, foldSize))
        myTestData = dataset[i:i + foldSize]
        myTrainData = dataset[:i] + dataset[i+foldSize:]
        classifier = trainClassifier(myTrainData)
        y_true = list(map(lambda x: x[1], myTestData))
        y_pred = classifier.classify_many(map(lambda x: x[0], myTestData))
        cv_results.append(precision_recall_fscore_support(y_true, y_pred, average = 'weighted'))
    return cv_results

In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER
def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [9]:
# MAIN

# loading reviews
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))
print("Preparing the dataset...")

loadData(reviewPath)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))
# We split the raw dataset into a set of training data and a set of test data (80/20)
print("Preparing training and test data...")

splitData(0.8)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)))
# We print the number of training samples and the number of features
print("Training Samples: ", len(trainData), "Features: ", len(featureDict))

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples:  16800 Features:  20929


In [10]:
cv_results = crossValidate(trainData, 10)
print(cv_results)

fold start 0 foldSize 1680
Training Classifier...
fold start 1680 foldSize 1680
Training Classifier...
fold start 3360 foldSize 1680
Training Classifier...
fold start 5040 foldSize 1680
Training Classifier...
fold start 6720 foldSize 1680
Training Classifier...
fold start 8400 foldSize 1680
Training Classifier...
fold start 10080 foldSize 1680
Training Classifier...
fold start 11760 foldSize 1680
Training Classifier...
fold start 13440 foldSize 1680
Training Classifier...
fold start 15120 foldSize 1680
Training Classifier...
[(0.6317816050605729, 0.63154761904761902, 0.63046585290449342, None), (0.63487410030976676, 0.63511904761904758, 0.63480710834678722, None), (0.65710444619836883, 0.65476190476190477, 0.65428890275217799, None), (0.65193263205105023, 0.65119047619047621, 0.65029116935604281, None), (0.6301167836952114, 0.62916666666666665, 0.62857650193239711, None), (0.65304856575484949, 0.65119047619047621, 0.65063619441102349, None), (0.63793369056543769, 0.63749999999999996, 0

In [11]:
cv_results = np.asarray(cv_results)
print(cv_results)

[[0.6317816050605729 0.63154761904761902 0.63046585290449342 None]
 [0.63487410030976676 0.63511904761904758 0.63480710834678722 None]
 [0.65710444619836883 0.65476190476190477 0.65428890275217799 None]
 [0.65193263205105023 0.65119047619047621 0.65029116935604281 None]
 [0.6301167836952114 0.62916666666666665 0.62857650193239711 None]
 [0.65304856575484949 0.65119047619047621 0.65063619441102349 None]
 [0.63793369056543769 0.63749999999999996 0.63733670123649677 None]
 [0.65855235049559191 0.65773809523809523 0.6576297906112093 None]
 [0.62689364207221354 0.62678571428571428 0.62616593868181247 None]
 [0.66233038805126243 0.65892857142857142 0.65823206804358581 None]]


In [12]:
print("Current average precision is " + str(np.mean(cv_results[:,0], axis=0)))
print("Current average recall is " + str(np.mean(cv_results[:,1], axis=0)))
print("Current average fscore is " + str(np.mean(cv_results[:,2], axis=0)))

Current average precision is 0.644456820425
Current average recall is 0.643392857143
Current average fscore is 0.642843022828
