In [1]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline

In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    """Read the tab-separated review file at ``path`` and append its rows to ``rawData``.

    Every row except the header (first column equal to ``"DOC_ID"``) and blank
    rows is handed to ``parseReview``; the resulting ``(Id, Text, Label)``
    tuple is appended to the module-level ``rawData`` list.

    Args:
        path: Path of the tab-delimited, UTF-8 encoded reviews file.
        Text: Unused; kept only so existing calls keep working.
    """
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation recommends for files passed to csv.reader.
    with open(path, encoding='utf8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if not line:  # a blank line is returned as [], which has no line[0]
                continue
            if line[0] == "DOC_ID":  # skip the header
                continue
            (Id, reviewText, Label) = parseReview(line)
            rawData.append((Id, reviewText, Label))


def splitData(percentage):
    """Populate ``trainData`` and ``testData`` from ``rawData``.

    ``rawData`` is treated as two halves. The leading ``percentage`` share of
    each half is pre-processed into ``trainData``; whatever is left of each
    half goes into ``testData``. Each stored item is a
    ``(featureVector, Label)`` pair.

    Args:
        percentage: Fraction of the data to use for training (e.g. ``0.8``).
    """
    totalCount = len(rawData)
    midpoint = int(len(rawData)/2)
    perHalfTrain = int((percentage*totalCount)/2)

    trainRows = rawData[:perHalfTrain] + rawData[midpoint:midpoint+perHalfTrain]
    testRows = rawData[perHalfTrain:midpoint] + rawData[midpoint+perHalfTrain:]

    # Training rows are featurised before test rows, in list order, because
    # toFeatureVector also updates the shared global feature dictionary.
    for (target, rows) in ((trainData, trainRows), (testData, testRows)):
        for (_, reviewText, label) in rows:
            target.append((toFeatureVector(preProcess(reviewText)), label))

# Question 1

In [3]:
def parseReview(reviewLine):
    """Pick the document id, review text and readable label out of one dataset row.

    Args:
        reviewLine: One row of the dataset as a sequence of column values;
            index 0 is read as the id, index 1 as the raw label and index 8
            as the review text.

    Returns:
        ``(docId, reviewText, label)`` where ``label`` is ``realLabel`` when
        the raw label equals ``'__label2__'`` and ``fakeLabel`` otherwise.
    """
    # The label is computed locally rather than written back into
    # reviewLine[1], so the caller's row is left untouched (and immutable
    # rows such as tuples are accepted).
    label = realLabel if reviewLine[1] == '__label2__' else fakeLabel
    return (reviewLine[0], reviewLine[8], label)

### The DocID of each record is in column 0 of the dataset, so it is read with reviewLine[0]. Likewise, the review text is in column 8 and the label (the output class) is in column 1. For readability we map "\_\_label2\_\_" to "real" and "\_\_label1\_\_" to "fake".

In [4]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION
import re,nltk
def preProcess(text):
    """Lower-case ``text`` and cut it into tokens at every single space.

    Args:
        text: The raw review text.

    Returns:
        List of lower-cased tokens. Because the split is on exactly one space
        character, consecutive spaces produce empty-string tokens and
        punctuation stays attached to the neighbouring word.
    """
    lowered = text.lower()
    return lowered.split(' ')

### Tokenisation is the process of splitting the text into individual units called tokens. Here we split the text on single spaces, e.g. "I am Happy" is converted to ["i","am","happy"]. We also apply one simple normalisation step, converting all words to lower case, for the following reason:
### e.g. take the two texts {"I am Happy"} and {"I am happy"}. Without lower-casing, they would produce the distinct tokens ['I','am','Happy','happy'], even though we know that Happy = happy. To avoid this we convert every letter in the text to lower case.
### However, this simple approach still has some problems, such as unnecessary tokens; these are addressed and corrected in the Q4 and 5 notebook.

# Question 2

In [5]:
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    """Count the tokens of one text and update the global feature counts.

    Args:
        tokens: Iterable of hashable tokens for a single text.

    Returns:
        A new dict mapping each token to its number of occurrences in
        ``tokens``. As a side effect, the same occurrences are added to the
        module-level ``featureDict``.
    """
    featureDictLocal = {}
    for t in tokens:
        # Update the two dictionaries independently. Sharing one try/except
        # between them reset the global count to 1 whenever a token was
        # already known globally but new to the current text.
        featureDict[t] = featureDict.get(t, 0) + 1
        featureDictLocal[t] = featureDictLocal.get(t, 0) + 1
    return featureDictLocal

### Here we assign a weight to each token. In this case the weight is the frequency of the word in the text, e.g. for "I am happy" and "I am sad" (after lower-casing), the global feature dictionary will be {'i':2,'am':2,'happy':1,'sad':1}. There are various other ways to assign weights, some of which are explored in the Q4 and 5 notebook. We maintain two dictionaries, a local one and a global one. The local dictionary stores the keys and weights for an individual text in the dataset. The global dictionary stores the keys and weights (counts) for the whole dataset.

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    """Train a linear SVM on ``trainData`` and return the trained classifier.

    Args:
        trainData: Training samples handed unchanged to
            ``SklearnClassifier.train`` (presumably ``(featureDict, label)``
            pairs as built by ``splitData`` - confirm against the caller).

    Returns:
        Whatever ``SklearnClassifier(...).train(trainData)`` returns.
    """
    print("Training Classifier...")
    # A one-step pipeline: the only stage is the linear SVM itself.
    svmPipeline = Pipeline([('svc', LinearSVC())])
    wrappedClassifier = SklearnClassifier(svmPipeline)
    return wrappedClassifier.train(trainData)

# Question 3

In [7]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
crossValidationActual=[]  # module-level name kept as-is; crossValidate works on its own local lists
def crossValidate(dataset, folds):
    """Run k-fold cross-validation over ``dataset`` and report per-fold scores.

    ``dataset`` is shuffled in place, then cut into exactly ``folds``
    contiguous folds. When the length is not divisible by ``folds`` the first
    ``len(dataset) % folds`` folds get one extra sample. Each fold is used
    once as the test set while a classifier is trained on the remaining
    samples. The mean accuracy over the folds is printed.

    Args:
        dataset: List of ``(features, label)`` samples; its order is changed
            by the shuffle.
        folds: Number of folds; at least 1 and at most ``len(dataset)``.

    Returns:
        List with one entry per fold, each the result of
        ``precision_recall_fscore_support(..., average='weighted')``.

    Raises:
        ValueError: If ``folds`` is smaller than 1 or larger than the number
            of samples.
    """
    folds = int(folds)
    if folds < 1:
        raise ValueError("folds must be at least 1, got %d" % folds)
    if len(dataset) < folds:
        raise ValueError("cannot split %d samples into %d folds" % (len(dataset), folds))

    shuffle(dataset)
    cv_results = []
    accuracySum = 0
    # Spread the remainder over the first folds so there are always exactly
    # `folds` folds, rather than an extra undersized fold at the end.
    foldSize, remainder = divmod(len(dataset), folds)
    start = 0
    for foldIndex in range(folds):
        stop = start + foldSize + (1 if foldIndex < remainder else 0)
        foldTestData = dataset[start:stop]
        foldTrainData = dataset[:start] + dataset[stop:]
        classifier = trainClassifier(foldTrainData)
        actualLabels = [sample[1] for sample in foldTestData]
        predictedLabels = predictLabels(foldTestData, classifier)
        cv_results.append(precision_recall_fscore_support(actualLabels, predictedLabels, average='weighted'))
        accuracySum += accuracy_score(actualLabels, predictedLabels)
        start = stop
    # Average over the real number of folds instead of a hard-coded 10.
    print('Average Accuracy:%f' % (accuracySum/folds))
    return cv_results

### Here we use k-fold cross validation. Instead of using a library we have implemented it manually. We use 10-fold cross validation, so folds=10 is passed as a parameter. foldSize is the number of records in each fold (the number of records divided by the number of folds). The 'for loop' begins by selecting the test data. In the first iteration the test data runs from the first record up to, but not including, the record at position foldSize, e.g. assume there are 100 records and 10 folds: foldSize will be 100/10=10, so the test data will be records 0 to 9. The remaining records are the training data. Using this training data we train the classifier. We then use that classifier to predict labels for the extracted test data. We already have the actual labels in the second column of the test data; we extract them and compare them with the predicted labels to get the precision, recall, F score and accuracy. The loop then continues with the second fold, i.e. records 10 to 19 as the test data and the rest (0 to 9 and 20 to 99) as the training data. The same process continues until the classifier is trained on records 0 to 89 and tested on the last fold, i.e. records 90 to 99. Lastly we print the average accuracy over all 10 folds.

In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    """Classify many already-featurised samples at once.

    Args:
        reviewSamples: Iterable of samples whose first element is the feature
            set; any further elements (such as the label) are ignored.
        classifier: Object providing ``classify_many``.

    Returns:
        The result of ``classifier.classify_many`` for the feature sets.
    """
    # Lazily hand over only the feature part of each sample.
    featureSets = (sample[0] for sample in reviewSamples)
    return classifier.classify_many(featureSets)

def predictLabel(reviewSample, classifier):
    """Classify a single raw review text.

    The text goes through ``preProcess`` and ``toFeatureVector`` before being
    passed to ``classifier.classify``.

    Args:
        reviewSample: Raw review text.
        classifier: Object providing ``classify``.

    Returns:
        The result of ``classifier.classify`` for the review's feature vector.
    """
    tokens = preProcess(reviewSample)
    features = toFeatureVector(tokens)
    return classifier.classify(features)

In [9]:
# MAIN
sumFScore=0
# loading reviews
# initialize global lists that will be appended to by the methods below
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
trainData = []        # the pre-processed training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the pre-processed test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath)

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the sizes of the three lists before the split
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the list sizes, the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

# QUESTION 3 - Make sure there is a function call here to the
# crossValidate function on the training set to get your results
validationResults=crossValidate(trainData,10)
for i, foldScores in enumerate(validationResults):
    # The first three entries of each fold result are precision, recall and F score
    print('Fold ' + str(i+1) + ': \nPrecision: %f\tRecall: %f\tF Score:%f' % foldScores[:3])
    sumFScore+=foldScores[2]
# Average over the number of folds actually returned rather than a hard-coded 10;
# max(..., 1) keeps the empty case at 0.0 instead of dividing by zero.
print('Average FScore:%f' % (sumFScore/max(len(validationResults), 1)))

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
After split, 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
78556
Training Classifier...
Training Classifier...




Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Average Accuracy:0.635893
Fold 1: 
Precision: 0.624409	Recall: 0.624405	F Score:0.624407
Fold 2: 
Precision: 0.665350	Recall: 0.664881	F Score:0.664827
Fold 3: 
Precision: 0.623044	Recall: 0.622024	F Score:0.622004
Fold 4: 
Precision: 0.641300	Recall: 0.641071	F Score:0.641020
Fold 5: 
Precision: 0.631626	Recall: 0.631548	F Score:0.631210
Fold 6: 
Precision: 0.635373	Recall: 0.635119	F Score:0.635208
Fold 7: 
Precision: 0.629762	Recall: 0.629762	F Score:0.629762
Fold 8: 
Precision: 0.631277	Recall: 0.630952	F Score:0.630913
Fold 9: 
Precision: 0.664415	Recall: 0.663095	F Score:0.663069
Fold 10: 
Precision: 0.616485	Recall: 0.616071	F Score:0.616119
Average FScore:0.635854


### Here we can see the results for each fold. Averaged over the folds, both the F score and the accuracy come to about 0.63. This is not a good model. We need to tweak some things in order to improve the score. That is done in the Q4 & 5 notebook.

# Evaluate on test set

In [10]:
# Final check: train one classifier on all of the training data and
# score it on the held-out test set.
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    # Show the first test instance as a sanity check of the (features, label) format
    print(testData[0])
    # Fit on the full training split
    classifier = trainClassifier(trainData)
    # Ground-truth labels sit in position 1 of every test sample
    testTrue = [sample[1] for sample in testData]
    # Predicted labels for the same samples, in the same order
    testPred = predictLabels(testData, classifier)
    # Weighted-average precision / recall / F score over the test set
    finalScores = precision_recall_fscore_support(
        testTrue,
        testPred,
        average='weighted',
    )
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])
    testAccuracy = accuracy_score(testTrue, testPred)
    print("Accuracy: %f" % testAccuracy)

({'this': 1, 'assortment': 1, 'is': 1, 'really': 1, "hershey's": 1, 'at': 1, 'their': 1, 'best.': 1, 'the': 2, 'little': 1, 'ones': 1, 'are': 1, 'always': 1, 'excited': 1, 'whenever': 1, 'holidays': 1, 'come': 1, 'because': 1, 'of': 1, 'this.': 1}, 'fake')
Training Classifier...
Done training!
Precision: 0.625260
Recall: 0.625238
F Score:0.625221
Accuracy: 0.625238


### Here we can see that the F score and the accuracy on the test set are a bit low (about 0.63). We need to improve them; this is done in the Q4 & 5 notebook.

# Questions 4 and 5
Once you're happy with your functions for Questions 1 to 3, it's advisable you make a copy of this notebook to make a new notebook, and then within it adapt and improve all three functions in the ways asked for in questions 4 and 5.

### Improvements to all functions are made in a separate notebook titled "Atharva Joshi 200425197 Q4,5".