Mount Google Drive and import necessary packages.

In [0]:
from google.colab import drive
# Mount the user's Google Drive inside the Colab VM; this asks for an interactive
# authorization code the first time it runs.
drive.mount('/content/gdrive')
# NOTE(review): this path is relative while the mount point above is absolute, so it
# presumably only resolves when the working directory is /content — TODO confirm.
data_dir = 'gdrive/My Drive/Colab Notebooks/AuthorshipAttribution/data' # @param {type:"string"}

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import os
import json
import codecs
import operator
import re
import string
import argparse
import numpy as np
from collections import defaultdict
import pickle

In [0]:
import nltk
from nltk.tokenize import word_tokenize, WordPunctTokenizer,PunktSentenceTokenizer, TreebankWordTokenizer
from nltk.corpus import stopwords, webtext
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tag import pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
# Fetch the NLTK resources referenced by the imports above: the `webtext` corpus,
# the `vader_lexicon` (presumably for SentimentIntensityAnalyzer) and the
# `stopwords` corpus read later via stopwords.words(...).
# The last call is the cell's final expression, so its return value is displayed.
nltk.download('webtext')
nltk.download('vader_lexicon')
nltk.download('stopwords')



[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Unzipping corpora/webtext.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
from sklearn.preprocessing import scale
from sklearn import utils
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC ,SVC
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.calibration import CalibratedClassifierCV

Function to calculate the character n-gram frequencies of a string of text.

In [0]:
# calculating n-gram frequency for a review text
def ngram_represent_text(text,n):
    """Count every overlapping character n-gram in ``text``.

    Args:
        text: The string to analyse.
        n: Length of each n-gram, in characters.

    Returns:
        A ``defaultdict(int)`` mapping each n-gram to its number of
        occurrences. It is empty when ``n`` is not positive or when ``text``
        is shorter than ``n``.
    """
    frequency = defaultdict(int)
    # A non-positive n has no n-grams. Previously this path left `tokens`
    # unbound and the function crashed with UnboundLocalError.
    if n <= 0:
        return frequency
    for start in range(len(text) - n + 1):
        frequency[text[start:start + n]] += 1
    return frequency

Function to get the n-gram vocabulary of the dataset.

In [0]:
# constructing n-gram vocabulary from texts - calculating n-gram frequencies and storing them in dictionaries
def ngram_extract_vocabulary(texts,n,ft):
    """Collect the character n-grams that are frequent enough across ``texts``.

    Args:
        texts: Iterable of strings to scan.
        n: Length of each n-gram, forwarded to ``ngram_represent_text``.
        ft: Minimum total count (summed over all texts) an n-gram needs in
            order to be kept.

    Returns:
        A list of the retained n-grams, in order of first appearance.
    """
    totals = defaultdict(int)
    for document in texts:
        # Fold this document's counts into the corpus-wide totals.
        for gram, count in ngram_represent_text(document, n).items():
            totals[gram] += count
    return [gram for gram, total in totals.items() if total >= ft]

Set hyperparameters here.

In [0]:
# hyperparameters
ft = 5          # frequency threshold: an n-gram needs a total training-set count >= ft to enter the vocabulary
n = 4           # n-gram length in characters; also passed to CountVectorizer as ngram_range=(n, n)

We use `MaxAbsScaler` to scale each feature by its maximum absolute value: every feature is divided by the largest absolute value it takes in the training set, so that its maximal absolute value there becomes 1.0. The scaler does not shift or center the data, and thus does not destroy any sparsity.

In [0]:
# Scaler instance; it is fitted later on the training features.
max_abs_scaler = preprocessing.MaxAbsScaler()

# NLTK stopword sets, keyed by a short language code.
stopwords_list = {
    code: set(stopwords.words(language))
    for code, language in (
        ('en', 'english'),
        ('fr', 'french'),
        ('sp', 'spanish'),
        ('it', 'italian'),
    )
}

Load and preprocess training and testing data.

In [0]:
# placeholders for training and testing data
# Read the pre-built training and testing splits from the data directory.
# SECURITY: pickle.load can execute arbitrary code while deserializing, so only
# point data_dir at files you created yourself or otherwise trust.
with open(data_dir + "/train_data.pickle", "rb") as f:
    train_data = pickle.load(f)
with open(data_dir + "/test_data.pickle", "rb") as f:
    test_data = pickle.load(f)

# Unpack the texts and their author labels.
train_set = train_data['train_texts']
train_labels = train_data['train_labels']
test_set = test_data['test_texts']
test_labels = test_data['test_labels']

# Texts and labels are paired by position in later cells (zip / chunking), which
# would silently drop or misalign items if the lengths differed. Fail loudly here.
assert len(train_set) == len(train_labels), "train texts and labels differ in length"
assert len(test_set) == len(test_labels), "test texts and labels differ in length"

In [0]:
def get_chunks(l, n):
    """Split a sliceable sequence into consecutive pieces of at most ``n`` items.

    Args:
        l: Any object supporting ``len`` and slicing (list, string, ...).
        n: Desired piece size; values below 1 are treated as 1.

    Returns:
        A list of slices of ``l``. Only the last piece may be shorter than ``n``.
    """
    n = max(1, n)
    pieces = []
    for start in range(0, len(l), n):
        pieces.append(l[start:start + n])
    return pieces

from statistics import mean

# Rough per-text word count (number of spaces) to judge how much evidence a
# single test text carries. Printed so the figure quoted below can be re-checked.
word_counts = [text.count(" ") for text in test_set]
print('mean words per test text: {:.1f}'.format(mean(word_counts)))

# 182 words is quite short
# Try to join 5 test texts together
test_chunk_size = 5
longer_test_texts = get_chunks(test_set, test_chunk_size)
longer_test_labels = get_chunks(test_labels, test_chunk_size)

# Enforce that all combined labels are the same. The check used to be evaluated
# and discarded, so a chunk mixing authors would have gone unnoticed and been
# labelled with its first author.
assert all(len(set(x)) == 1 for x in longer_test_labels), \
    "a merged test chunk contains texts from more than one author"

# NOTE: this rebinds test_set/test_labels, so re-running the cell without
# reloading the data merges the already-merged texts again.
test_set = ['\n'.join(chunk) for chunk in longer_test_texts]
test_labels = [chunk[0] for chunk in longer_test_labels]

In [0]:
# Split every training text into roughly n_parts character chunks so that each
# author contributes several training samples.
author_dict = {}
n_parts = 10
for review, author in zip(train_set, train_labels):
    n_chars = len(review) // n_parts
    # get_chunks clamps the chunk size to >= 1, so texts shorter than n_parts
    # characters no longer hit range()'s zero-step ValueError.
    # Accumulate instead of assigning: if an author appears more than once in
    # train_set, all of their texts are kept rather than only the last one.
    # When len(review) is not a multiple of n_parts, a short trailing chunk is
    # produced in addition to the n_parts full-size ones.
    author_dict.setdefault(author, []).extend(get_chunks(review, n_chars))

new_train_set, new_train_labels = [], []
for author, reviews in author_dict.items():
    new_train_set.extend(reviews)
    new_train_labels.extend([author] * len(reviews))

# Every chunk must carry exactly one label.
assert len(new_train_set) == len(new_train_labels)

# NOTE: this rebinds train_set/train_labels, so re-running the cell without
# reloading the data splits the already-split chunks again.
train_set = new_train_set
train_labels = new_train_labels

Vectorize the training and testing data.

In [0]:
# training and predicting using n-gram model (which uses SVC)
# Build the character n-gram vocabulary from the training texts, then count
# occurrences of exactly those n-grams in each training text.
ngram_vocabulary = ngram_extract_vocabulary(train_set , n , ft)
# strip_accents=None is the documented "no accent stripping" value. The previous
# False is not a documented option and is rejected by the constructor-parameter
# validation in recent scikit-learn releases.
ngram_vectorizer = CountVectorizer(
    strip_accents=None,
    analyzer='char',
    ngram_range=(n, n),
    lowercase=False,
    vocabulary=ngram_vocabulary,
)
# Counts are cast to float because later cells divide and rescale them.
ngram_train_data = ngram_vectorizer.fit_transform(train_set).astype(float)

In [0]:
def normalize_by_text_length(count_matrix, texts):
    """Divide each row of a sparse count matrix by the length of its text.

    Args:
        count_matrix: SciPy sparse matrix with one row per entry of ``texts``.
        texts: The strings the rows were computed from.

    Returns:
        A new float CSR matrix holding relative frequencies; the input is not
        modified.

    Raises:
        ValueError: If the number of rows and the number of texts differ.
    """
    lengths = np.array([len(text) for text in texts], dtype=float)
    if count_matrix.shape[0] != len(lengths):
        raise ValueError("count_matrix must have exactly one row per text")
    # An empty text has no counts; scaling by 1 keeps its row at zero instead of
    # raising ZeroDivisionError.
    lengths[lengths == 0] = 1.0
    normalized = count_matrix.tocsr().astype(float)  # astype returns a copy
    # Scale the stored values directly: np.diff(indptr) is the number of stored
    # entries in each row, so every entry is multiplied by its own row's
    # 1/length. This avoids slow row-by-row assignment into a sparse matrix.
    normalized.data *= np.repeat(1.0 / lengths, np.diff(normalized.indptr))
    return normalized


# NOTE: ngram_train_data is rebound to its normalized version, so re-running this
# cell without re-running the vectorizer cell normalizes the training rows twice.
ngram_train_data = normalize_by_text_length(ngram_train_data, train_set)
ngram_test_data = normalize_by_text_length(ngram_vectorizer.transform(test_set), test_set)

In [0]:
# Fit the scaler on the training features only, then apply that same fitted
# scaling to the test features, so the scaler is never fitted on test data.
ngram_scaled_train_data = max_abs_scaler.fit_transform(ngram_train_data)
ngram_scaled_test_data = max_abs_scaler.transform(ngram_test_data)

Fit an SVC model to the vectorized training data and predict on the vectorized testing data.

In [0]:
# One-vs-rest linear-kernel SVC (C=0.01) wrapped in CalibratedClassifierCV; the
# calibration wrapper is presumably what makes predict_proba available here.
ngram_clf = CalibratedClassifierCV(OneVsRestClassifier(SVC(C=0.01 , kernel='linear')))
ngram_clf.fit(ngram_scaled_train_data, train_labels)
# Hard label predictions, used for the accuracy computation.
ngram_predictions = ngram_clf.predict(ngram_scaled_test_data)
# Per-class probability estimates; not used by any other cell visible here.
ngram_proba = ngram_clf.predict_proba(ngram_scaled_test_data)



Compute accuracy.

In [0]:
from sklearn.metrics import accuracy_score
# Compare the predicted labels against the true test labels; as the cell's final
# expression, the score is displayed as its output.
accuracy_score(test_labels, ngram_predictions)

0.996