Mount Google Drive.

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')
data_dir = 'gdrive/My Drive/Colab Notebooks/AuthorshipAttribution/data' # @param {type:"string"}

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


Import stuff.

In [0]:
import os
import json
import codecs
import operator
import re
import string
import argparse
import numpy as np
from collections import defaultdict
import pickle

In [0]:
import nltk
from nltk.tokenize import word_tokenize, WordPunctTokenizer,PunktSentenceTokenizer, TreebankWordTokenizer
from nltk.corpus import stopwords, webtext
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tag import pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer



In [0]:
from sklearn.preprocessing import scale
from sklearn import utils
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC ,SVC
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.calibration import CalibratedClassifierCV

In [0]:
from gensim.models.word2vec import Word2Vec

Downloading essential NLTK material.

In [0]:
import nltk
nltk.download('webtext')
nltk.download('vader_lexicon')
nltk.download('stopwords')
# tokenize based on punctuation using nltk
punk_sent_tokenizer = PunktSentenceTokenizer(webtext.raw('overheard.txt'))

# nltk's built-in sentiment analyzer
vader = SentimentIntensityAnalyzer()

[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Unzipping corpora/webtext.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Proprocessing: remove punctuation, remove stopwords, stem the tokens, convert all resulting tokens to lowercase.

In [0]:
# text pre-processing for the tf-idf model
def tfidf_Preprocessing(text , _stopwords):
    # stemming - using nltk's PorterStemmer
    stemmer = PorterStemmer()
    text = text.replace('-' , '')
    text = text.replace('.' , '')
    text = text.replace('”' , '')
    text = text.replace('’' , '')
    text = text.replace('“' , '')
    text = text.replace('‘' , '')
    text = text.replace('–','')
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return ' '.join([stemmer.stem(word).lower() for word in WordPunctTokenizer().tokenize(nopunc) 
                    if stemmer.stem(word).lower() not in _stopwords])

Convert text to word2vec representation.

In [0]:
def buildWordVector(imdb_w2v, text, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += imdb_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

We use the MaxAbsScaler to scale each feature by its Maximum Absolute Value. It translates each feature individually such that the maximal absolute value of each feature in the training set will be 1.0. It does not shift/center the data, and thus does not destroy any sparsity.

In [0]:
max_abs_scaler = preprocessing.MaxAbsScaler()
stopwords_list = {'en': set(stopwords.words('english')) , 'fr':set(stopwords.words('french')),
                  'sp': set(stopwords.words('spanish')) , 'it':set(stopwords.words('italian'))}

Load the training and testing data and preprocess them.

In [0]:
# placeholders for training and testing data
train_set , train_labels = [], []
test_set , test_labels = [] , []

# reading training and testing data from pickle files
train_data, test_data = None, None
with open(data_dir + "/train_data.pickle", "rb") as f:
    train_data = pickle.load(f)
with open(data_dir + "/test_data.pickle", "rb") as f:
    test_data = pickle.load(f)

# populating training and testing placeholders with data
train_set = train_data['train_texts']
train_labels = train_data['train_labels']
test_set = test_data['test_texts']
test_labels = test_data['test_labels']

In [0]:
def get_chunks(l, n):
    n = max(1, n)
    return [l[i:i+n] for i in range(0, len(l), n)]

from statistics import mean
word_counts = [text.count(" ") for text in test_set]
mean(word_counts)

# 182 words is quite short
# Try to join 5 tests texts together
longer_test_texts = get_chunks(test_set, 5)
longer_test_labels = get_chunks(test_labels, 5)

all([len(set(x)) == 1 for x in longer_test_labels])  # Make sure that all combined labels are the same

test_set = ['\n'.join(chunk) for chunk in longer_test_texts]
test_labels = [chunk[0] for chunk in longer_test_labels]

In [0]:
author_dict = {}
n_parts = 10
for review, author in zip(train_set, train_labels):
    n_chars = len(review) // n_parts
    author_dict[author] = [review[i:i+n_chars] for i in range(0, len(review), n_chars)]

new_train_set, new_train_labels = [], []
for author, reviews in author_dict.items():
    new_train_set.extend(reviews)
    new_train_labels.extend([author] * len(reviews))

# for author, reviews in author_dict.items():
#     print('author: {}\t\tnum_reviews: {}'.format(author, len(reviews)))
# print('len(new_train_set): {}\t\tlen(new_train_labels): {}'.format(len(new_train_set), len(new_train_labels)))

train_set = new_train_set
train_labels = new_train_labels

In [0]:
# converting the text into tf-idf vectors
tfidf_train_set = [tfidf_Preprocessing(text , stopwords_list['en']) 
                for text in train_set]
tfidf_test_set = [tfidf_Preprocessing(text , stopwords_list['en'])
                for text in test_set]

In [0]:
# defining the train set, test set and embed dim for word2vec model
word2vec_train_set = [text.split() for text in tfidf_train_set]
word2vec_test_set = [text.split() for text in tfidf_test_set]
n_dim = 300

# defining the skip-gram model for word2vec
word2vec_model = Word2Vec(sg=1, size=n_dim, min_count=1, workers=7)
word2vec_model.build_vocab(word2vec_train_set)

#training the word2vec model on train set and test set to get word embeddings for vocabulary
for epoch in range(20):
    word2vec_model.train(word2vec_train_set ,total_examples=word2vec_model.corpus_count, epochs=5)
for epoch in range(20):
    word2vec_model.train(word2vec_test_set ,total_examples=word2vec_model.corpus_count, epochs=5)

In [0]:
# concatenating the generated word embeddings into sentence embeddings for train and test sets
# and also scaling them.
word2vec_train = np.concatenate([buildWordVector(word2vec_model, text , n_dim) for text in word2vec_train_set])
word2vec_train = scale(word2vec_train)
word2vec_test = np.concatenate([buildWordVector(word2vec_model, text , n_dim) for text in word2vec_test_set])
word2vec_test = scale(word2vec_test)

  
  


In [0]:
# word2vec scaling using max absolute scaler
word2vec_scaled_train_data = max_abs_scaler.fit_transform(word2vec_train)
word2vec_scaled_test_data = max_abs_scaler.transform(word2vec_test)

Fit the vectorized training data to a LinearSVC model, abd predict on the vectorized test data.

In [0]:
# training and predicting using word2vec model (which uses logistic regression)
word2vec_clf = CalibratedClassifierCV(OneVsRestClassifier(LogisticRegression(C=0.01)))
word2vec_clf.fit(word2vec_scaled_train_data, train_labels)
word2vec_predictions = word2vec_clf.predict(word2vec_scaled_test_data)
word2vec_proba = word2vec_clf.predict_proba(word2vec_scaled_test_data)



Compute accuracy.

In [0]:
from sklearn.metrics import accuracy_score
accuracy_score(test_labels, word2vec_predictions)

0.876