In [0]:
from google.colab import drive
drive.mount('/content/drive')
root_dir = 'drive/My Drive/Colab Notebooks/AuthorshipAttribution/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import os
import json
import codecs
import operator
import re
import string
import argparse
import numpy as np
from collections import defaultdict
import pickle

In [0]:
import nltk
from nltk.tokenize import word_tokenize, WordPunctTokenizer,PunktSentenceTokenizer, TreebankWordTokenizer
from nltk.corpus import stopwords, webtext
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tag import pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# following 3 files need to be downloaded if not already present for this to work.
nltk.download('webtext')
nltk.download('vader_lexicon')
nltk.download('stopwords')



[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Unzipping corpora/webtext.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
from sklearn.preprocessing import scale
from sklearn import utils
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC ,SVC
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.calibration import CalibratedClassifierCV

In [0]:
from gensim.models.word2vec import Word2Vec

In [0]:
from sklearn.metrics import accuracy_score

In [0]:
# tokenize based on punctuation using nltk
punk_sent_tokenizer = PunktSentenceTokenizer(webtext.raw('overheard.txt'))

# nltk's built-in sentiment analyzer
vader = SentimentIntensityAnalyzer()

In [0]:
# text pre-processing for the tf-idf model
def tfidf_Preprocessing(text , _stopwords):
    # stemming - using nltk's PorterStemmer
    stemmer = PorterStemmer()
    text = text.replace('-' , '')
    text = text.replace('.' , '')
    text = text.replace('”' , '')
    text = text.replace('’' , '')
    text = text.replace('“' , '')
    text = text.replace('‘' , '')
    text = text.replace('–','')
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return ' '.join([stemmer.stem(word).lower() for word in WordPunctTokenizer().tokenize(nopunc) 
                    if stemmer.stem(word).lower() not in _stopwords])

In [0]:
# calculating word frequencies for a review text
def tfidf_represent_text(text ):
    tokens = WordPunctTokenizer().tokenize(text)
    frequency = defaultdict(int)
    for token in tokens:
        frequency[token] += 1
    return frequency

In [0]:
# constructing word vocabulary from texts - calculating word frequencies and storing them in dictionaries
def tfidf_extract_vocabulary(texts , ft):
    occurrences=defaultdict(int)
    for text in texts:
        text_occurrences=tfidf_represent_text(text)
        for ngram in text_occurrences:
            if ngram in occurrences:
                occurrences[ngram]+=text_occurrences[ngram]
            else:
                occurrences[ngram]=text_occurrences[ngram]
    vocabulary=[]
    for i in occurrences.keys():
        if occurrences[i]>=ft:
            vocabulary.append(i)
    return vocabulary

In [0]:
# calculating n-gram frequency for a review text
def ngram_represent_text(text,n):
    if n>0:
        tokens = [text[i:i+n] for i in range(len(text)-n+1)]
    frequency = defaultdict(int)
    for token in tokens:
        frequency[token] += 1
    return frequency

In [0]:
# constructing n-gram vocabulary from texts - calculating n-gram frequencies and storing them in dictionaries
def ngram_extract_vocabulary(texts,n,ft):
    occurrences=defaultdict(int)
    for text in texts:
        text_occurrences=ngram_represent_text(text,n)
        for ngram in text_occurrences:
            if ngram in occurrences:
                occurrences[ngram]+=text_occurrences[ngram]
            else:
                occurrences[ngram]=text_occurrences[ngram]
    vocabulary=[]
    for i in occurrences.keys():
        if occurrences[i]>=ft:
            vocabulary.append(i)
    return vocabulary

In [0]:
def buildWordVector(imdb_w2v, text, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += imdb_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [0]:
max_abs_scaler = preprocessing.MaxAbsScaler()
stopwords_list = {'en': set(stopwords.words('english')) , 'fr':set(stopwords.words('french')),
                  'sp': set(stopwords.words('spanish')) , 'it':set(stopwords.words('italian'))}

In [0]:
# hyperparameters
pt = 0.08
ft = 5
n = 4

In [0]:
# placeholders for training and testing data
train_set , train_labels = [], []
test_set , test_labels = [] , []

# reading training and testing data from pickle files
train_data, test_data = None, None
with open(root_dir + "/data/train_data.pickle", "rb") as f:
    train_data = pickle.load(f)
with open(root_dir + "/data/test_data.pickle", "rb") as f:
    test_data = pickle.load(f)

# populating training and testing placeholders with data
train_set = train_data['train_texts']
train_labels = train_data['train_labels']
test_set = test_data['test_texts']
test_labels = test_data['test_labels']

In [0]:
def get_chunks(l, n):
    n = max(1, n)
    return [l[i:i+n] for i in range(0, len(l), n)]

from statistics import mean
word_counts = [text.count(" ") for text in test_set]
mean(word_counts)

# 182 words is quite short
# Try to join 5 tests texts together
longer_test_texts = get_chunks(test_set, 5)
longer_test_labels = get_chunks(test_labels, 5)

all([len(set(x)) == 1 for x in longer_test_labels])  # Make sure that all combined labels are the same

test_set = ['\n'.join(chunk) for chunk in longer_test_texts]
test_labels = [chunk[0] for chunk in longer_test_labels]

In [0]:
author_dict = {}
n_parts = 10
for review, author in zip(train_set, train_labels):
    n_chars = len(review) // n_parts
    author_dict[author] = [review[i:i+n_chars] for i in range(0, len(review), n_chars)]

new_train_set, new_train_labels = [], []
for author, reviews in author_dict.items():
    new_train_set.extend(reviews)
    new_train_labels.extend([author] * len(reviews))

# for author, reviews in author_dict.items():
#     print('author: {}\t\tnum_reviews: {}'.format(author, len(reviews)))
# print('len(new_train_set): {}\t\tlen(new_train_labels): {}'.format(len(new_train_set), len(new_train_labels)))

train_set = new_train_set
train_labels = new_train_labels

In [0]:
# converting the text into tf-idf vectors
tfidf_train_set = [tfidf_Preprocessing(text , stopwords_list['en']) 
                for text in train_set]
tfidf_test_set = [tfidf_Preprocessing(text , stopwords_list['en'])
                for text in test_set]

In [0]:
len(test_set)

500

In [0]:
# defining the train set, test set and embed dim for word2vec model
word2vec_train_set = [text.split() for text in tfidf_train_set]
word2vec_test_set = [text.split() for text in tfidf_test_set]
n_dim = 300

# defining the skip-gram model for word2vec
word2vec_model = Word2Vec(sg=1, size=n_dim, min_count=1, workers=7)
word2vec_model.build_vocab(word2vec_train_set)

#training the word2vec model on train set and test set to get word embeddings for vocabulary
for epoch in range(20):
    word2vec_model.train(word2vec_train_set ,total_examples=word2vec_model.corpus_count, epochs=5)
for epoch in range(20):
    word2vec_model.train(word2vec_test_set ,total_examples=word2vec_model.corpus_count, epochs=5)

In [0]:
# concatenating the generated word embeddings into sentence embeddings for train and test sets
# and also scaling them.
word2vec_train = np.concatenate([buildWordVector(word2vec_model, text , n_dim) for text in word2vec_train_set])
word2vec_train = scale(word2vec_train)
word2vec_test = np.concatenate([buildWordVector(word2vec_model, text , n_dim) for text in word2vec_test_set])
word2vec_test = scale(word2vec_test)

  
  


In [0]:
# word2vec scaling using max absolute scaler
word2vec_scaled_train_data = max_abs_scaler.fit_transform(word2vec_train)
word2vec_scaled_test_data = max_abs_scaler.transform(word2vec_test)

In [0]:
# training and predicting using word2vec model (which uses logistic regression)
word2vec_clf = CalibratedClassifierCV(OneVsRestClassifier(LogisticRegression(C=0.01)))
word2vec_clf.fit(word2vec_scaled_train_data, train_labels)
word2vec_predictions = word2vec_clf.predict(word2vec_scaled_test_data)
word2vec_proba = word2vec_clf.predict_proba(word2vec_scaled_test_data)



In [0]:
# training and predicting using tf-idf model (which uses linear SVC)
tfidf_vocab = tfidf_extract_vocabulary(tfidf_train_set , ft )
tfidf_vectorizer = TfidfVectorizer(vocabulary=tfidf_vocab, norm=None, strip_accents=False)

In [0]:
tfidf_train_data = tfidf_vectorizer.fit_transform(tfidf_train_set)
tfidf_test_data = tfidf_vectorizer.fit_transform(tfidf_test_set)

In [0]:
tfidf_scaled_train_data = max_abs_scaler.fit_transform(tfidf_train_data)
tfidf_scaled_test_data = max_abs_scaler.transform(tfidf_test_data)

In [0]:
tfidf_clf = CalibratedClassifierCV(OneVsRestClassifier(LinearSVC(C=0.01)))
tfidf_clf.fit(tfidf_scaled_train_data, train_labels)
tfidf_predictions = tfidf_clf.predict(tfidf_scaled_test_data)
tfidf_proba = tfidf_clf.predict_proba(tfidf_scaled_test_data)



In [0]:
# training and predicting using n-gram model (which uses SVC)
ngram_vocabulary = ngram_extract_vocabulary(train_set , n , ft)
ngram_vectorizer = CountVectorizer(strip_accents=False, analyzer='char',ngram_range=(n,n),lowercase=False,vocabulary=ngram_vocabulary)  
ngram_train_data = ngram_vectorizer.fit_transform(train_set)
ngram_train_data = ngram_train_data.astype(float)

In [0]:
for i in range(len(train_set)):
    ngram_train_data[i]=ngram_train_data[i]/len(train_set[i])
ngram_test_data = ngram_vectorizer.transform(test_set)
ngram_test_data = ngram_test_data.astype(float)
for i in range(len(test_set)):
    ngram_test_data[i] = ngram_test_data[i]/len(test_set[i])

In [0]:
ngram_scaled_train_data = max_abs_scaler.fit_transform(ngram_train_data)
ngram_scaled_test_data = max_abs_scaler.transform(ngram_test_data)

In [0]:
ngram_clf = CalibratedClassifierCV(OneVsRestClassifier(SVC(C=0.01 , kernel='linear')))
ngram_clf.fit(ngram_scaled_train_data, train_labels)
ngram_predictions = ngram_clf.predict(ngram_scaled_test_data)
ngram_proba = ngram_clf.predict_proba(ngram_scaled_test_data)



In [0]:
# building an average ensemble classifier using word2vec_clf, tfidf_clf and ngram_clf
# and selecting the prediction with the highest probability
proba = []
predictions = []
candidates = [i for i in range(50)]
for i in range(0,len(test_set)):
    proba.append((word2vec_proba[i] + ngram_proba[i] + tfidf_proba[i])/3)
    predictions.append(candidates[np.argmax(proba[i])])
# print(predictions)

print(accuracy_score(predictions, test_labels))
# print('\t',count ,'texts left unattributed')
# out_data =[{'unknown-text':unks[i],'predicted-author': predictions[i]} for i in range(len(test_set))]
# Save_json(os.path.join(out_dir , 'answers-'+problem['problem-name']+'.json') , out_data)
# print('\t answers saved to file answers-' + problem['problem-name'] + '.json')
# print("----------------------------------------------------------------")

0.992


In [0]:
accuracy_score(ngram_predictions, test_labels)

0.996

In [0]:
accuracy_score(word2vec_predictions, test_labels)

0.856

In [0]:
accuracy_score(tfidf_predictions, test_labels)

0.958

In [0]:
accuracy_score(predictions, test_labels)

0.992