In [1]:
import os
import re
import numpy as np
import sklearn.metrics as smt
from pathlib import Path
from scipy import sparse
from sklearn import linear_model
from collections import Counter
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from gensim.models import Word2Vec as w2v
from gensim.models import KeyedVectors

all_feats={}
auth_dict={}
auth_map={}
auth_models={}

l_c=1
path='./all_c'


In [2]:
def readtext(filepath):
    text=''    
    with open(filepath,'rb') as f:
        text=f.read()
    return text

In [29]:
def featurize(text):
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return 'a'
    punkts=[',','.',';','!',':','?',"''","'"]
    lm = WordNetLemmatizer()
    features={}
    voc_count = Counter()
    text = text.lower()
    text = re.sub('"[\S\s]*?"','',text)
    lines = [len(x.split()) for x in sent_tokenize(text)]
    avg_word_pl = sum(lines)/len(lines)    
    l = [len(x.split()) for x in text.split('\n')]
    avg_word_pp = sum(l)/len(l)
    features['avg_word_pp']=avg_word_pp
    features['avg_word_pl']=avg_word_pl
    toks = word_tokenize(text)
    tok_tags = pos_tag(toks)
#     print(toks)
    lemmas=[]
    words=[]
    sen=[x.split() for x in sent_tokenize(text)]
    model = w2v(sen,workers=4,sg=1,window=2,iter=30,size=25,alpha=0.5,min_count=1)
    min_word_depths=Counter()
    max_word_depths=Counter()
    func_words = []
    for word,tag in tok_tags:
        if word not in punkts:
            words.append(word)
            lemmas.append(lm.lemmatize(word,get_wordnet_pos(tag)))
            wd_sysnset = wordnet.synsets(word,pos=get_wordnet_pos(tag))
            if len(wd_sysnset)>0:
                min_word_depths[wd_sysnset[0].min_depth()]+=1
                max_word_depths[wd_sysnset[0].max_depth()]+=1
            if(get_wordnet_pos(tag) =='a' or get_wordnet_pos(tag) == wordnet.ADV):
                func_words.append(word)
                voc_count[word] +=1
    unk_toks = set(lemmas)
    unk_funk = set(func_words)
    
    f_word_count=[(k,v) for (k,v) in voc_count.items() if k in model.wv.vocab.keys()]
    f_word_count.sort(key=lambda x: x[1],reverse=True)
    av_distance=0
    most_freq=[k for k,_ in f_word_count[:50]]
    lest_freq=[k for k,_ in f_word_count[100:]]
    for w in most_freq:
        av_distance+=sum(model.wv.distances(w,model.wv.vocab.keys()))/len(model.wv.vocab.keys())
    av_distance/=len(text.split())/len(func_words)
    features['av_dist']=av_distance
#     print('av_dist',av_distance)
    
    
    type_tok_ratio = len(unk_toks)/len(toks)
    funk_freq = len(unk_funk)/len(toks)
    features['type_tok_ratio'] = type_tok_ratio
#     features['type_tok_rat'] = len(set(words))/len(words)
    features['funk_freq'] = funk_freq
    for funk_word in voc_count:
        features[funk_word+'_word_count'] = voc_count[funk_word]
    for min_depth in min_word_depths:
        features['min_word_depth'+str(min_depth)]=min_word_depths[min_depth]
    for max_depth in max_word_depths:
        features['min_word_depth'+str(max_depth)]=max_word_depths[max_depth]    
    return features

In [4]:
def find_av(text):
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return 'a'
    punkts=[',','.',';','!',':','?',"''","'"]
    voc_count = Counter()
    text = text.lower()
    text = re.sub('"[\S\s]*?"','',text)
    toks = word_tokenize(text)
    tok_tags = pos_tag(toks)
    sen=[x.split() for x in sent_tokenize(text)]
    model = w2v(sen,workers=4,sg=0,window=4,iter=50,size=25,alpha=0.75,min_count=1)
    func_words = []
    for word,tag in tok_tags:
        if word not in punkts:
            if(get_wordnet_pos(tag) =='a' or get_wordnet_pos(tag) == wordnet.ADV):
                func_words.append(word)
                voc_count[word] +=1
    
    f_word_count=[(k,v) for (k,v) in voc_count.items() if k in model.wv.vocab.keys()]
    f_word_count.sort(key=lambda x: x[1],reverse=True)
    av_distance=0
    most_freq=[k for k,_ in f_word_count[:100]]
    lest_freq=[k for k,_ in f_word_count[100:]]
    for w in most_freq:
        av_distance+=sum(model.wv.distances(w,most_freq))/len(most_freq)
    av_distance/=len(text.split())/len(func_words)
    return({'av_dist':av_distance})


In [5]:
def get_auth_model_av_dist():
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return 'a'
    punkts=[',',';','!','.',':','?',"'","''"]
    for entry in os.listdir(path):
        if os.path.isdir(os.path.join(path, entry)):
            text = readtext(path+"/" +entry+'/'+entry+'.txt')
            text = text.decode("utf-8", "replace")
            text=text.lower()
            sen=[x.split() for x in sent_tokenize(text)]
            model = w2v(sen,workers=4,sg=0,window=5,iter=30,size=25,alpha=0.95,min_count=1)
            f_word_vecs=Counter()
            toks = word_tokenize(text)
            tok_tags = pos_tag(toks)
#             print(tok_tags)
            for word,tag in tok_tags:
                if word not in punkts:
                    if(get_wordnet_pos(tag)=='a' or get_wordnet_pos(tag)==wordnet.ADV):
#                         print(word)
                        f_word_vecs[word]+=1
            
            f_word_count=[(k,v) for (k,v) in f_word_vecs.items() if k in model.wv.vocab.keys()]
            f_word_count.sort(key=lambda x: x[1],reverse=True)
            av_distance=0
            most_freq=[k for k,_ in f_word_count[:100]]
            lest_freq=[k for k,_ in f_word_count[100:]]
#             print(most_freq)
            for w in most_freq:
#                 print(model.wv.distances(w,most_freq))
                av_distance+=sum(model.wv.distances(w,most_freq))/len(most_freq)
            av_distance/=len(text.split())/len(f_word_vecs)
#             print('av_distance',av_distance,entry)
            auth_models[entry]=av_distance

In [6]:
#get_auth_model_av_dist()

In [7]:
def get_train_data():
    get_auth_model_av_dist() # load author av_dist 
    data=[]
    c=0
#     print(path)
    for entry in os.listdir(path):
        av_dist=[]
        if os.path.isdir(os.path.join(path, entry)):
            text = readtext(path+"/" +entry+'/'+entry+'.txt')
            text = text.decode("utf-8", "replace")
            for article in text.split('\t'):
                if(len(article)>0):
                    feats = featurize(article)
#                     feats.update({'av_dist':auth_models[entry]})
                    data.append((entry,feats))
                    av_dist += [feats['av_dist']]
            print('auth',entry,'av',sum(av_dist)/len(av_dist))
            print(av_dist)
            print()
            auth_dict[c]=entry
            auth_map[entry]=c
            c+=1
    return data

In [8]:
def get_test_data():
    data=[]
    for entry in os.listdir(path):
#         print(entry)
#         print(os.path.join(path, entry+'/test'))
        if os.path.isdir(os.path.join(path, entry+'/test')):
#             print(Path(path+'\\'+entry+'\\test'))
            for tst_file in os.listdir(path+'/'+entry+'/test'):
                if(tst_file.endswith('.txt')):
                    text = readtext(path+'/'+entry+'/test/'+tst_file)
                    text = text.decode("utf-8", "replace")
                    for article in text.split('\t'):
                        if(len(article)>0):
                            feats = featurize(article)
#                             feats.update(find_av(article))
                            data.append((entry,feats))
    return data

In [9]:
#print(get_train_data())

In [10]:
#print(get_test_data())

In [11]:
def create_model(l2_p):
    global all_feats
    fid=0
    data = get_train_data()
    for (label,feats) in data:
        for feat in feats:
            if feat not in all_feats:
                all_feats[feat] = fid
                fid+=1
    D = len(data)
    F = len(all_feats)
    X = sparse.dok_matrix((D,F))
    Y = np.zeros(len(data))
    for idx,(label,feats) in enumerate(data):
        for feat in feats:
            X[idx,all_feats[feat]] = feats[feat]
            Y[idx] = auth_map[label]
    logreg = linear_model.LogisticRegression(C=l2_p)
    logreg.fit(X,Y)
#     print(logreg.score(X,Y))
    return(logreg)

In [12]:
# lg=create_model()

In [13]:
def evaluate(l2_p):
    test_data = get_test_data()
    lm = create_model(l2_p)
    D = len(test_data)
    F = len(all_feats)
    X = sparse.dok_matrix((D,F))
    Y = np.arange(len(test_data),dtype=int)
    true_labels=[]
    for idx,(label,feats) in enumerate(test_data):
        true_labels.append(auth_map[label])
        for feat in feats:
            if feat in all_feats:
                X[idx,all_feats[feat]] = feats[feat]
    
    preds=lm.predict(X)
    probs = lm.predict_proba(X)
    idx=0
    
    p,r,f1, _ = smt.precision_recall_fscore_support(true_labels,preds,warn_for=())
    preds = [int(x) for x in preds]
    print(preds)
    print(true_labels)
    print()
#     for prob in probs:
#         print('true_label',true_labels[idx])
#         print(prob)
#         idx+=1
#     print()
    w_c=0
    for i in range(len(preds)):
        if preds[i]!=true_labels[i]:
            w_c+=1
    print(w_c,'incorrect out of',len(preds))
    print()
#     for i in range(len(auth_dict)):
#         print('Author',auth_dict[i])
#         print('precision',round(p[i],4))
#         print('recall',round(r[i],4))
#         print('f1score',round(f1[i],4))
#         print()
#     for i in range(len(true_labels)):
#         print('true author:',auth_dict[true_labels[i]])
#         print('predicted author:',auth_dict[preds[i]])
#         print()
    av_f1={'avg f1':round(sum(f1)/len(f1),2)}
    print(av_f1)
    print('-----------------------')
    return(av_f1)

In [30]:
evaluate(1)

auth John Tierney av 19.670161673509323
[19.186731402056576, 19.748558134665842, 20.04268476888552, 19.69989631955671, 19.186352039987852, 20.156747375903418]

auth Simon Romero av 18.513901642997975
[19.125739642324536, 18.917961177077782, 18.30419418615027, 19.008030748205954, 18.19860152295183, 18.434200184636012, 18.44011992880339, 17.37511508845905, 18.821152308372966]

auth Adam Liptak av 18.139009238667708
[19.57058957125566, 17.047372185160146, 18.54357440733995, 18.651155480780357, 18.225155712955225, 18.66138640886046, 17.49591010642487, 18.270515400755805, 16.7854238744769]

auth Alessandra Stanley av 20.550210801147607
[19.56492011817019, 19.925933876172554, 20.736279775360522, 21.420354078916006, 20.37753967971675, 21.27623727854962]

auth Terry Pristin av 18.452151757008767
[17.499867743654033, 19.40384448042395, 17.807047805800828, 19.44203207486707, 18.149475729859912, 18.410642707446826]

auth Robert Pear av 17.74031579421867
[17.7612365448706, 17.577805655348364, 17.3



{'avg f1': 0.64}

In [15]:
# l_cs = np.arange(0.01,0.03,0.01)
# res = []
# for lc in l_cs:
#     res+=evaluate(lc)
# print('best penalty',l_cs[res.index(max(res))])

In [16]:
print(auth_map)

{'John Tierney': 0, 'Simon Romero': 1, 'Adam Liptak': 2, 'Alessandra Stanley': 3, 'Terry Pristin': 4, 'Robert Pear': 5, 'Sarah Kreshaw': 6, 'Adam Nagoury': 7, 'Maureen Dowd': 8, 'Nicholas': 9, 'Alan Ridding': 10}
