In [1]:
import os
import re
import numpy as np
from pathlib import Path
from scipy import sparse
from sklearn import linear_model
from collections import Counter
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import sklearn.metrics as smt
# Feature name -> column index in the design matrix (filled in by create_model).
all_feats={}
# Integer class id -> author directory name (filled in by get_train_data).
auth_dict={}
# Author directory name -> integer class id (filled in by get_train_data).
auth_map={}

In [2]:
def readtext(filepath):
    """Return the complete contents of ``filepath`` as raw ``bytes``.

    The file is opened in binary mode, so no decoding happens here; callers
    decode the result themselves.
    """
    with open(filepath, mode='rb') as handle:
        return handle.read()

In [3]:
def featurize(text):
    """Build a dict of stylometric features for one article.

    The text is lower-cased and every double-quoted span is removed before
    any feature is computed.

    Features produced (keys of the returned dict):
      * ``avg_word_pp``   - mean whitespace-separated words per ``'\\n'``-delimited line.
      * ``avg_word_pl``   - mean whitespace-separated words per sentence.
      * ``type_tok_ratio``- distinct lemmas divided by token count.
      * ``funk_freq``     - distinct function words divided by token count.
      * ``<word>_word_count`` - occurrences of each function word.
      * ``min_word_depth<d>`` / ``max_word_depth<d>`` - number of tokens whose
        first WordNet synset has min / max depth ``d``.

    A "function word" is any token whose POS tag is not an adjective, verb,
    noun or adverb tag.

    Parameters
    ----------
    text : str
        The article text.

    Returns
    -------
    dict
        Feature name -> numeric value. The ratio/average features are ``0.0``
        when the text contains no sentences or no tokens.
    """
    def get_wordnet_pos(treebank_tag):
        """Map a Treebank tag to a WordNet POS constant, or None for non-content tags."""
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        # The original fallback was the literal 'a', which is the same value
        # as wordnet.ADJ and made adjectives indistinguishable from
        # function words. None keeps the two cases apart.
        return None

    lm = WordNetLemmatizer()
    features = {}
    voc_count = Counter()
    text = text.lower()
    # Raw string: '\S' is an invalid escape in a normal string literal.
    text = re.sub(r'"[\S\s]*?"', '', text)

    sentence_lengths = [len(x.split()) for x in sent_tokenize(text)]
    # Guard: a text that is empty after quote stripping has no sentences.
    avg_word_pl = sum(sentence_lengths) / len(sentence_lengths) if sentence_lengths else 0.0
    # str.split('\n') always yields at least one element, so no guard needed.
    line_lengths = [len(x.split()) for x in text.split('\n')]
    avg_word_pp = sum(line_lengths) / len(line_lengths)
    features['avg_word_pp'] = avg_word_pp
    features['avg_word_pl'] = avg_word_pl

    toks = word_tokenize(text)
    tok_tags = pos_tag(toks) if toks else []

    lemmas = []
    min_word_depths = Counter()
    max_word_depths = Counter()
    func_words = []
    for word, tag in tok_tags:
        content_pos = get_wordnet_pos(tag)
        # Non-content tokens are still lemmatized / looked up as adjectives,
        # matching the previous fallback behaviour.
        lookup_pos = content_pos if content_pos is not None else wordnet.ADJ
        lemmas.append(lm.lemmatize(word, lookup_pos))
        wd_synsets = wordnet.synsets(word, pos=lookup_pos)
        if wd_synsets:
            first_synset = wd_synsets[0]
            min_word_depths[first_synset.min_depth()] += 1
            max_word_depths[first_synset.max_depth()] += 1
        if content_pos is None:
            func_words.append(word)
            voc_count[word] += 1

    n_toks = len(toks)
    # Guard: avoid ZeroDivisionError on token-less input.
    type_tok_ratio = len(set(lemmas)) / n_toks if n_toks else 0.0
    funk_freq = len(set(func_words)) / n_toks if n_toks else 0.0
    features['type_tok_ratio'] = type_tok_ratio
    features['funk_freq'] = funk_freq

    for funk_word, count in voc_count.items():
        features[funk_word + '_word_count'] = count
    for min_depth, count in min_word_depths.items():
        features['min_word_depth' + str(min_depth)] = count
    for max_depth, count in max_word_depths.items():
        # Previously written under 'min_word_depth', which silently
        # overwrote the min-depth histogram.
        features['max_word_depth' + str(max_depth)] = count
    return features

In [4]:
def get_train_data(path='./all_c'):
    """Load and featurize the training articles of every author under ``path``.

    Each sub-directory ``<path>/<author>`` must contain ``<author>.txt``; the
    file is decoded as UTF-8 (undecodable bytes replaced) and split on tab
    characters into articles.

    Side effects: rebuilds the module-level ``auth_dict`` (id -> author) and
    ``auth_map`` (author -> id). Ids are assigned in sorted directory order
    so they are reproducible across runs and platforms.

    Parameters
    ----------
    path : str
        Root directory holding one sub-directory per author.

    Returns
    -------
    list of (str, dict)
        ``(author, features)`` pairs, one per non-blank article.
    """
    data = []
    # Drop mappings left over from a previous call (e.g. a notebook re-run
    # against a different directory); ids restart at 0 below anyway.
    auth_dict.clear()
    auth_map.clear()
    author_id = 0
    # os.listdir order is arbitrary; sort for deterministic class ids.
    for entry in sorted(os.listdir(path)):
        author_dir = os.path.join(path, entry)
        if not os.path.isdir(author_dir):
            continue
        raw = readtext(os.path.join(author_dir, entry + '.txt'))
        text = raw.decode("utf-8", "replace")
        for article in text.split('\t'):
            # Skip whitespace-only segments too (e.g. a trailing "\t\n"),
            # which contain no sentences to featurize.
            if article.strip():
                data.append((entry, featurize(article)))
        auth_dict[author_id] = entry
        auth_map[entry] = author_id
        author_id += 1
    return data

In [5]:
def get_test_data(path='./all_c'):
    """Load and featurize every test article found under ``path``.

    For each entry of ``path`` that has a ``test`` sub-directory, every
    ``*.txt`` file in it is decoded as UTF-8 (undecodable bytes replaced) and
    split on tab characters into articles.

    Parameters
    ----------
    path : str
        Root directory holding one sub-directory per author.

    Returns
    -------
    list of (str, dict)
        ``(author, features)`` pairs, one per non-blank article, in sorted
        author / file order.
    """
    data = []
    # os.listdir order is arbitrary; sort so the output order is reproducible.
    for entry in sorted(os.listdir(path)):
        test_dir = os.path.join(path, entry, 'test')
        if not os.path.isdir(test_dir):
            continue
        for tst_file in sorted(os.listdir(test_dir)):
            if not tst_file.endswith('.txt'):
                continue
            raw = readtext(os.path.join(test_dir, tst_file))
            text = raw.decode("utf-8", "replace")
            for article in text.split('\t'):
                # Skip whitespace-only segments too (e.g. a trailing "\t\n"),
                # which contain no sentences to featurize.
                if article.strip():
                    data.append((entry, featurize(article)))
    return data

In [6]:
#print(get_train_data())

In [7]:
#print(get_test_data())

In [8]:
def create_model():
    """Train a logistic-regression author classifier on the training data.

    Side effects: rebuilds the module-level ``all_feats`` vocabulary
    (feature name -> column index) from the training set, and, through
    ``get_train_data``, the author id mappings.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        Model fitted on the training articles; class labels are the integer
        author ids from ``auth_map``.
    """
    global all_feats
    data = get_train_data()

    # Reset the vocabulary in place: the old code restarted its id counter at
    # 0 while keeping earlier entries, so a re-run could hand out column
    # indices that were already taken.
    all_feats.clear()
    for _label, feats in data:
        for feat in feats:
            if feat not in all_feats:
                all_feats[feat] = len(all_feats)

    D = len(data)
    F = len(all_feats)
    X = sparse.dok_matrix((D, F))
    Y = np.zeros(D, dtype=int)
    for idx, (label, feats) in enumerate(data):
        # Assign the label once per row, independent of the feature loop, so
        # a row with no features still gets its true class.
        Y[idx] = auth_map[label]
        for feat, value in feats.items():
            X[idx, all_feats[feat]] = value

    logreg = linear_model.LogisticRegression(C=0.01)
    # DOK is convenient for incremental construction; CSR is the format the
    # estimator works with.
    logreg.fit(X.tocsr(), Y)
    return logreg

In [9]:
# lg=create_model()

In [10]:
def evaluate():
    """Train the model, predict the test articles and print the results.

    Prints precision / recall / F1 for every known author, followed by the
    true and predicted author of each test article. Features that were not
    seen during training are ignored when building the test matrix.
    """
    test_data = get_test_data()
    model = create_model()

    X = sparse.dok_matrix((len(test_data), len(all_feats)))
    true_labels = []
    for idx, (label, feats) in enumerate(test_data):
        true_labels.append(auth_map[label])
        for feat, value in feats.items():
            col = all_feats.get(feat)
            if col is not None:
                X[idx, col] = value

    # Cast so predictions are plain integer ids regardless of the dtype the
    # model was trained with.
    preds = model.predict(X.tocsr()).astype(int)

    # Pass the full label list explicitly: without it the metric arrays only
    # cover labels that occur in the truth or predictions, so indexing them
    # by author id can be misaligned or raise IndexError.
    label_ids = sorted(auth_dict)
    p, r, f1, _ = smt.precision_recall_fscore_support(
        true_labels, preds, labels=label_ids)

    for pos, label_id in enumerate(label_ids):
        print('Author', auth_dict[label_id])
        print('precision', round(p[pos], 4))
        print('recall', round(r[pos], 4))
        print('f1score', round(f1[pos], 4))
        print()
    for true_id, pred_id in zip(true_labels, preds):
        print('true author:', auth_dict[true_id])
        print('predicted author:', auth_dict[pred_id])
        print()
In [None]:
# Load the test set, train the classifier and print per-author metrics
# followed by the true/predicted author of each test article.
evaluate()

In [None]:
# Inspect the id -> author and author -> id mappings built while loading
# the training data.
print(auth_dict)
print(auth_map)

In [None]:
# Display the feature-name -> column-index vocabulary built by create_model.
all_feats