In [1]:
import os
import re
import numpy as np
from pathlib import Path
from scipy import sparse
from sklearn import linear_model
from collections import Counter
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import sklearn.metrics as smt
all_feats={}
auth_dict={}
auth_map={}
auth_models={}

In [4]:
def readtext(filepath):
    text=''    
    with open(filepath,'rb') as f:
        text=f.read()
    return text

In [3]:
def featurize(text):
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return 'a'
    
    lm = WordNetLemmatizer()
    features={}
    voc_count = Counter()
    text = text.lower()
    text = re.sub('"[\S\s]*?"','',text)
    lines = [len(x.split()) for x in sent_tokenize(text)]
    avg_word_pl = sum(lines)/len(lines)    
    l = [len(x.split()) for x in text.split('\n')]
    avg_word_pp = sum(l)/len(l)
    features['avg_word_pp']=avg_word_pp
    features['avg_word_pl']=avg_word_pl
    toks = word_tokenize(text)
    tok_tags = pos_tag(toks)
#     print(toks)
    lemmas=[]
    words=[]
    min_word_depths=Counter()
    max_word_depths=Counter()
    func_words = []
    for word,tag in tok_tags:
        words.append(word)
        lemmas.append(lm.lemmatize(word,get_wordnet_pos(tag)))
        wd_sysnset = wordnet.synsets(word,pos=get_wordnet_pos(tag))
        if len(wd_sysnset)>0:
            min_word_depths[wd_sysnset[0].min_depth()]+=1
            max_word_depths[wd_sysnset[0].max_depth()]+=1
        if(get_wordnet_pos(tag) =='a'):
            func_words.append(word)
            voc_count[word] +=1
    unk_toks = set(lemmas)
    unk_funk = set(func_words)
    
    type_tok_ratio = len(unk_toks)/len(toks)
    funk_freq = len(unk_funk)/len(toks)
    features['type_tok_ratio'] = type_tok_ratio
#     features['type_tok_rat'] = len(set(words))/len(words)
    features['funk_freq'] = funk_freq
    for funk_word in voc_count:
        features[funk_word+'_word_count'] = voc_count[funk_word]
    for min_depth in min_word_depths:
        features['min_word_depth'+str(min_depth)]=min_word_depths[min_depth]
    for max_depth in max_word_depths:
        features['min_word_depth'+str(max_depth)]=max_word_depths[max_depth]    
    return features

In [6]:
def get_train_data(path='./all_c'):
    data=[]
    c=0
#     print(path)
    for entry in os.listdir(path):
        if os.path.isdir(os.path.join(path, entry)):
            text = readtext(path+"/" +entry+'/'+entry+'.txt')
            text = text.decode("utf-8", "replace")
            print([x.split() for x in sent_tokenize(text)])
            break
            for article in text.split('\t'):
                if(len(article)>0):
                    pass
#                     data.append((entry,featurize(article)))
            auth_dict[c]=entry
            auth_map[entry]=c
            c+=1
    return data

In [7]:
get_train_data()

[['As', 'the', 'baby', 'boomers', 'age,', 'more', 'and', 'more', 'Americans', 'will', 'either', 'be', 'enduring', 'chronic', 'pain', 'or', 'taking', 'care', 'of', 'someone', 'in', 'pain.'], ['The', 'Republican', 'Party', 'has', 'been', 'reaching', 'out', 'to', 'them', 'with', 'a', 'two-step', 'plan:', '1.'], ['Do', 'not', 'give', 'patients', 'medicine', 'to', 'ease', 'their', 'pain.'], ['2.'], ['If', 'they', 'are', 'in', 'great', 'pain', 'and', 'near', 'death,', 'do', 'not', 'let', 'them', 'put', 'an', 'end', 'to', 'their', 'misery.'], ['The', 'Republicans', 'have', 'been', 'so', 'determined', 'to', 'become', 'the', 'Pain', 'Party', 'that', "they've", 'brushed', 'aside', 'their', 'traditional', 'belief', 'in', "states'", 'rights.'], ['The', 'Bush', 'administration', 'wants', 'lawyers', 'in', 'Washington', 'and', 'federal', 'prosecutors', 'with', 'no', 'medical', 'training', 'to', 'tell', 'doctors', 'how', 'to', 'treat', 'patients.'], ['As', 'attorney', 'general,', 'John', 'Ashcroft', '

[]

In [5]:
def get_test_data(path='./all_c'):
    data=[]
    for entry in os.listdir(path):
#         print(entry)
#         print(os.path.join(path, entry+'/test'))
        if os.path.isdir(os.path.join(path, entry+'/test')):
#             print(Path(path+'\\'+entry+'\\test'))
            for tst_file in os.listdir(path+'/'+entry+'/test'):
                if(tst_file.endswith('.txt')):
                    text = readtext(path+'/'+entry+'/test/'+tst_file)
                    text = text.decode("utf-8", "replace")
                    for article in text.split('\t'):
                        if(len(article)>0):
                            data.append((entry,featurize(article)))
    return data

In [6]:
#print(get_train_data())

In [7]:
#print(get_test_data())

In [9]:
# lg=create_model()

[9, 0, 1, 7, 1, 2, 3, 2, 3, 3, 4, 4, 5, 5, 5, 10, 5, 7, 7, 5, 8, 7, 8, 9, 9, 2, 1]
[0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 10, 10]

true_label 0
[0.26611178 0.00453514 0.00407061 0.00386069 0.00957406 0.00493223
 0.00486716 0.00590099 0.12359651 0.53163671 0.04091414]
true_label 0
[0.55212557 0.00716817 0.00674045 0.00907956 0.02163855 0.00100753
 0.01368293 0.00508058 0.03879159 0.30832    0.03636507]
true_label 1
[0.00703536 0.38256152 0.02270463 0.08260056 0.22017464 0.04300634
 0.09410314 0.00715273 0.01680035 0.08986367 0.03399707]
true_label 1
[0.0289821  0.29702401 0.05649812 0.00357808 0.01618085 0.02063079
 0.00784509 0.35994868 0.17963563 0.01662414 0.0130525 ]
true_label 1
[2.34556568e-02 7.77742539e-01 8.94237267e-03 3.99852497e-03
 1.55121969e-01 5.77720797e-03 6.71462503e-04 6.00580434e-03
 6.73611284e-03 9.87572085e-03 1.67262989e-03]
true_label 2
[2.20802466e-03 2.26195351e-02 9.07474496e-01 9.27236322e-04
 1.16794937e-03 5.57911707e-



{'avg f1': 0.61}