In [15]:
import os
import re
import numpy as np
from pathlib import Path
from scipy import sparse
from sklearn import linear_model
from collections import Counter
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import sklearn.metrics as smt
from gensim.models import Word2Vec as w2v
from gensim.models import KeyedVectors
all_feats={}
auth_dict={}
auth_map={}
auth_models={}

In [4]:
def readtext(filepath):
    text=''    
    with open(filepath,'rb') as f:
        text=f.read()
    return text

In [3]:
def featurize(text):
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return 'a'
    
    lm = WordNetLemmatizer()
    features={}
    voc_count = Counter()
    text = text.lower()
    text = re.sub('"[\S\s]*?"','',text)
    lines = [len(x.split()) for x in sent_tokenize(text)]
    avg_word_pl = sum(lines)/len(lines)    
    l = [len(x.split()) for x in text.split('\n')]
    avg_word_pp = sum(l)/len(l)
    features['avg_word_pp']=avg_word_pp
    features['avg_word_pl']=avg_word_pl
    toks = word_tokenize(text)
    tok_tags = pos_tag(toks)
#     print(toks)
    lemmas=[]
    words=[]
    min_word_depths=Counter()
    max_word_depths=Counter()
    func_words = []
    for word,tag in tok_tags:
        words.append(word)
        lemmas.append(lm.lemmatize(word,get_wordnet_pos(tag)))
        wd_sysnset = wordnet.synsets(word,pos=get_wordnet_pos(tag))
        if len(wd_sysnset)>0:
            min_word_depths[wd_sysnset[0].min_depth()]+=1
            max_word_depths[wd_sysnset[0].max_depth()]+=1
        if(get_wordnet_pos(tag) =='a'):
            func_words.append(word)
            voc_count[word] +=1
    unk_toks = set(lemmas)
    unk_funk = set(func_words)
    
    type_tok_ratio = len(unk_toks)/len(toks)
    funk_freq = len(unk_funk)/len(toks)
    features['type_tok_ratio'] = type_tok_ratio
#     features['type_tok_rat'] = len(set(words))/len(words)
    features['funk_freq'] = funk_freq
    for funk_word in voc_count:
        features[funk_word+'_word_count'] = voc_count[funk_word]
    for min_depth in min_word_depths:
        features['min_word_depth'+str(min_depth)]=min_word_depths[min_depth]
    for max_depth in max_word_depths:
        features['min_word_depth'+str(max_depth)]=max_word_depths[max_depth]    
    return features

In [54]:
def get_train_data(path='./all_c'):
    data=[]
    c=0
#     print(path)
    for entry in os.listdir(path):
        if os.path.isdir(os.path.join(path, entry)):
            text = readtext(path+"/" +entry+'/'+entry+'.txt')
            text = text.decode("utf-8", "replace")
#             print(sent_tokenize(text))
            print(entry)
            toks = word_tokenize(text)
            tok_tags = pos_tag(toks)
            print(tok_tags)
            break
            for article in text.split('\t'):
                if(len(article)>0):                    
                    pass
            auth_dict[c]=entry
            auth_map[entry]=c
            c+=1
            
#             auth_models[entry]=model
    return data

In [None]:
def get_auth_model_vecs(text):
    text=text.lower()
    sen=[x.split() for x in sent_tokenize(text)]
    model = w2v(sen,workers=4,sg=1,window=5,iter=30,size=25,alpha=0.5,min_count=2)
    f_word_vecs=Counter()
    toks = word_tokenize(text)
    tok_tags = pos_tag(toks)
    for word,tag in tok_tags():
        if word not in f_word_vecs:
            if(tag=='a' or tag==wordnet.ADV):
                 f_word_vecs[word]+=1
    f_count = [(k,v) for (k,v) in f_word_vecs.items()]
    return(f_word_vecs)

In [None]:
def get_test_model_vecs(text):
    text=text.lower()
    sen=[x.split() for x in sent_tokenize(text)]
    model = w2v(sen,workers=4,sg=1,window=5,iter=100,size=25,alpha=0.5,min_count=2)
    f_word_vecs={}
    toks = word_tokenize(text)
    tok_tags = pos_tag(toks)
    for word,tag in tok_tags():
        if word not in f_word_vecs:
            if(tag=='a' or tag==wordnet.ADV):
                 f_word_vecs[word]=model.wv[word]
    return(f_word_vecs)

In [55]:
get_train_data()

John Tierney
[('As', 'IN'), ('the', 'DT'), ('baby', 'NN'), ('boomers', 'NNS'), ('age', 'NN'), (',', ','), ('more', 'JJR'), ('and', 'CC'), ('more', 'JJR'), ('Americans', 'NNS'), ('will', 'MD'), ('either', 'RB'), ('be', 'VB'), ('enduring', 'VBG'), ('chronic', 'JJ'), ('pain', 'NN'), ('or', 'CC'), ('taking', 'VBG'), ('care', 'NN'), ('of', 'IN'), ('someone', 'NN'), ('in', 'IN'), ('pain', 'NN'), ('.', '.'), ('The', 'DT'), ('Republican', 'NNP'), ('Party', 'NNP'), ('has', 'VBZ'), ('been', 'VBN'), ('reaching', 'VBG'), ('out', 'IN'), ('to', 'TO'), ('them', 'PRP'), ('with', 'IN'), ('a', 'DT'), ('two-step', 'JJ'), ('plan', 'NN'), (':', ':'), ('1', 'CD'), ('.', '.'), ('Do', 'VB'), ('not', 'RB'), ('give', 'VB'), ('patients', 'NNS'), ('medicine', 'VB'), ('to', 'TO'), ('ease', 'VB'), ('their', 'PRP$'), ('pain', 'NN'), ('.', '.'), ('2', 'CD'), ('.', '.'), ('If', 'IN'), ('they', 'PRP'), ('are', 'VBP'), ('in', 'IN'), ('great', 'JJ'), ('pain', 'NN'), ('and', 'CC'), ('near', 'IN'), ('death', 'NN'), (',', '

[]

In [5]:
def get_test_data(path='./all_c'):
    data=[]
    for entry in os.listdir(path):
#         print(entry)
#         print(os.path.join(path, entry+'/test'))
        if os.path.isdir(os.path.join(path, entry+'/test')):
#             print(Path(path+'\\'+entry+'\\test'))
            for tst_file in os.listdir(path+'/'+entry+'/test'):
                if(tst_file.endswith('.txt')):
                    text = readtext(path+'/'+entry+'/test/'+tst_file)
                    text = text.decode("utf-8", "replace")
                    for article in text.split('\t'):
                        if(len(article)>0):
                            data.append((entry,featurize(article)))
    return data

In [6]:
#print(get_train_data())

In [7]:
#print(get_test_data())

In [9]:
# lg=create_model()

In [48]:
for entry in auth_models:
    print(auth_models[entry].wv.get_vector('the'))

[-2.2165496  -0.8362947  -2.4786072   2.9825795  -0.3515947   2.6692922
  1.0709597  -1.9138005  -3.982295   -0.30632    -8.614086   -3.911534
 -2.3926356   0.90972894 -4.425423    3.4164813   2.1531744   5.603564
 -5.794552   -5.6073055   2.9156382  -2.8862238  -5.1438212  -1.8536321
 -0.98020303]
[-3.6395376   3.5239484  -2.413028   -0.01523563  2.1680498  -6.9116898
 -2.6881216   0.17481409 -2.2968683  -1.8438929   4.9487815  -5.5921187
 -6.7270637  -2.163019    4.499947   -0.8773965   2.1433742   2.1526945
 -0.10759402  7.5083613   1.1696159  -9.626397    6.068465    5.847292
 -2.6754718 ]
[ -1.030588    1.5819476   4.2157173  -1.8481706  -5.8749976  -0.7918426
   5.1927943 -12.421817    6.2689457  -2.0081372   1.3315518   1.3670633
   2.526204   -0.4596413  -1.3655807   4.0971856  -6.1675997   4.4253874
  -5.1282477   1.3306744   5.82034   -14.081058    4.5760546   6.45884
  -1.4913276]
[  2.6816773   -6.0603666    3.6562374   -1.6355301   -0.15821932
   0.36337078   1.2788473    

In [31]:
print(len(auth_dict))

0


In [58]:
import numpy as np
np.arange(0.01,1,0.01)

array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
       0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22,
       0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32, 0.33,
       0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44,
       0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54, 0.55,
       0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66,
       0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77,
       0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88,
       0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99])