In [1]:
import collections as coll
import math
import pickle
import string

import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
from nltk.corpus import cmudict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import nltk
import json


nltk.download('cmudict')
nltk.download('stopwords')

style.use("ggplot")
cmuDictionary = None



# ---------------------------------------------------------------------

def syllable_count_Manual(word):
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
            if word.endswith("e"):
                count -= 1
    if count == 0:
        count += 1
    return count


# ---------------------------------------------------------------------
# COUNTS NUMBER OF SYLLABLES

def syllable_count(word):
    global cmuDictionary
    d = cmuDictionary
    try:
        syl = [len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]][0]
    except:
        syl = syllable_count_Manual(word)
    return syl

    # ----------------------------------------------------------------------------


# removing stop words plus punctuation.
def Avg_wordLength(str):
    str.translate(string.punctuation)
    tokens = word_tokenize(str, language='english')
    st = [",", ".", "'", "!", '"', "#", "$", "%", "&", "(", ")", "*", "+", "-", ".", "/", ":", ";", "<", "=", '>', "?",
          "@", "[", "\\", "]", "^", "_", '`', "{", "|", "}", '~', '\t', '\n']
    stop = stopwords.words('english') + st
    words = [word for word in tokens if word not in stop]
    return np.average([len(word) for word in words])


# ----------------------------------------------------------------------------


# returns avg number of characters in a sentence
def Avg_SentLenghtByCh(text):
    tokens = sent_tokenize(text)
    return np.average([len(token) for token in tokens])


# ----------------------------------------------------------------------------

# returns avg number of words in a sentence
def Avg_SentLenghtByWord(text):
    tokens = sent_tokenize(text)
    return np.average([len(token.split()) for token in tokens])


# ----------------------------------------------------------------------------


# GIVES NUMBER OF SYLLABLES PER WORD
def Avg_Syllable_per_Word(text):
    tokens = word_tokenize(text, language='english')
    st = [",", ".", "'", "!", '"', "#", "$", "%", "&", "(", ")", "*", "+", "-", ".", "/", ":", ";", "<", "=", '>', "?",
          "@", "[", "\\", "]", "^", "_", '`', "{", "|", "}", '~', '\t', '\n']
    stop = stopwords.words('english') + st
    words = [word for word in tokens if word not in stop]
    syllabls = [syllable_count(word) for word in words]
    p = (" ".join(words))
    return sum(syllabls) / max(1, len(words))


# -----------------------------------------------------------------------------

# COUNTS SPECIAL CHARACTERS NORMALIZED OVER LENGTH OF CHUNK
def CountSpecialCharacter(text):
    st = ["#", "$", "%", "&", "(", ")", "*", "+", "-", "/", "<", "=", '>',
          "@", "[", "\\", "]", "^", "_", '`', "{", "|", "}", '~', '\t', '\n']
    count = 0
    for i in text:
        if (i in st):
            count = count + 1
    return count / len(text)


# ----------------------------------------------------------------------------

def CountPuncuation(text):
    st = [",", ".", "'", "!", '"', ";", "?", ":", ";"]
    count = 0
    for i in text:
        if (i in st):
            count = count + 1
    return float(count) / float(len(text))


# ----------------------------------------------------------------------------
# RETURNS NORMALIZED COUNT OF FUNCTIONAL WORDS FROM A Framework for
# Authorship Identification of Online Messages: Writing-Style Features and Classification Techniques

def CountFunctionalWords(text):
    functional_words = """a between in nor some upon
    about both including nothing somebody us
    above but inside of someone used
    after by into off something via
    all can is on such we
    although cos it once than what
    am do its one that whatever
    among down latter onto the when
    an each less opposite their where
    and either like or them whether
    another enough little our these which
    any every lots outside they while
    anybody everybody many over this who
    anyone everyone me own those whoever
    anything everything more past though whom
    are few most per through whose
    around following much plenty till will
    as for must plus to with
    at from my regarding toward within
    be have near same towards without
    because he need several under worth
    before her neither she unless would
    behind him no should unlike yes
    below i nobody since until you
    beside if none so up your
    """

    functional_words = functional_words.split()
    words = RemoveSpecialCHs(text)
    count = 0

    for i in text:
        if i in functional_words:
            count += 1

    return count / len(words)


# ---------------------------------------------------------------------------

# also returns Honore Measure R
#s (Honore´, 1979, quoted in Holmes and Singh, 1996)
def hapaxLegemena(text):
    words = RemoveSpecialCHs(text)
    V1 = 0
    # dictionary comprehension . har word kay against value 0 kardi
    freqs = {key: 0 for key in words}
    for word in words:
        freqs[word] += 1
    for word in freqs:
        if freqs[word] == 1:
            V1 += 1
    N = len(words)
    V = float(len(set(words)))
    R = 100 * math.log(N) / max(1, (1 - (V1 / V)))
    h = V1 / N
    return R, h




# --------------------------------------------------------------------------
# TYPE TOKEN RATIO NO OF DIFFERENT WORDS / NO OF WORDS
def typeTokenRatio(text):
    words = word_tokenize(text)
    return len(set(words)) / len(words)


# ------------------------------------------------------------------------
def RemoveSpecialCHs(text):
    text = word_tokenize(text)
    st = [",", ".", "'", "!", '"', "#", "$", "%", "&", "(", ")", "*", "+", "-", ".", "/", ":", ";", "<", "=", '>', "?",
          "@", "[", "\\", "]", "^", "_", '`', "{", "|", "}", '~', '\t', '\n']

    words = [word for word in text if word not in st]
    return words


# -------------------------------------------------------------------------
# K  10,000 * (M - N) / N**2
# , where M  Sigma i**2 * Vi.
def YulesCharacteristicK(text):
    words = RemoveSpecialCHs(text)
    N = len(words)
    freqs = coll.Counter()
    freqs.update(words)
    vi = coll.Counter()
    vi.update(freqs.values())
    M = sum([(value * value) * vi[value] for key, value in freqs.items()])
    K = 10000 * (M - N) / math.pow(N, 2)
    return K


# -------------------------------------------------------------------------


# -1*sigma(pi*lnpi)
# Shannon and sympsons index are basically diversity indices for any community
def ShannonEntropy(text):
    words = RemoveSpecialCHs(text)
    lenght = len(words)
    freqs = coll.Counter()
    freqs.update(words)
    arr = np.array(list(freqs.values()))
    distribution = 1. * arr
    distribution /= max(1, lenght)
    import scipy as sc
    H = sc.stats.entropy(distribution, base=2)
    # H = sum([(i/lenght)*math.log(i/lenght,math.e) for i in freqs.values()])
    return H


# ------------------------------------------------------------------
# 1 - (sigma(n(n - 1))/N(N-1)
# N is total number of words
# n is the number of each type of word
def SimpsonsIndex(text):
    words = RemoveSpecialCHs(text)
    freqs = coll.Counter()
    freqs.update(words)
    N = len(words)
    n = sum([1.0 * i * (i - 1) for i in freqs.values()])
    D = 1 - (n / (N * (N - 1)))
    return D


# ------------------------------------------------------------------

def FleschReadingEase(text, NoOfsentences ):
    words = RemoveSpecialCHs(text)
    l = float(len(words))
    scount = 0
    for word in words:
        scount += syllable_count(word)

    I = 206.835 - 1.015 * (l / float(NoOfsentences)) - 84.6 * (scount / float(l))
    return I


# -------------------------------------------------------------------
def FleschCincadeGradeLevel(text, NoOfSentences):
    words = RemoveSpecialCHs(text)
    scount = 0
    for word in words:
        scount += syllable_count(word)

    l = len(words)
    F = 0.39 * (l / NoOfSentences) + 11.8 * (scount / float(l)) - 15.59
    return F


# -----------------------------------------------------------------
def dale_chall_readability_formula(text, NoOfSectences):
    words = RemoveSpecialCHs(text)
    difficult = 0
    adjusted = 0
    NoOfWords = len(words)
    with open('dale-chall.pkl', 'rb') as f:
        fimiliarWords = pickle.load(f)
    for word in words:
        if word not in fimiliarWords:
            difficult += 1
    percent = (difficult / NoOfWords) * 100
    if (percent > 5):
        adjusted = 3.6365
    D = 0.1579 * (percent) + 0.0496 * (NoOfWords / NoOfSectences) + adjusted
    return D


# ------------------------------------------------------------------
def GunningFoxIndex(text, NoOfSentences):
    words = RemoveSpecialCHs(text)
    NoOFWords = float(len(words))
    complexWords = 0
    for word in words:
        if (syllable_count(word) > 2):
            complexWords += 1

    G = 0.4 * ((NoOFWords / NoOfSentences) + 100 * (complexWords / NoOFWords))
    return G


def getNumSentences(text):
    return len([s for s in sent_tokenize(text)])


def PrepareData(text1, text2, Winsize):
    chunks1 = slidingWindow(text1, Winsize, Winsize)
    chunks2 = slidingWindow(text2, Winsize, Winsize)
    return " ".join(str(chunk1) + str(chunk2) for chunk1, chunk2 in zip(chunks1, chunks2))


# ------------------------------------------------------------------

# returns a feature vector of text
def FeatureExtration(text):
    # cmu dictionary for syllables
    global cmuDictionary
    cmuDictionary = cmudict.dict()

    chunk = text
    feature = []

    sent_num = getNumSentences(chunk)

    # LEXICAL FEATURES
    meanwl = (Avg_wordLength(chunk))
    feature.append(meanwl)
    
    meansl = (Avg_SentLenghtByCh(chunk))
    feature.append(meansl)
    
    mean = (Avg_SentLenghtByWord(chunk))
    feature.append(mean)
    
    meanSyllable = Avg_Syllable_per_Word(chunk)
    feature.append(meanSyllable)
    
    means = CountSpecialCharacter(chunk)
    feature.append(means)
    
    p = CountPuncuation(chunk)
    feature.append(p)
    
    f = CountFunctionalWords(text)
    feature.append(f)
    
    print("1/2 feature")
    # VOCABULARY RICHNESS FEATURES
    
    TTratio = typeTokenRatio(chunk)
    feature.append(TTratio)
    
    HonoreMeasureR, hapax = hapaxLegemena(chunk)
    feature.append(hapax)
    feature.append(HonoreMeasureR)
    
    YuleK = YulesCharacteristicK(chunk)
    feature.append(YuleK)
    
    S = SimpsonsIndex(chunk)
    feature.append(S)
    
    Shannon = ShannonEntropy(text)
    feature.append(Shannon)

    # READIBILTY FEATURES
    FR = FleschReadingEase(chunk, sent_num)
    feature.append(FR)

    FC = FleschCincadeGradeLevel(chunk, sent_num)
    feature.append(FC)

    # also quite a different
    D = dale_chall_readability_formula(chunk, sent_num)
    feature.append(D)

    # quite a difference
    G = GunningFoxIndex(chunk, sent_num)
    feature.append(G)
    
    print("finish feature")
    #vector.append(feature)

    return feature


# if __name__ == '__main__':

#     # You can try any text file here
#     text = open("my_cd_1.txt").read()

#     vector = FeatureExtration(text)
#     print(vector)

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\bscam\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bscam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk import FreqDist
from urllib import request
from nltk.corpus import stopwords
import random
stop_words=set(stopwords.words("english"))

charles_dickens = ["https://www.gutenberg.org/files/98/98-0.txt", "https://www.gutenberg.org/files/46/46-0.txt", "https://www.gutenberg.org/files/1400/1400-0.txt", "https://www.gutenberg.org/files/730/730-0.txt", "https://www.gutenberg.org/files/766/766-0.txt", "https://www.gutenberg.org/cache/epub/25985/pg25985.txt", "https://www.gutenberg.org/files/676/676-0.txt", "https://www.gutenberg.org/cache/epub/1023/pg1023.txt", "https://www.gutenberg.org/cache/epub/37121/pg37121.txt", "https://www.gutenberg.org/files/42232/42232-0.txt", "https://www.gutenberg.org/cache/epub/41894/pg41894.txt", "https://www.gutenberg.org/cache/epub/1415/pg1415.txt", "https://www.gutenberg.org/cache/epub/1394/pg1394.txt"]

marry_shelly = ["https://www.gutenberg.org/files/84/84-0.txt", "https://www.gutenberg.org/files/18247/18247-0.txt", "https://www.gutenberg.org/cache/epub/15238/pg15238.txt", "https://www.gutenberg.org/cache/epub/6447/pg6447.txt", "https://www.gutenberg.org/files/56665/56665-0.txt", "https://www.gutenberg.org/files/63337/63337-0.txt", "https://www.gutenberg.org/files/63338/63338-0.txt", "https://www.gutenberg.org/files/63339/63339-0.txt", "https://www.gutenberg.org/files/64555/64555-0.txt", "https://www.gutenberg.org/files/64556/64556-0.txt", "https://www.gutenberg.org/files/64557/64557-0.txt", "https://www.gutenberg.org/cache/epub/4695/pg4695.txt"]

austin_jane = ["https://www.gutenberg.org/files/1342/1342-0.txt", "https://www.gutenberg.org/files/158/158-0.txt", "https://www.gutenberg.org/files/161/161-0.txt", "https://www.gutenberg.org/cache/epub/105/pg105.txt", "https://www.gutenberg.org/files/121/121-0.txt", "https://www.gutenberg.org/files/63569/63569-0.txt", "https://www.gutenberg.org/files/141/141-0.txt", "https://www.gutenberg.org/cache/epub/946/pg946.txt", "https://www.gutenberg.org/cache/epub/42078/pg42078.txt", "https://www.gutenberg.org/files/1212/1212-0.txt"]


mark_twain = ["https://www.gutenberg.org/files/142/142-0.txt", "https://www.gutenberg.org/files/76/76-0.txt", "https://www.gutenberg.org/files/76/76-0.txt", "https://www.gutenberg.org/files/3184/3184-0.txt", "https://www.gutenberg.org/files/3179/3179-0.txt", "https://www.gutenberg.org/cache/epub/19987/pg19987.txt",
"https://www.gutenberg.org/files/3187/3187-0.txt", "https://www.gutenberg.org/files/86/86-0.txt", "https://www.gutenberg.org/files/3192/3192-0.txt", "https://www.gutenberg.org/files/3180/3180-0.txt", "https://www.gutenberg.org/files/3178/3178-0.txt", "https://www.gutenberg.org/files/3176/3176-0.txt"]

hg_wells = ["https://www.gutenberg.org/files/59774/59774-0.txt", "https://www.gutenberg.org/files/524/524-0.txt", "https://www.gutenberg.org/cache/epub/19229/pg19229.txt", "https://www.gutenberg.org/files/59769/59769-0.txt", "https://www.gutenberg.org/files/1013/1013-0.txt", "https://www.gutenberg.org/files/456/456-0.txt", "https://www.gutenberg.org/cache/epub/11502/pg11502.txt", "https://www.gutenberg.org/cache/epub/3690/pg3690.txt", "https://www.gutenberg.org/files/1046/1046-0.txt", "https://www.gutenberg.org/files/3797/3797-0.txt", "https://www.gutenberg.org/files/5230/5230-0.txt", "https://www.gutenberg.org/cache/epub/159/pg159.txt", "https://www.gutenberg.org/cache/epub/39162/pg39162.txt", "https://www.gutenberg.org/cache/epub/11640/pg11640.txt", "https://www.gutenberg.org/files/1047/1047-0.txt", "https://www.gutenberg.org/files/60173/60173-0.txt"]


link_to_authors = [charles_dickens, marry_shelly, austin_jane, mark_twain, hg_wells]

link_to_authors = link_to_authors[:2] #if i want to make it small for testing purposes

def GetRawText(url):
    response = request.urlopen(url)
    raw = response.read().decode('utf8', "ignore")
    #remove non book stuff
    start_index = raw.find("***")
    end_of_line = raw.find("\n", start_index)
    return raw[end_of_line : ]


trainX = []
trainY = []

testX = []
testY = []


testX_text = []

In [3]:


#print(GetRawText("https://www.gutenberg.org/files/676/676-0.txt"))
#print(FeatureExtration(GetRawText("https://www.gutenberg.org/files/676/676-0.txt")))



for a, author in enumerate(link_to_authors):
    print(author)
    randomindex = [x for x in range(len(author))]
    random.shuffle(randomindex) 
    author = [author[ri] for ri in randomindex]

    for l, link in enumerate(author):
        print(link)
        if l < len(author) - 2:
            trainY.append(a)
            trainX.append(FeatureExtration(GetRawText(link)))
        else:
            print("in test append")
            testY.append(a)
            testX_text.append(GetRawText(link))
            testX.append(FeatureExtration(testX_text[-1]))


randomindex = [x for x in range(len(trainX))]
random.shuffle(randomindex)

trainX = [trainX[ri] for ri in randomindex]
trainY = [trainY[ri] for ri in randomindex]


['https://www.gutenberg.org/files/98/98-0.txt', 'https://www.gutenberg.org/files/46/46-0.txt', 'https://www.gutenberg.org/files/1400/1400-0.txt', 'https://www.gutenberg.org/files/730/730-0.txt', 'https://www.gutenberg.org/files/766/766-0.txt', 'https://www.gutenberg.org/cache/epub/25985/pg25985.txt', 'https://www.gutenberg.org/files/676/676-0.txt', 'https://www.gutenberg.org/cache/epub/1023/pg1023.txt', 'https://www.gutenberg.org/cache/epub/37121/pg37121.txt', 'https://www.gutenberg.org/files/42232/42232-0.txt', 'https://www.gutenberg.org/cache/epub/41894/pg41894.txt', 'https://www.gutenberg.org/cache/epub/1415/pg1415.txt', 'https://www.gutenberg.org/cache/epub/1394/pg1394.txt']
https://www.gutenberg.org/files/1400/1400-0.txt
1/2 feature
finish feature
https://www.gutenberg.org/cache/epub/37121/pg37121.txt
1/2 feature
finish feature
https://www.gutenberg.org/files/676/676-0.txt
1/2 feature
finish feature
https://www.gutenberg.org/files/730/730-0.txt
1/2 feature
finish feature
https://w

In [4]:
print(len(trainX))
print(len(testX))

21
4


In [5]:
#only run this cell if you wand to save the training and testing data dimensions
all_data = [testX, testY, trainX, trainY, testX_text]

with open("bookdata.json", "w") as f:
    json.dump(all_data,f, indent=2 )

In [6]:
#only run this if you want to load previously saved data
with open("bookdata.json", "r") as f:
    all_data = json.load(f)
#print(all_data)

testX  = all_data[0]
testY = all_data[1]
trainX = all_data[2]
trainY = all_data[3]
testX_text = all_data[4]

In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

clf = make_pipeline(StandardScaler(), GaussianNB())
clf.fit(trainX, trainY)

print(clf.predict(testX))
predy = clf.predict(testX)
print(type(predy))

correct = 0
for i, ans in enumerate(testY):
    if predy[i] == ans:
        correct += 1

print(correct)
print(correct / float(len(testY)))

[0 0 1 1]
<class 'numpy.ndarray'>
4
1.0


In [15]:
print(clf.predict(testX))
print(testY)


[0 0 1 1]
[0, 0, 1, 1]


In [9]:
# from sklearn.ensemble import RandomForestClassifier

# clf = make_pipeline(StandardScaler(), RandomForestClassifier())
# clf.fit(trainX, trainY)

# print(clf.predict(testX))
# predy = clf.predict(testX)
# print(type(predy))

# correct = 0
# for i, ans in enumerate(testY):
#     if predy[i] == ans:
#         correct += 1

# print(correct)
# print(correct / float(len(testY)))

In [13]:
from not_adj_transform import change_sent_not
from clauseswitch_transform import clauseswitch

#add transforms and test
def transformaiton_pipeline(input_text):

    #not transform
    tokens = sent_tokenize(input_text)
    num_tokens = len(tokens)
    transformed = []
    for i,token in enumerate(tokens):
        after_transform = change_sent_not(token)
        transformed.append(clauseswitch(after_transform))
        if i % 200 == 0:
            print("{}/{}".format(i, num_tokens), end=" ")
    
    

    transform1_text = "".join(transformed)

    return transform1_text


transformed_text = []

for txt in testX_text:
    transformed_text.append(transformaiton_pipeline(txt))




0/2912 200/2912 400/2912 600/2912 boy_wonder
800/2912 absurdity
clasp
aeronautical_engineering
1000/2912 announce
1200/2912 bishop
1400/2912 1600/2912 1800/2912 destitution
2000/2912 2200/2912 flit
2400/2912 2600/2912 acolyte
2800/2912 0/765 bring
400/2318 600/2318 aeronautical_engineering
800/2318 advance
high
1200/2318 1400/2318 aeronautical_engineering
1600/2318 1800/2318 feel_for
2000/2318 2200/2318 0/5084 200/5084 profanation
400/5084 ambusher
600/5084 800/5084 1000/5084 1200/5084 1400/5084 attractiveness
1600/5084 aeronautical_engineering
aeronautical_engineering
1800/5084 antagonize
2000/5084 feel
feel_for
2200/5084 2400/5084 alienation
aeronautical_engineering
2600/5084 accompany
2800/5084 3000/5084 3200/5084 acidify
3400/5084 high
3600/5084 aeronautical_engineering
3800/5084 4000/5084 bore
aeronautical_engineering
4200/5084 4400/5084 4600/5084 4800/5084 5000/5084 

In [17]:
transformed_testX = []

for txt in transformed_text:
    transformed_testX.append(FeatureExtration(txt)) 

print(clf.predict(transformed_testX))

1/2 feature
finish feature
1/2 feature
finish feature
1/2 feature
finish feature
1/2 feature
finish feature
[1 1 1 1]
