In [0]:
import pandas as pd

import collections as coll
import math
import pickle
import string

import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
from nltk.corpus import cmudict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import nltk

nltk.download('cmudict')
nltk.download('stopwords')
nltk.download('punkt')

from pyspark.sql import functions as F
from pyspark.sql.functions import col, udf
import scipy as sc

[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/geyuanyuan1/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/geyuanyuan1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/geyuanyuan1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [0]:
# Create the Spark session used by the rest of this notebook, or reuse one
# that is already running.
spark = (
    SparkSession.builder
    .appName("Blogger_writing_style")
    .getOrCreate()
)

**Import files** 

In [0]:
# Read posts.csv without a header row and let Spark infer the column types.
# The columns get their real names in a later cell.
posts = (
    spark.read
    .option('header', 'false')
    .option('inferSchema', 'true')
    .csv(r'posts.csv')
)

In [0]:
# Column names for the posts table, in the same order as the CSV columns.
colnames = [
    'Post_ID',
    'Title',
    'Blogger_Name',
    'Blogger_ID',
    'Number of comments',
    'Content',
    'URL',
    'Date',
    'Number of retrieved inlinks',
    'Number of retrieved comments',
    'Post Length words',
    'Post Length words no stopwords',
    'Average word length characters',
    'Average word length characters no stopwords',
    'MEIBI score',
    'MEIBIX score',
]

In [0]:
# Rename the positional columns '_c0', '_c1', ... to the entries of `colnames`,
# matched by position. enumerate() replaces the hand-maintained counter, so no
# stray counter or temporary name is left behind in the notebook namespace.
for position, colname in enumerate(colnames):
    posts = posts.withColumnRenamed('_c' + str(position), colname)

In [0]:
# Preview the renamed table: first 20 rows, long cell values truncated
# (the same settings show() uses when called without arguments).
posts.show(n=20, truncate=True)

+-------+--------------------+-----------------+----------+------------------+--------------------+--------------------+----------+---------------------------+----------------------------+-----------------+------------------------------+------------------------------+-------------------------------------------+-----------+------------+
|Post_ID|               Title|     Blogger_Name|Blogger_ID|Number of comments|             Content|                 URL|      Date|Number of retrieved inlinks|Number of retrieved comments|Post Length words|Post Length words no stopwords|Average word length characters|Average word length characters no stopwords|MEIBI score|MEIBIX score|
+-------+--------------------+-----------------+----------+------------------+--------------------+--------------------+----------+---------------------------+----------------------------+-----------------+------------------------------+------------------------------+-------------------------------------------+-----------+

In [0]:
#### functions to clean contents

In [0]:
def RemoveSpecialCHs(text):
    """Tokenize ``text`` and drop tokens that are a lone special character.

    The text is split with ``word_tokenize``; any token that is exactly equal
    to one of the listed punctuation / special / whitespace characters is
    discarded. Tokens that merely contain such a character are kept.

    Returns:
        list: the remaining tokens, in their original order.
    """
    special_tokens = [",", ".", "'", "!", '"', "#", "$", "%", "&", "(", ")", "*", "+", "-", ".", "/", ":", ";", "<", "=", '>', "?",
                      "@", "[", "\\", "]", "^", "_", '`', "{", "|", "}", '~', '\t', '\n']

    kept = []
    for token in word_tokenize(text):
        if token in special_tokens:
            continue
        kept.append(token)
    return kept

In [0]:
def syllable_count_Manual(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Every maximal run of vowels (``a e i o u y``) counts as one syllable. A
    trailing "e" is treated as silent and removes one syllable. Any non-empty
    word is reported as having at least one syllable.

    Args:
        word (str): the word to analyse; case is ignored.

    Returns:
        int: the estimated syllable count, or 0 for an empty string.
    """
    word = word.lower()
    if not word:
        # Nothing to count; indexing word[0] below would raise IndexError.
        return 0

    vowels = "aeiouy"
    count = 1 if word[0] in vowels else 0
    for index in range(1, len(word)):
        # A vowel preceded by a non-vowel starts a new vowel group.
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1

    # The silent-"e" adjustment applies once per word. It used to sit inside
    # the loop, so it cancelled every vowel group of a word ending in "e"
    # (e.g. "release" was reported as 1 syllable instead of 2).
    if word.endswith("e"):
        count -= 1

    return max(count, 1)

In [0]:
# COUNTS NUMBER OF SYLLABLES

def syllable_count(word):
    """Count the syllables of ``word`` using CMUdict, with a heuristic fallback.

    The count is the number of stress-marked phonemes (those ending in a
    digit) in the first CMUdict pronunciation of the lower-cased word. Words
    that are not in the dictionary are handed to ``syllable_count_Manual``.

    The pronouncing dictionary is kept in the module-level ``cmuDictionary``.
    Nothing else in this notebook assigns that name, so previously the lookup
    always raised NameError, which the bare ``except`` swallowed, and every
    word silently went through the manual heuristic. It is now loaded on
    first use.

    Args:
        word (str): the word to look up.

    Returns:
        int: number of syllables.
    """
    global cmuDictionary
    if globals().get('cmuDictionary') is None:
        try:
            cmuDictionary = cmudict.dict()
        except LookupError:
            # The cmudict corpus is not installed on this machine/worker:
            # keep the old best-effort behaviour and use the heuristic only.
            cmuDictionary = {}

    try:
        first_pronunciation = cmuDictionary[word.lower()][0]
    except (KeyError, IndexError):
        return syllable_count_Manual(word)
    return sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())

#### functions to find  **Lexical Features:**
* Average Word Length
* Average Sentence Length By Word
* Average Sentence Length By Character
* Special Character Count
* Average Syllable per Word
* Functional Words Count
* Punctuation Count


These are the most basic features one can extract from the text. They describe its surface structure: for example, averages of different counts such as word lengths, special characters, punctuation marks, and functional words. Functional words are used to express grammatical relationships among other words within a sentence. Secondly, a word with more syllables is more likely to be a difficult word (although not necessarily). Average syllables per word, being a measure of complexity, is used in the calculation of many other features related to the readability scores described in the sections ahead. Punctuation Count and Special Character Count are straightforward ways to differentiate genres, for example a narrative story and a research paper.

In [0]:
# removing stop words plus punctuation.
def Avg_wordLength(text):
    """Average token length of ``text``, ignoring stop words and punctuation.

    The text is split with ``word_tokenize``. Tokens that are NLTK English
    stop words, or that are exactly one of the listed punctuation / special
    characters, are discarded before averaging.

    Args:
        text (str): the post content.

    Returns:
        float: mean number of characters per remaining token, or 0 when no
        token remains or the text cannot be processed (e.g. it is None).
    """
    try:
        # The earlier `text.translate(string.punctuation)` call was removed:
        # its result was discarded, so it never changed the text.
        tokens = word_tokenize(text, language='english')
        st = [",", ".", "'", "!", '"', "#", "$", "%", "&", "(", ")", "*", "+", "-", ".", "/", ":", ";", "<", "=", '>', "?",
              "@", "[", "\\", "]", "^", "_", '`', "{", "|", "}", '~', '\t', '\n']
        # A set makes the per-token membership test constant time.
        excluded = set(stopwords.words('english') + st)
        words = [word for word in tokens if word not in excluded]
        if not words:
            # np.average([]) would return NaN (with a warning) instead of 0.
            return 0
        return float(np.average([len(word) for word in words]))
    except Exception:
        # Best effort: unusable input yields the neutral value 0.
        return 0

In [0]:
# Add column 'meanwl': Avg_wordLength applied to each post's Content.
find_feature = udf(lambda content: Avg_wordLength(content))
posts = posts.withColumn('meanwl', find_feature(col('Content')))

In [0]:
# returns avg number of words in a sentence
def Avg_SentLenghtByWord(text):
    """Mean number of whitespace-separated words per sentence of ``text``.

    Sentences come from ``sent_tokenize``. Returns 0 if anything raises.
    """
    try:
        words_per_sentence = []
        for sentence in sent_tokenize(text):
            words_per_sentence.append(len(sentence.split()))
        return float(np.average(words_per_sentence))
    except:
        return 0

In [0]:
# Add column 'mean': Avg_SentLenghtByWord applied to each post's Content.
find_feature = udf(lambda content: Avg_SentLenghtByWord(content))
posts = posts.withColumn('mean', find_feature(col('Content')))

In [0]:
# returns avg number of characters in a sentence
def Avg_SentLenghtByCh(text):
    """Mean number of characters per sentence of ``text``.

    Sentences come from ``sent_tokenize``. Returns 0 if anything raises.
    """
    try:
        sentences = sent_tokenize(text)
        chars_per_sentence = list(map(len, sentences))
        return float(np.average(chars_per_sentence))
    except:
        return 0

In [0]:
# Add column 'meansl': Avg_SentLenghtByCh applied to each post's Content.
find_feature = udf(lambda content: Avg_SentLenghtByCh(content))
posts = posts.withColumn('meansl', find_feature(col('Content')))

In [0]:
# COUNTS SPECIAL CHARACTERS NORMALIZED OVER LENGTH OF CHUNK
def CountSpecialCharacter(text):
    """Fraction of the characters of ``text`` that are special characters.

    A character counts when it is one of the listed symbols, a tab or a
    newline. Returns 0 if anything raises (for example empty or None input).
    """
    try:
        special = ["#", "$", "%", "&", "(", ")", "*", "+", "-", "/", "<", "=", '>',
                   "@", "[", "\\", "]", "^", "_", '`', "{", "|", "}", '~', '\t', '\n']
        hits = sum(1 for ch in text if ch in special)
        return float(hits / len(text))
    except:
        return 0

In [0]:
# Add column 'means': CountSpecialCharacter applied to each post's Content.
find_feature = udf(lambda content: CountSpecialCharacter(content))
posts = posts.withColumn('means', find_feature(col('Content')))

In [0]:
# GIVES NUMBER OF SYLLABLES PER WORD
def Avg_Syllable_per_Word(text):
    """Average number of syllables per token, ignoring stop words and punctuation.

    Tokens come from ``word_tokenize``. NLTK English stop words and tokens
    that are exactly one of the listed punctuation / special characters are
    discarded. Syllables are counted with ``syllable_count``.

    Args:
        text (str): the post content.

    Returns:
        float: total syllables divided by the number of remaining tokens
        (0.0 when none remain), or 0 if the text cannot be processed.
    """
    try:
        tokens = word_tokenize(text, language='english')
        st = [",", ".", "'", "!", '"', "#", "$", "%", "&", "(", ")", "*", "+", "-", ".", "/", ":", ";", "<", "=", '>', "?",
              "@", "[", "\\", "]", "^", "_", '`', "{", "|", "}", '~', '\t', '\n']
        # A set makes the per-token membership test constant time.
        excluded = set(stopwords.words('english') + st)
        words = [word for word in tokens if word not in excluded]
        total_syllables = sum(syllable_count(word) for word in words)
        # max(1, ...) avoids dividing by zero when every token was filtered out.
        return float(total_syllables / max(1, len(words)))
    except Exception:
        # Best effort: unusable input yields the neutral value 0.
        return 0

In [0]:
# Add column 'meanSyllable': Avg_Syllable_per_Word applied to each post's Content.
find_feature = udf(lambda content: Avg_Syllable_per_Word(content))
posts = posts.withColumn('meanSyllable', find_feature(col('Content')))

In [0]:
# RETURNS NORMALIZED COUNT OF FUNCTIONAL WORDS FROM A Framework for
# Authorship Identification of Online Messages: Writing-Style Features and Classification Techniques

def CountFunctionalWords(text):
    """Fraction of the tokens of ``text`` that are functional words.

    The text is tokenized with ``RemoveSpecialCHs``. Each token is
    lower-cased and looked up in the fixed list of functional words below.

    The loop used to iterate over ``text`` itself, i.e. over single
    characters, so only the characters "a" and "i" could ever match. It now
    iterates over the tokens. Tokens are lower-cased because every entry of
    the list is lower-case.

    Args:
        text (str): the post content.

    Returns:
        float: functional-word tokens divided by all tokens, or 0 when there
        are no tokens or the text cannot be processed.
    """
    try:
        functional_words = """a between in nor some upon
        about both including nothing somebody us
        above but inside of someone used
        after by into off something via
        all can is on such we
        although cos it once than what
        am do its one that whatever
        among down latter onto the when
        an each less opposite their where
        and either like or them whether
        another enough little our these which
        any every lots outside they while
        anybody everybody many over this who
        anyone everyone me own those whoever
        anything everything more past though whom
        are few most per through whose
        around following much plenty till will
        as for must plus to with
        at from my regarding toward within
        be have near same towards without
        because he need several under worth
        before her neither she unless would
        behind him no should unlike yes
        below i nobody since until you
        beside if none so up your
        """

        functional_words = set(functional_words.split())
        words = RemoveSpecialCHs(text)
        if not words:
            return 0

        count = sum(1 for word in words if word.lower() in functional_words)
        return float(count / len(words))
    except Exception:
        # Best effort: unusable input yields the neutral value 0.
        return 0

In [0]:
# Add column 'f': CountFunctionalWords applied to each post's Content.
find_feature = udf(lambda content: CountFunctionalWords(content))
posts = posts.withColumn('f', find_feature(col('Content')))

In [0]:
def CountPuncuation(text):
    """Fraction of the characters of ``text`` that are punctuation marks.

    The marks counted are , . ' ! " ; ? and :. Returns 0 if anything raises
    (for example empty or None input).
    """
    try:
        marks = [",", ".", "'", "!", '"', ";", "?", ":", ";"]
        hits = 0
        for ch in text:
            hits += 1 if ch in marks else 0
        return float(hits) / float(len(text))
    except:
        return 0
In [0]:
# Add column 'p': CountPuncuation applied to each post's Content.
find_feature = udf(lambda content: CountPuncuation(content))
posts = posts.withColumn('p', find_feature(col('Content')))

#### functions to find  **Vocabulary Richness Features:**
Many quantitative studies rely on the concept of vocabulary richness. A text has low vocabulary richness if the same limited vocabulary is repeated over and over again, while it has high vocabulary richness if new words continually appear. In essence, these features tell us about the diversity and richness of the vocabulary used in the text.

* Hapax Legomena **V**
* Hapax Dislegomena (Sichel's Measure) **V**
* Honoré's R Measure **V**
* Brunet's Measure W **V**
* Yule's Characteristic K **V**
* Shannon Entropy **V** (also a readability feature)
* Simpson's Index **V** (also a readability feature)

In [0]:
# Hapax Legomenon
# TYPE TOKEN RATIO NO OF DIFFERENT WORDS / NO OF WORDS
def typeTokenRatio(text):
    """Type-token ratio: distinct tokens divided by all tokens of ``text``.

    Tokens come from ``word_tokenize``. Returns 0 if anything raises
    (for example when the text has no tokens).
    """
    try:
        tokens = word_tokenize(text)
        distinct = set(tokens)
        return float(len(distinct) / len(tokens))
    except:
        return 0

In [0]:
# Add column 'TTratio': typeTokenRatio applied to each post's Content.
find_feature = udf(lambda content: typeTokenRatio(content))
posts = posts.withColumn('TTratio', find_feature(col('Content')))

In [0]:
# Hapax DisLegemena (Sichel’s Measure)
def hapaxDisLegemena(text):
    """Hapax dislegomena ratios of ``text``.

    Counts the distinct tokens (from ``RemoveSpecialCHs``) that occur exactly
    twice and relates that count to the vocabulary and to the text length.

    Returns:
        list: ``[S, h]`` where ``S`` is the count divided by the number of
        distinct tokens and ``h`` is the count divided by the number of
        tokens; ``[0,0]`` if anything raises.
    """
    try:
        tokens = RemoveSpecialCHs(text)
        # Counter maps each distinct token to its number of occurrences.
        occurrences = coll.Counter(tokens)
        twice = sum(1 for times in occurrences.values() if times == 2)

        per_token = float(twice / float(len(tokens)))
        per_type = float(twice / float(len(set(tokens))))
        return [per_type, per_token]
    except:
        return [0,0]

In [0]:
# Add column 'SichelesMeasureS': first element of hapaxDisLegemena's result.
find_feature = udf(lambda content: hapaxDisLegemena(content)[0])
posts = posts.withColumn('SichelesMeasureS', find_feature(col('Content')))

In [0]:
# Add column 'dihapax': second element of hapaxDisLegemena's result.
find_feature = udf(lambda content: hapaxDisLegemena(content)[1])
posts = posts.withColumn('dihapax', find_feature(col('Content')))

In [0]:
# Honore Measure R
# return Honore Measure R
def hapaxLegemena(text):
    """Honoré's R and the hapax legomena ratio of ``text``.

    With N tokens (from ``RemoveSpecialCHs``), V distinct tokens and V1
    tokens that occur exactly once:

        R = 100 * log(N) / (1 - V1 / V)
        h = V1 / N

    The denominator used to be ``max(1, 1 - V1 / V)``. Since ``1 - V1 / V``
    never exceeds 1, that expression was always 1 and R collapsed to
    ``100 * log(N)``. The denominator is now replaced by 1 only in the
    degenerate case where every distinct token occurs once (V1 == V), where
    it would otherwise be zero.

    Args:
        text (str): the post content.

    Returns:
        list: ``[R, h]``, or ``[0, 0]`` when there are no tokens or the text
        cannot be processed.
    """
    try:
        words = RemoveSpecialCHs(text)
        # Map every distinct word to its number of occurrences.
        freqs = coll.Counter(words)
        V1 = sum(1 for frequency in freqs.values() if frequency == 1)
        N = len(words)
        V = float(len(freqs))
        denominator = 1 - (V1 / V)
        if denominator <= 0:
            denominator = 1
        R = 100 * math.log(N) / denominator
        h = V1 / N
        return [R, h]
    except Exception:
        # Best effort: unusable input yields the neutral value [0, 0].
        return [0, 0]

In [0]:
# Add column 'HonoreMeasureR': first element of hapaxLegemena's result.
find_feature = udf(lambda content: hapaxLegemena(content)[0])
posts = posts.withColumn('HonoreMeasureR', find_feature(col('Content')))

In [0]:
# Add column 'hapax': second element of hapaxLegemena's result.
find_feature = udf(lambda content: hapaxLegemena(content)[1])
posts = posts.withColumn('hapax', find_feature(col('Content')))

In [0]:
# Brunet's Measure W
# W = N ** (V ** -a)   =>   log(W) = V ** (-a) * log(N)
# N = total words, V = vocabulary size (unique words), a = 0.17
# The logarithm is returned instead of W itself: it is monotone in W, which is
# sufficient because the value is only used to compare texts.
def BrunetsMeasureW(text):
    """Return the natural logarithm of Brunet's W for ``text``.

    Tokens come from ``RemoveSpecialCHs``. The earlier implementation
    evaluated ``(V - a) / log(N)``, which is not a monotone transform of W;
    it now evaluates ``V ** (-a) * log(N)``.

    Args:
        text (str): the post content.

    Returns:
        float: ``log(W)``, or 0 when there are no tokens or the text cannot
        be processed.
    """
    try:
        words = RemoveSpecialCHs(text)
        a = 0.17
        V = float(len(set(words)))
        N = len(words)
        # With no tokens, V is 0.0 and the negative power raises
        # ZeroDivisionError, which falls through to the fallback below.
        B = (V ** -a) * math.log(N)
        return float(B)
    except Exception:
        # Best effort: unusable input yields the neutral value 0.
        return 0

In [0]:
# Add column 'B': BrunetsMeasureW applied to each post's Content.
find_feature = udf(lambda content: BrunetsMeasureW(content))
posts = posts.withColumn('B', find_feature(col('Content')))

In [0]:
# K = 10,000 * (M - N) / N**2
# where M = sum over i of i**2 * V_i, and V_i is the number of distinct words
# that occur exactly i times.
def YulesCharacteristicK(text):
    """Yule's characteristic K of ``text``.

    Tokens come from ``RemoveSpecialCHs``. M used to be accumulated once per
    distinct word while also being multiplied by V_i, which counted every
    frequency class V_i times over. It is now accumulated once per distinct
    frequency.

    Args:
        text (str): the post content.

    Returns:
        float: K, or 0 when there are no tokens or the text cannot be
        processed.
    """
    try:
        words = RemoveSpecialCHs(text)
        N = len(words)
        word_freqs = coll.Counter(words)
        # vi[i] = number of distinct words that occur exactly i times.
        vi = coll.Counter(word_freqs.values())
        M = sum(i * i * v_i for i, v_i in vi.items())
        K = 10000 * (M - N) / math.pow(N, 2)
        return float(K)
    except Exception:
        # Best effort: unusable input (including N == 0) yields 0.
        return 0

In [0]:
# Add column 'YuleK': YulesCharacteristicK applied to each post's Content.
find_feature = udf(lambda content: YulesCharacteristicK(content))
posts = posts.withColumn('YuleK', find_feature(col('Content')))

#### functions to find  **Readability Scores:**
Readability is the ease with which a reader can understand a written text. Readability is more than simply legibility, which is a measure of how easily a reader can distinguish individual letters or characters from each other. Features for readability stem from the field of linguistics, and researchers have frequently used linguistic laws (e.g., Zipf's law) and lemmas to derive the features currently used to calculate readability scores of text in modern computer science. The following is the list of features we are using.

* Flesch Reading Ease **V**
* Flesch-Kincaid Grade Level **V**
* Gunning Fog Index  **V**
* Dale Chall Readability Formula  **V**
* Shannon Entropy (also a vocabulary richness feature) **V**
* Simpson's Index (also a vocabulary richness feature) **V**

In [0]:
def FleschReadingEase(text):
    """Flesch Reading Ease score of ``text``.

        206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)

    Sentences come from ``sent_tokenize``, words from ``RemoveSpecialCHs``
    and syllables from ``syllable_count``.

    The sentence count used to be stored under one name and read under a
    differently spelled one. The resulting NameError was swallowed by the
    bare ``except``, so the function returned 0 for every text.

    Args:
        text (str): the post content.

    Returns:
        float: the score, or 0 when the text has no sentences or words or
        cannot be processed.
    """
    try:
        sentence_count = len(sent_tokenize(text))
        words = RemoveSpecialCHs(text)
        word_count = float(len(words))
        syllable_total = sum(syllable_count(word) for word in words)

        score = (206.835
                 - 1.015 * (word_count / float(sentence_count))
                 - 84.6 * (syllable_total / word_count))
        return float(score)
    except Exception:
        # Best effort: unusable input (e.g. empty text) yields 0.
        return 0

In [0]:
# Add column 'FR': FleschReadingEase applied to each post's Content.
find_feature = udf(lambda content: FleschReadingEase(content))
posts = posts.withColumn('FR', find_feature(col('Content')))

In [0]:
def FleschCincadeGradeLevel(text):
    """Flesch-Kincaid Grade Level of ``text``.

        0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59

    Sentences come from ``sent_tokenize``, words from ``RemoveSpecialCHs``
    and syllables from ``syllable_count``.

    The sentence count used to be stored under one name and read under a
    differently spelled one. The resulting NameError was swallowed by the
    bare ``except``, so the function returned 0 for every text.

    Args:
        text (str): the post content.

    Returns:
        float: the grade level, or 0 when the text has no sentences or words
        or cannot be processed.
    """
    try:
        sentence_count = len(sent_tokenize(text))
        words = RemoveSpecialCHs(text)
        syllable_total = sum(syllable_count(word) for word in words)

        word_count = len(words)
        grade = (0.39 * (word_count / sentence_count)
                 + 11.8 * (syllable_total / float(word_count))
                 - 15.59)
        return float(grade)
    except Exception:
        # Best effort: unusable input (e.g. empty text) yields 0.
        return 0

In [0]:
# Add column 'FC': FleschCincadeGradeLevel applied to each post's Content.
find_feature = udf(lambda content: FleschCincadeGradeLevel(content))
posts = posts.withColumn('FC', find_feature(col('Content')))

In [0]:
def GunningFoxIndex(text):
    """Gunning Fog index of ``text``.

        0.4 * ((words / sentences) + 100 * (complex words / words))

    A word is complex when ``syllable_count`` reports more than two
    syllables. Sentences come from ``sent_tokenize`` and words from
    ``RemoveSpecialCHs``.

    The sentence count used to be stored under one name and read under a
    differently spelled one. The resulting NameError was swallowed by the
    bare ``except``, so the function returned 0 for every text.

    Args:
        text (str): the post content.

    Returns:
        float: the index, or 0 when the text has no sentences or words or
        cannot be processed.
    """
    try:
        sentence_count = len(sent_tokenize(text))
        words = RemoveSpecialCHs(text)
        word_count = float(len(words))
        complex_words = sum(1 for word in words if syllable_count(word) > 2)

        index = 0.4 * ((word_count / sentence_count) + 100 * (complex_words / word_count))
        return float(index)
    except Exception:
        # Best effort: unusable input (e.g. empty text) yields 0.
        return 0

In [0]:
# Add column 'G': GunningFoxIndex applied to each post's Content.
find_feature = udf(lambda content: GunningFoxIndex(content))
posts = posts.withColumn('G', find_feature(col('Content')))

In [0]:
# Cache for the familiar-word collection so 'dale-chall.pkl' is read once per
# Python process instead of once per scored text.
_DALE_CHALL_FAMILIAR_WORDS = None


def _load_dale_chall_words():
    """Return the familiar-word collection from 'dale-chall.pkl', loading it once."""
    global _DALE_CHALL_FAMILIAR_WORDS
    if _DALE_CHALL_FAMILIAR_WORDS is None:
        # NOTE(security): pickle.load can execute arbitrary code. Only use a
        # dale-chall.pkl that comes from a trusted source.
        with open('dale-chall.pkl', 'rb') as f:
            _DALE_CHALL_FAMILIAR_WORDS = pickle.load(f)
    return _DALE_CHALL_FAMILIAR_WORDS


def dale_chall_readability_formula(text):
    """Dale-Chall readability score of ``text``.

        0.1579 * (percent difficult words) + 0.0496 * (words / sentences)

    plus 3.6365 when more than 5 percent of the words are difficult. A word
    is difficult when it is not in the collection stored in 'dale-chall.pkl'.
    Sentences come from ``sent_tokenize`` and words from ``RemoveSpecialCHs``.

    Args:
        text (str): the post content.

    Returns:
        float: the score, or 0 when the text has no sentences or words, the
        word file cannot be read, or the text cannot be processed.
    """
    try:
        NoOfSectences = len(sent_tokenize(text))
        words = RemoveSpecialCHs(text)
        NoOfWords = len(words)
        fimiliarWords = _load_dale_chall_words()
        difficult = sum(1 for word in words if word not in fimiliarWords)
        percent = (difficult / NoOfWords) * 100
        adjusted = 3.6365 if percent > 5 else 0
        D = 0.1579 * (percent) + 0.0496 * (NoOfWords / NoOfSectences) + adjusted
        return float(D)
    except Exception:
        # Best effort: unusable input or a missing word file yields 0.
        return 0

In [0]:
# Add column 'D': dale_chall_readability_formula applied to each post's Content.
find_feature = udf(lambda content: dale_chall_readability_formula(content))
posts = posts.withColumn('D', find_feature(col('Content')))

In [0]:
# -1*sigma(pi*lnpi)
# Shannon and sympsons index are basically diversity indices for any community
def ShannonEntropy(text):
    """Shannon entropy (base 2) of the token frequency distribution of ``text``.

    Tokens come from ``RemoveSpecialCHs``. Each distinct token's count is
    divided by the total number of tokens and the resulting distribution is
    passed to ``scipy.stats.entropy``.

    Args:
        text (str): the post content.

    Returns:
        float: the entropy in bits, or 0 when there are no tokens or the text
        cannot be processed.
    """
    try:
        # Import the submodule explicitly. `import scipy as sc` followed by
        # `sc.stats` only works if something else has already imported
        # scipy.stats in this process; otherwise the AttributeError was
        # swallowed and the function returned 0.
        from scipy import stats

        words = RemoveSpecialCHs(text)
        if not words:
            return 0
        lenght = len(words)
        freqs = coll.Counter(words)
        distribution = np.array(list(freqs.values()), dtype=float) / lenght
        H = stats.entropy(distribution, base=2)
        return float(H)
    except Exception:
        # Best effort: unusable input yields the neutral value 0.
        return 0

In [0]:
# Add column 'Shannon': ShannonEntropy applied to each post's Content.
find_feature = udf(lambda content: ShannonEntropy(content))
posts = posts.withColumn('Shannon', find_feature(col('Content')))

In [0]:
# 1 - (sigma(n(n - 1))/N(N-1)
# N is total number of words
# n is the number of each type of word
def SimpsonsIndex(text):
    """Simpson's index of the tokens of ``text``.

    With N tokens (from ``RemoveSpecialCHs``) and n occurrences of each
    distinct token, computes ``1 - sum(n * (n - 1)) / (N * (N - 1))``.
    Returns 0 if anything raises (for example with fewer than two tokens).
    """
    try:
        tokens = RemoveSpecialCHs(text)
        counts = coll.Counter(tokens)
        total = len(tokens)
        pair_sum = 0
        for occurrences in counts.values():
            pair_sum += 1.0 * occurrences * (occurrences - 1)
        index = 1 - (pair_sum / (total * (total - 1)))
        return float(index)
    except:
        return 0

In [0]:
# Add column 'S': SimpsonsIndex applied to each post's Content.
find_feature = udf(lambda content: SimpsonsIndex(content))
posts = posts.withColumn('S', find_feature(col('Content')))

In [0]:
# Print (number of rows, number of columns) of the posts table.
print(
    (
        posts.count(),
        len(posts.columns),
    )
)

(19464, 36)


In [0]:
# Convert the posts table to pandas and write it to CSV without the index column.
posts.toPandas().to_csv(
    'posts_match_writing_style.csv',
    index=False,
)