In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

First comes the libraries we need to import for the data. The Brown corpus is available through NLTK.

In [2]:
import nltk
from nltk.corpus import brown
nltk.download('brown')
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package brown to /usr/share/nltk_data...
[nltk_data]   Package brown is already up-to-date!


The Fictometer algorithm is essentially based on Parts-Of-Speech (POS) tagging, which is a fundamental aspect of NLP. There are various ways in which POS tagging can be done for a given text, but broadly speaking we can either have universal POS tags (noun, adjective, adverb, pronoun, etc) or finer tags which differentiate between various types of nouns, adjectives, etc. For our task, the UPOS tags are good enough and so next we write a function to count the number of various UPOS tags in a given text.

In [3]:
def n_adj(text):
    adj=0
    for i in text:
        if i[0] == 'J':
            adj=adj+1
    return adj

def n_noun(text):
    noun=0
    for i in text:
        if ((i[0] == 'N') and (i[1] != 'C')):
            noun=noun+1
    return noun

def n_verb(text):
    verb=0
    for i in text:
        if i[0] == 'V':
            verb=verb+1
    return verb

def n_pronoun(text):
    pronoun=0
    for i in text:
        if (i[0] == 'P') or (i[:3] in ['WP$','WPO','WPS']):
            pronoun=pronoun+1
    return pronoun

def n_adv(text):
    adv=0
    for i in text:
        if (i[0] == 'R') or (i[:3] in ['WRB']):
            adv=adv+1
    return adv

def func_utag(tag):
    if tag[0] == 'J' or tag == 'ADJ':
        utag='ADJ'
    elif ((tag[0] == 'N') and (tag[1] != 'C')) or tag == 'NOUN':
        utag='NOUN'
    elif tag[0] == 'V' or tag == 'VERB':
        utag='VERB'
    elif (tag[0] == 'P') or (tag[:3] in ['WP$','WPO','WPS']) or tag == 'PRON':
        utag='PRON'
    elif (tag[0] == 'R') or (tag[:3] in ['WRB']) or tag == 'ADV':
        utag='ADV'
    else:
        utag='unknown'
    return utag

def func_is5tag(tag):
    if tag in ['ADJ','ADV','NOUN','PRON','VERB']:
        is5tag=True
    else:
        is5tag=False
    return is5tag

Next we start reading text from the NLTK Brown corpus and create a DataFrame which contains information about the number of different POS tags for each text in the corpus.

In [4]:
brownpostable=pd.DataFrame(columns=['category','filename','ADJ','ADV','NOUN','VERB','PRON','RADJPRON','RADVADJ'])

for i in brown.categories():
    for j in brown.fileids(categories=i):
        taggedwords=brown.tagged_words(j)
        taglist=[]
        for k in taggedwords:
            taglist.append(k[1])
        adj=n_adj(taglist)
        adv=n_adv(taglist)
        noun=n_noun(taglist)
        verb=n_verb(taglist)
        pronoun=n_pronoun(taglist)
        brownpostable=brownpostable.append({'category' : i,'filename' : j, 'ADJ' : int(adj), 'ADV' : int(adv), 'NOUN' : int(noun), 'VERB' : int(verb), 'PRON' : int(pronoun)},ignore_index=True)


Once we have all the UPOS tag information in our DataFrame, we need to calculate the two ratios that are our model features : adjective/pronoun and adverb/adjective.

In [5]:
for i in range(len(brownpostable)):
    adj=brownpostable.ADJ.iloc[i]
    adv=brownpostable.ADV.iloc[i]
    pronoun=brownpostable.PRON.iloc[i]
    brownpostable.RADJPRON.iloc[i]=adj/pronoun
    brownpostable.RADVADJ.iloc[i]=adv/adj

The Brown corpus has several sub-categories and so we need to identify each of them as “fiction” or “non-fiction” depending on its contents.



In [6]:
brown2=brownpostable.copy()
for i in ['news','reviews','government','learned','hobbies']:
    brown2=brown2.replace(to_replace=i,value='nonfiction')

for i in ['fiction','mystery','science_fiction','adventure','romance']:
    brown2=brown2.replace(to_replace=i,value='fiction')
    
index_names=brown2[(brown2['category'] != 'fiction') & (brown2['category'] != 'nonfiction')].index
brown2.drop(index_names,inplace=True)

brown3=brown2.replace(to_replace='nonfiction',value='0')
brown3=brown3.replace(to_replace='fiction',value='1')

Training and Testing using Logistic Regression

Great! So our data is ready for being trained and tested using any ML algorithm. We choose Logistic Regression for its relative simplicity. And it works amazingly well!

In [7]:
from sklearn import preprocessing
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

Next we drop all the unnecessary columns from our DataFrame, extract the input and output values, split them into a training and testing set and fit the Logistic Regression model using the training data.



In [8]:
x=brown3.drop(columns=['category','filename','PRON','ADJ','ADV','NOUN','VERB'])
y=brown3.category

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
logreg = LogisticRegression(solver='lbfgs')
logreg.fit(x_train,y_train)

Finally its time to see the results of our hard work!

In [9]:
y_pred=logreg.predict(x_train)
accuracy = metrics.accuracy_score(y_train,y_pred)
print("Training Accuracy : ",accuracy)

y_pred=logreg.predict(x_test)
accuracy = metrics.accuracy_score(y_test,y_pred)
print("Testing Accuracy : ", accuracy)

print("Confusion Matrix : \n",confusion_matrix(y_test,y_pred))

Training Accuracy :  0.9691119691119691
Testing Accuracy :  0.9692307692307692
Confusion Matrix : 
 [[31  1]
 [ 1 32]]
