In [None]:
# last file with Python mining

# importing stuff

% matplotlib inline

import codecs
import re
import copy
import collections

import numpy as np
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from __future__ import division


In [None]:
# The English stopword list used below is an NLTK corpus that has to be
# downloaded separately. Fetch only that corpus instead of every NLTK
# dataset ('all'), which is a very large download.

nltk.download('stopwords')

In [None]:
# Getting the "stopwords" package from NLTK.

from nltk.corpus import stopwords

In [None]:
# Load the full text of both novels (UTF-8 encoded files).

def read_utf8(path):
    """Return the entire content of the UTF-8 text file at ``path``."""
    with codecs.open(path, "r", encoding="utf-8") as handle:
        return handle.read()

jane_eyre = read_utf8("JaneEyre.txt")
wuthering_heights = read_utf8("WutheringHeights.txt")

In [None]:
# Processing data: the stopword list is NLTK's English list plus "would".

esw = stopwords.words('english') + ["would"]

In [None]:
# Tokens are kept only if they consist entirely of word characters.
# Raw string: "\w" in a plain string literal is an invalid escape sequence.
word_pattern = re.compile(r"^\w+$")

In [None]:
# Creating a token counter function.

def get_text_counter(text):
    """Count the stemmed word tokens of ``text``.

    The text is split with NLTK's ``WordPunctTokenizer`` and every token is
    lower-cased. Tokens that do not match the module-level ``word_pattern``
    or that appear in the module-level stopword list ``esw`` are dropped.
    Each remaining token is reduced to its Porter stem.

    Parameters
    ----------
    text : str
        The document to analyse.

    Returns
    -------
    (collections.Counter, int)
        A counter mapping each stem to its number of occurrences, and the
        total number of tokens that were kept.
    """
    stemmer = PorterStemmer()
    # Set membership is O(1); the stopword list is probed once per token.
    stop_words = set(esw)
    lowered = (token.lower() for token in WordPunctTokenizer().tokenize(text))
    # PorterStemmer.stem() works on a single word, so it is applied per
    # token rather than to the whole document. Stopwords are filtered
    # before stemming because the stopword list holds unstemmed words.
    stems = [
        stemmer.stem(token)
        for token in lowered
        if word_pattern.match(token) and token not in stop_words
    ]
    return collections.Counter(stems), len(stems)


In [None]:
# Creating a function to calculate the absolute frequency and relative 
# frequency of the most common words.

def make_df(counter, size):
    """Build a frequency table from ``(word, count)`` pairs.

    Parameters
    ----------
    counter : iterable of (word, count) pairs
        For example the result of ``collections.Counter.most_common(n)``.
    size : number
        Total used as the denominator of the relative frequency.

    Returns
    -------
    pandas.DataFrame
        Indexed by word (index name "Most common words") with the columns
        "Absolute frequency" (integer counts) and "Relative frequency"
        (count / size).
    """
    # Materialise once so the input may be any iterable, not only a list.
    pairs = list(counter)
    index = [word for word, _ in pairs]
    abs_freq = np.array([count for _, count in pairs], dtype=np.int64)
    # float() forces true division regardless of the division mode in effect.
    rel_freq = abs_freq / float(size)
    # Columns are passed separately: stacking them into one ndarray would
    # upcast the integer counts to float.
    df = pd.DataFrame(
        {"Absolute frequency": abs_freq, "Relative frequency": rel_freq},
        index=index,
        columns=["Absolute frequency", "Relative frequency"],
    )
    df.index.name = "Most common words"
    return df

In [None]:
# Analyzing individual texts

# Count the words of Jane Eyre (slow on the full novel), then show the
# ten most frequent ones.

je_counter, je_size = get_text_counter(jane_eyre)
je_top_10 = je_counter.most_common(10)
make_df(je_top_10, je_size)


In [None]:
# Write Jane Eyre's 1000 most frequent words to "JE_1000.csv".

je_top_1000 = je_counter.most_common(1000)
je_df = make_df(je_top_1000, je_size)
je_df.to_csv("JE_1000.csv")


In [None]:
# Count the words of Wuthering Heights (also slow), then show the ten
# most frequent ones.

wh_counter, wh_size = get_text_counter(wuthering_heights)
wh_top_10 = wh_counter.most_common(10)
make_df(wh_top_10, wh_size)

In [None]:
# Write Wuthering Heights' 1000 most frequent words to "WH_1000.csv".

wh_top_1000 = wh_counter.most_common(1000)
wh_df = make_df(wh_top_1000, wh_size)
wh_df.to_csv("WH_1000.csv")

In [None]:
# Now let's compare texts

# Find the most common words across the two documents: add both counters
# together and rank the words by their combined count. The ranking must use
# all_counter, not the counter of a single novel. make_df is called with
# size 1 because only the index (the words) is used afterwards.

all_counter = wh_counter + je_counter
all_df = make_df(all_counter.most_common(1000), 1)
most_common_words = all_df.index.values


In [None]:
# Build a table with each word's relative frequency in both novels and the
# absolute gap between the two, ordered from largest to smallest gap.

dist_columns = ["Jane Eyre relative frequency", "Wuthering Heights relative frequency",
                "Relative frequency difference"]
shares = [
    (je_counter.get(word, 0) / je_size, wh_counter.get(word, 0) / wh_size)
    for word in most_common_words
]
df_data = [[je_share, wh_share, abs(je_share - wh_share)] for je_share, wh_share in shares]
dist_df = pd.DataFrame(data=df_data, index=most_common_words, columns=dist_columns)
dist_df.index.name = "Most common words"
dist_df = dist_df.sort_values("Relative frequency difference", ascending=False)


In [None]:
# Show the ten words whose relative frequencies differ the most.

dist_df.iloc[:10]

In [None]:
# Write the complete table of distinctive words to "bronte.csv".

dist_df.to_csv(path_or_buf="bronte.csv")