In [42]:
# key libs
import numpy as np
import re
import nltk
import pandas as pd
import glob
import codecs

# nlp libs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 

# processing
from sklearn.model_selection import train_test_split

# textblob wrapper
from textblob import TextBlob
from nltk.tokenize import sent_tokenize


In [43]:
# Seed value kept in one place for any step that needs a fixed random state.
RSEED = 0

In [44]:
# Read the train/test file lists and discard the 'Unnamed: 0' column from each.
train_list = pd.read_csv('../data/train_file_list.csv').drop(columns='Unnamed: 0')
test_list = pd.read_csv('../data/test_file_list.csv').drop(columns='Unnamed: 0')

In [45]:
# Preview the first rows of the test file list (use train_list.head() for the training list).
test_list.head()

Unnamed: 0,index,file_extension,title,author
0,480,../data/gutenberg/George Bernard Shaw___Great ...,Great Catherine,George Bernard Shaw
1,2319,../data/gutenberg/William Wymark Jacobs___Shor...,Short Cruises,William Wymark Jacobs
2,2491,../data/gutenberg/Edward Stratemeyer___Richard...,Richard Dare's Venture,Edward Stratemeyer
3,1191,../data/gutenberg/Thomas Henry Huxley___Willia...,William Harvey And The Discovery Of The Circul...,Thomas Henry Huxley
4,572,../data/gutenberg/John Galsworthy___Beyond.txt,Beyond,John Galsworthy


In [46]:
# (rows, columns) of the test file list.
test_list.shape

(607, 4)

In [47]:
# Plain Python lists of the book file paths, in the same order as the file lists.
train_files = train_list['file_extension'].tolist()
test_files = test_list['file_extension'].tolist()

### 1. SENTIMENT ANALYSIS ON TRAIN
Score each training book with TextBlob's pre-trained sentiment analyzer, used here as a black box.

In [48]:
# Per-book features from TextBlob: word count, sentence count, average words
# per sentence, polarity and subjectivity. One row is appended per training book.
sentiment_train = []
for file_link in train_files:
    # Read inside a context manager so each handle is closed as soon as the
    # text is loaded, instead of leaving one open file per book.
    with open(file_link) as book_file:
        book = TextBlob(book_file.read())
    word_count = len(book.words)
    sentence_count = len(book.sentences)
    # A file with no sentences would divide by zero and abort the whole run;
    # record NaN for its average sentence length instead.
    avg_len = word_count / sentence_count if sentence_count else float('nan')
    # book.sentiment is a (polarity, subjectivity) pair.
    polarity, subjectivity = book.sentiment
    sentiment_train.append([word_count, sentence_count, avg_len, polarity, subjectivity])
    print(file_link, '--done--')

../data/gutenberg/Stephen Leacock___Behind the Beyond.txt --done--
../data/gutenberg/Jerome Klapka Jerome___Tommy and Co.txt --done--
../data/gutenberg/Stephen Leacock___Winsome Winnie and other New Nonsense Novels.txt --done--
../data/gutenberg/Hamlin Garland___The Moccasin Ranch.txt --done--
../data/gutenberg/Charles Dickens___Three Ghost Stories.txt --done--
../data/gutenberg/P G Wodehouse___The Man with Two Left Feet.txt --done--
../data/gutenberg/Herbert Spencer___Essays: Scientific, Political, & Speculative, Volume I.txt --done--
../data/gutenberg/William Dean Howells___The Lady of the Aroostook.txt --done--


KeyboardInterrupt: 

In [None]:
# Turn the collected feature rows into a table, then prepend the identifying
# columns from train_list. Inserting at positions 0, 1, 2 gives the column
# order book_title, author_name, book_location, followed by the features.
train_feature_names = ['word_count', 'sentence_count', 'sentence_length', 'polarity', 'subjectivity']
df_train = pd.DataFrame(sentiment_train, columns=train_feature_names)
train_id_columns = [
    ('book_title', 'title'),
    ('author_name', 'author'),
    ('book_location', 'file_extension'),
]
for position, (new_name, source_name) in enumerate(train_id_columns):
    df_train.insert(loc=position, column=new_name, value=train_list[source_name])

In [None]:
# Preview the first rows of the training feature table.
df_train.head()

In [None]:
# Persist the training features; the DataFrame index is written as the first CSV column.
df_train.to_csv('../data/sentiment_train.csv')

### 2. SENTIMENT ANALYSIS ON TEST
Score each test book with TextBlob's pre-trained sentiment analyzer, used here as a black box.

In [None]:
# Per-book features from TextBlob: word count, sentence count, average words
# per sentence, polarity and subjectivity. One row is appended per test book.
sentiment_test = []
for file_link in test_files:
    # Read inside a context manager so each handle is closed as soon as the
    # text is loaded, instead of leaving one open file per book.
    with open(file_link) as book_file:
        book = TextBlob(book_file.read())
    word_count = len(book.words)
    sentence_count = len(book.sentences)
    # A file with no sentences would divide by zero and abort the whole run;
    # record NaN for its average sentence length instead.
    avg_len = word_count / sentence_count if sentence_count else float('nan')
    # book.sentiment is a (polarity, subjectivity) pair.
    polarity, subjectivity = book.sentiment
    sentiment_test.append([word_count, sentence_count, avg_len, polarity, subjectivity])
    print(file_link, '--done--')

In [None]:
# Turn the collected feature rows into a table, then prepend the identifying
# columns from test_list. Inserting at positions 0, 1, 2 gives the column
# order book_title, author_name, book_location, followed by the features.
test_feature_names = ['word_count', 'sentence_count', 'sentence_length', 'polarity', 'subjectivity']
df_test = pd.DataFrame(sentiment_test, columns=test_feature_names)
test_id_columns = [
    ('book_title', 'title'),
    ('author_name', 'author'),
    ('book_location', 'file_extension'),
]
for position, (new_name, source_name) in enumerate(test_id_columns):
    df_test.insert(loc=position, column=new_name, value=test_list[source_name])

In [None]:
# (rows, columns) of the test feature table.
df_test.shape

In [None]:
# Preview the first rows of the test feature table.
df_test.head()

In [None]:
# Persist the test features; the DataFrame index is written as the first CSV column.
df_test.to_csv('../data/sentiment_test.csv')

### 3. EXPLORATORY ANALYSIS OF SENTIMENT

In [None]:
# Reload the saved training features, discarding the 'Unnamed: 0' column
# that holds the index written by to_csv.
df_1 = pd.read_csv('../data/sentiment_train.csv').drop(columns='Unnamed: 0')
df_1.shape

In [None]:
# Reload the saved test features, discarding the 'Unnamed: 0' column
# that holds the index written by to_csv.
df_2 = pd.read_csv('../data/sentiment_test.csv').drop(columns='Unnamed: 0')
df_2.shape

In [None]:
# Stack the train and test feature tables into one frame for exploration.
# ignore_index=True renumbers the rows 0..n-1; without it both halves keep
# their own 0-based labels, so every label in the overlap appears twice.
df = pd.concat([df_1, df_2], axis=0, ignore_index=True)
print(df.shape)
df.head()

In [None]:
# Per-author mean of every numeric feature, ordered by polarity (highest first).
# numeric_only=True keeps the text columns (book_title, book_location) out of
# the aggregation; pandas >= 2.0 raises TypeError when asked to average them.
gp1 = (
    df.groupby(['author_name'])
    .mean(numeric_only=True)
    .sort_values(by='polarity', ascending=False)
    .reset_index()
)
# send to csv for presentation
gp1.to_csv('../images/polarity.csv')

In [None]:
# Preview the first rows of the polarity-sorted author table.
gp1.head()

In [None]:
# Plotting libraries and notebook display settings used by the charts below.
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Render figures inline in the notebook, in SVG format.
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
# Use seaborn's "white" style for all subsequent plots.
sns.set_style("white")

In [None]:
# Bar chart of mean polarity per author, saved as a transparent SVG.
fig, ax = plt.subplots(figsize=[18, 4])
# Draw on the axes created above instead of relying on the implicit current axes.
sns.barplot(x="author_name", y="polarity", data=gp1, color='#02d8e9', ax=ax)
# Style the tick labels only after the bars exist: labelling the empty axes
# first attaches the author names to matplotlib's default ticks, not to the bars.
ax.tick_params(axis='x', labelrotation=90, labelsize=7)
ax.set_xlabel("Authors")
ax.set_ylabel("Polarity")
fig.savefig('../images/polarity.svg', format='svg', transparent=True)

In [None]:
# Per-author mean of every numeric feature, ordered by subjectivity (highest first).
# numeric_only=True keeps the text columns (book_title, book_location) out of
# the aggregation; pandas >= 2.0 raises TypeError when asked to average them.
gp2 = (
    df.groupby(['author_name'])
    .mean(numeric_only=True)
    .sort_values(by='subjectivity', ascending=False)
    .reset_index()
)
# send to csv for presentation
gp2.to_csv('../images/subjectivity.csv')

In [None]:
# Preview the first rows of the subjectivity-sorted author table.
gp2.head()

In [None]:
# Bar chart of mean subjectivity per author, saved as a transparent SVG.
fig, ax = plt.subplots(figsize=[18, 4])
# Draw on the axes created above instead of relying on the implicit current axes.
sns.barplot(x="author_name", y="subjectivity", data=gp2, color='#02d8e9', ax=ax)
# Style the tick labels only after the bars exist: labelling the empty axes
# first attaches the author names to matplotlib's default ticks, not to the bars.
ax.tick_params(axis='x', labelrotation=90, labelsize=7)
ax.set_xlabel("Authors")
ax.set_ylabel("Subjectivity")
fig.savefig('../images/subjectivity.svg', format='svg', transparent=True)

In [None]:
# Per-author mean of every numeric feature, ordered by average sentence length
# (longest first). numeric_only=True keeps the text columns (book_title,
# book_location) out of the aggregation; pandas >= 2.0 raises TypeError when
# asked to average them.
gp3 = (
    df.groupby(['author_name'])
    .mean(numeric_only=True)
    .sort_values(by='sentence_length', ascending=False)
    .reset_index()
)
# send to csv for presentation
gp3.to_csv('../images/length.csv')

# Keep the preview as the cell's last expression so the notebook displays it;
# placed before to_csv it was evaluated and silently discarded.
gp3.head()

In [None]:
# Bar chart of average sentence length per author, saved as a transparent SVG.
fig, ax = plt.subplots(figsize=[18, 4])
# Draw on the axes created above instead of relying on the implicit current axes.
sns.barplot(x="author_name", y="sentence_length", data=gp3, color='#02d8e9', ax=ax)
# Let the plot supply its own author labels (taken from gp3) and only restyle
# them. The previous code passed gp2's author order as the labels, which does
# not correspond to the bars drawn from gp3.
ax.tick_params(axis='x', labelrotation=90, labelsize=5)
ax.set_xlabel("Authors")
ax.set_ylabel("Average sentence length")
fig.savefig('../images/length.svg', format='svg', transparent=True)