In [1]:
import re
from collections import Counter
from urllib.request import urlopen

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv('data/Quotes.csv')

In [3]:
def get_age(birth, link):
	page = urlopen(link)
	soup = BeautifulSoup(page, 'html.parser')
	age = max(list(filter(lambda x: x < 2021, map(int, re.findall('\d{4}', soup.text))))) - int(birth[:4])
	return age if age != 0 else 2021 - int(birth[:4])

In [5]:
ages = list(map(get_age, df['birthdate'], df['link']))
df.insert(loc=1, column='age', value=ages)

In [6]:
df.groupby(['name', 'age']).head()

Unnamed: 0,name,age,link,birthdate,quote
0,Albert Einstein,129,https://quotes.toscrape.com//author/Albert-Ein...,1879-3-14,The world as we have created it is a process o...
1,J.K. Rowling,47,https://quotes.toscrape.com//author/J-K-Rowling/,1965-7-31,"It is our choices, Harry, that show what we tr..."
2,Albert Einstein,129,https://quotes.toscrape.com//author/Albert-Ein...,1879-3-14,There are only two ways to live your life. One...
3,Jane Austen,165,https://quotes.toscrape.com//author/Jane-Austen/,1775-12-16,"The person, be it gentleman or lady, who has n..."
4,Marilyn Monroe,73,https://quotes.toscrape.com//author/Marilyn-Mo...,1926-6-1,"Imperfection is beauty, madness is genius and ..."
...,...,...,...,...,...
93,E.E. Cummings,68,https://quotes.toscrape.com//author/E-E-Cummings/,1894-10-14,It takes courage to grow up and become who you...
94,Khaled Hosseini,42,https://quotes.toscrape.com//author/Khaled-Hos...,1965-3-4,But better to get hurt by the truth than comfo...
95,Harper Lee,73,https://quotes.toscrape.com//author/Harper-Lee/,1926-4-28,You never really understand a person until you...
96,Madeleine L'Engle,89,https://quotes.toscrape.com//author/Madeleine-...,1918-11-29,You have to write the book that wants to be wr...


In [4]:
df['length'] = df['quote'].apply(lambda x: len(x.split()))

In [5]:
quotes = df.quote.values

In [6]:
bow = CountVectorizer()
bow.fit(quotes)

CountVectorizer()

In [7]:
vocab_counter = Counter(bow.vocabulary_)

In [8]:
vocab_counter.most_common(20)

[('yourself', 662),
 ('yours', 661),
 ('your', 660),
 ('youer', 659),
 ('you', 658),
 ('year', 657),
 ('wrung', 656),
 ('wrote', 655),
 ('written', 654),
 ('writing', 653),
 ('write', 652),
 ('wrap', 651),
 ('would', 650),
 ('worthy', 649),
 ('worth', 648),
 ('world', 647),
 ('work', 646),
 ('wondering', 645),
 ('won', 644),
 ('woman', 643)]

In [9]:
print('Number of unique words: ', len(vocab_counter))

Number of unique words:  663


In [10]:
top20_word = pd.DataFrame(reversed(vocab_counter.most_common(20)), columns=['word', 'feq'])

In [11]:
plt.bar(x='feq', y='word', data=top20_word)
plt.xlim((630, 670))
plt.show()

TypeError: bar() missing 1 required positional argument: 'height'

In [None]:
author = df.name.values


In [None]:
author_counter = Counter(author)

In [None]:
author_counter.most_common(10)

In [None]:
top10_author = pd.DataFrame(reversed(author_counter.most_common(10)), columns=['author', 'total quotes'])


In [None]:
sns.barplot(x='total quotes', y='author', data=top10_author)


In [None]:
df.drop_duplicates(subset='name')

In [None]:
max_idx = np.argmax(df.length.values)
min_idx = np.argmin(df.length.values)
print('The longest quote is:\n', df.quote.values[max_idx])
print('The shortest quote is:\n', df.quote.values[max_idx])
print("Average length is: ", df.length.values.mean())