### Importing libraries

In [24]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import spacy as sp

from textblob import TextBlob
from langdetect import DetectorFactory, detect, detect_langs

### Setting up the NLP pipeline

In [25]:
# Load spaCy's small English pipeline once; the later nlp(...) calls in this
# notebook reuse this object.
nlp = sp.load("en_core_web_sm")

### Test on a sample text

In [None]:
# Run the pipeline over a short multi-sentence sample as a smoke test.
text = (
    "When Sebastian Thrun started working on self-driving cars at Google in "
    "2007, few people outside of the company took him seriously. "
    "“I can tell you very senior CEOs of major American car companies would "
    "shake my hand and turn away because I wasn’t worth talking to,” said "
    "Thrun, in an interview with Recode earlier this week."
)
doc = nlp(text)

# Syntax: collect the noun chunks and the lemma of every verb token.
noun_phrases = [chunk.text for chunk in doc.noun_chunks]
verb_lemmas = [token.lemma_ for token in doc if token.pos_ == "VERB"]
print("Noun phrases:", noun_phrases)
print("Verbs:", verb_lemmas)

# Named entities: print each entity's text alongside its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

### Load the data

In [None]:
# Load the Project Gutenberg books table and preview the first rows.
data = pd.read_csv('project-gutenberg-books.csv')
data.head()

In [None]:
# List the distinct values found in the Subject column.
data['Subject'].unique()

In [None]:
# Summarise the columns (dtypes and non-null counts) before cleaning.
data.info()

In [None]:
# Remove the 'Unnamed: 0' and 'Date' columns in place, then preview the result.
data.drop(columns=['Unnamed: 0', 'Date'], inplace=True)
data.head()

In [None]:
# Convert every cell to its string form so the string operations used further
# down (find / lower / slicing) work on every value; a missing value becomes
# the text 'nan'.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so use map when this pandas version provides it and fall back otherwise.
if hasattr(pd.DataFrame, "map"):
    data = data.map(str)
else:
    data = data.applymap(str)

In [None]:
# Keep only the first row for each (Author, Title) pair, then re-check the
# table summary.
dedupe_keys = ['Author', 'Title']
data.drop_duplicates(subset=dedupe_keys, keep='first', inplace=True)
data.info()

In [None]:
# Inspect the distinct author spellings before normalising them.
data['Author'].unique()

In [None]:
# Probe how the pipeline tags a title followed by two dates.
sample = "Mercy Killing 5 May 2019 11 March 2021 "
doc = nlp(sample)

for ent in doc.ents:
    # Character offsets of the entity within the sample string.
    span = (ent.start_char, ent.end_char)
    print(ent.text, *span, ent.label_)

In [None]:
# Map variant spellings of an author's name onto one canonical spelling.
author_aliases = {
    "Friedrich Nietzsche": "Friedrich Wilhelm Nietzsche",
    "F. W. Nietzsche": "Friedrich Wilhelm Nietzsche",
    "Friedrich Nietzsche.": "Friedrich Wilhelm Nietzsche",
    "graf Leo Tolstoy": "Leo Tolstoy",
    "Grant Hague": "W. Grant Hague",
}
# Assign the result back rather than calling replace(..., inplace=True) on
# data["Author"]: that chained in-place call works on an intermediate Series
# and, with pandas Copy-on-Write enabled, leaves `data` itself unchanged.
data["Author"] = data["Author"].replace(author_aliases)

In [None]:
# Re-check the distinct author names after the replacement step.
data['Author'].unique()

In [None]:
# Cut each title just after its first carriage return; titles without one are
# left untouched.
# NOTE(review): the '\r' itself is kept at the end of the truncated title —
# confirm that this is intended.
data['Title'] = data['Title'].apply(
    lambda title: ''.join(title.partition('\r')[:2])
)
data['Title'].unique()

In [None]:
def cut_text(text_str: str, text_tl: str) -> str:
    """Drop everything in ``text_str`` before the first occurrence of ``text_tl``.

    The title is matched literally and case-insensitively. The match position
    is taken from the original string, not from a lower-cased copy:
    lower-casing can change a string's length (e.g. 'İ' becomes two code
    points), which would shift the cut point.

    Args:
        text_str: Full text to trim.
        text_tl: Title to look for inside ``text_str``.

    Returns:
        ``text_str`` starting at the first match of ``text_tl``, or
        ``text_str`` unchanged when the title does not occur.
    """
    import re  # local import so the function does not rely on a later cell

    # re.escape: the title is plain text, not a pattern.
    match = re.search(re.escape(text_tl), text_str, flags=re.IGNORECASE)
    if match is None:
        return text_str
    return text_str[match.start():]

In [None]:
# Trim every book's text so it starts at the first mention of its own title.
trimmed_text = data.apply(
    lambda row: cut_text(row['Text'], row['Title']),
    axis=1,
)
data['Text'] = trimmed_text
data.head()

In [None]:
# TextBlob.detect_language() depended on a remote translation service; it was
# deprecated in TextBlob 0.16 and is gone from later releases. Use the offline
# langdetect detector that this notebook already imports instead.
DetectorFactory.seed = 0  # langdetect is non-deterministic unless seeded


def _detect_language(sample_text, sample_size=100):
    """Return the language code of the first ``sample_size`` characters.

    Returns 'unknown' when langdetect cannot find any usable features in the
    sample (for example an empty or purely numeric string).
    """
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(sample_text[:sample_size].lower())
    except LangDetectException:
        return 'unknown'


data['Language'] = data['Text'].apply(_detect_language)
data.head()

In [None]:
# Count how many books were assigned to each detected language.
data['Language'].value_counts()

In [None]:
# Remove, in place, every row whose detected language is not English.
non_english = data['Language'] != 'en'
data.drop(index=data.index[non_english], inplace=True)

In [None]:
# Re-count languages to confirm that only 'en' rows remain.
data['Language'].value_counts()

In [None]:
#data.to_excel("Project Gutenberg Texts.xlsx", header=True)

In [None]:
# Load the 1000-Word Philosophy essays, drop the 'Unnamed: 0' column and the
# rows without text, and renumber the index from 0.
philo = pd.read_csv('1000-word-philosophy.csv')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line.
philo.info()
philo.drop(columns=['Unnamed: 0'], inplace=True)
philo.dropna(subset=['Text'], inplace=True)
philo.reset_index(drop=True, inplace=True)
philo.info()
philo.head(5)

In [None]:
import re

# Pull the title, date and author out of the header of the first six essays.
for i in range(0, 6):
    entry = philo['Text'][i]
    sub, aut = len('Submissions '), len('An Introductory Anthology Author:')
    sub0, aut0 = entry.find('Submissions '), entry.find('An Introductory Anthology Author:')
    end, end0 = entry.find('~ 1000 Word Philosophy'), entry.find('Cate')
    # str.find() returns -1 for a missing marker. Test the raw results: once
    # the marker length has been added, the start offsets can never be -1 (or
    # 0), so a missing marker would otherwise slip through unnoticed.
    if -1 in (sub0, aut0, end, end0):
        continue
    # NOTE(review): the extra +1 skips one more character after each marker —
    # presumably a separator in the scraped text; confirm against the data.
    beg, beg0 = sub0 + sub + 1, aut0 + aut + 1
    s = entry[beg:end]
    print("\t", s)
    # The first digit marks where the title ends and the date begins; without
    # any digit the whole span is treated as the title and the date is empty.
    m = re.search(r"\d", s)
    split_at = m.start() if m else len(s)
    # Title
    print(s[:split_at])
    # Date
    print(s[split_at:])
    # Author
    print(entry[beg0:end0])

In [None]:
# Preview the first 1100 characters of essays 0-3.
# NOTE(review): the strings below look like candidate separators between an
# essay's header and its body — confirm.
# Sep
# 'Words:', 'Word count:', ')'
for i in range(4):
    print("\t")
    preview = philo['Text'][i][:1100]
    print(preview)

In [None]:
# Preview the first 1100 characters of essays 46-48.
# NOTE(review): the strings below look like candidate separators between an
# essay's header and its body — confirm.
# Sep
# 'Words:', 'Word count:', ')'
for i in (46, 47, 48):
    print("\t")
    preview = philo['Text'][i][:1100]
    print(preview)