In [37]:
import pandas as pd
import matplotlib.pyplot as plt
import unicodedata
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.probability import FreqDist
from empath import Empath

%matplotlib inline

nltk.download('stopwords')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\breno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\breno\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [38]:
df = pd.read_csv('philosophy_data.csv')
df = df[['title', 'author', 'school', 'sentence_lowered', 'original_publication_date', 'sentence_length']]
df['century'] = df['original_publication_date'].apply(lambda x: x // 100 + 1 if x >= 0 else x // 100) 
df.head(5)

Unnamed: 0,title,author,school,sentence_lowered,original_publication_date,sentence_length,century
0,Plato - Complete Works,Plato,plato,"what's new, socrates, to make you leave your ...",-350,125,-4
1,Plato - Complete Works,Plato,plato,surely you are not prosecuting anyone before t...,-350,69,-4
2,Plato - Complete Works,Plato,plato,the athenians do not call this a prosecution b...,-350,74,-4
3,Plato - Complete Works,Plato,plato,what is this you say?,-350,21,-4
4,Plato - Complete Works,Plato,plato,"someone must have indicted you, for you are no...",-350,101,-4


In [39]:
df['title'] = df['title'].str.lower()
df['author'] = df['author'].str.lower()
df['school'] = df['school'].str.lower()
df['sentence_lowered'] = df['sentence_lowered'].str.lower()
df = df.drop_duplicates()
df

Unnamed: 0,title,author,school,sentence_lowered,original_publication_date,sentence_length,century
0,plato - complete works,plato,plato,"what's new, socrates, to make you leave your ...",-350,125,-4
1,plato - complete works,plato,plato,surely you are not prosecuting anyone before t...,-350,69,-4
2,plato - complete works,plato,plato,the athenians do not call this a prosecution b...,-350,74,-4
3,plato - complete works,plato,plato,what is this you say?,-350,21,-4
4,plato - complete works,plato,plato,"someone must have indicted you, for you are no...",-350,101,-4
...,...,...,...,...,...,...,...
360803,"women, race, and class",davis,feminism,but the socialization of housework including m...,1981,142,20
360804,"women, race, and class",davis,feminism,the only significant steps toward endingdomest...,1981,117,20
360805,"women, race, and class",davis,feminism,"working women, therefore, have a special and v...",1981,90,20
360806,"women, race, and class",davis,feminism,"moreover, under capitalism, campaigns for jobs...",1981,199,20


In [40]:
stop_words = stopwords.words('english')
other_stop_words = ['thy', 
                    'one', 
                    'two', 
                    'may', 
                    'would', 
                    'might', 
                    'two', 
                    'also', 
                    'must', 
                    'upon', 
                    'though', 
                    'often', 
                    'therefore',
                    'thus', 
                    'first',
                    'even', 
                    'thou',
                    'ye',
                    'unto',
                    'thee',
                    'either',
                    'doth',
                    'whatsoever',
                    'say',
                    'something',
                    'since',
                    'much',
                    'yet',
                    'another',
                    'every',
                    'without',
                    'merely',
                    'thing',
                    'things',
                    'like',
                    'great',
                    'still',
                    'however',
                    'way',
                    'say',
                    'us',
                    'something',
                    'said',
                    'well',
                    'without',
                    'see',
                    'thyself',
                    'shall',
                    'hath',
                    'many',
                    'anything']
stop_words = stop_words + other_stop_words

In [41]:
def normalize_text(txt):
    return ' '.join([word for word in word_tokenize(txt) if word not in stop_words and word.isalpha()])

df['normalized_sentence'] = df.apply(lambda row: normalize_text(str(row['sentence_lowered'])), axis = 1)
df['normalized_words_count'] = df['normalized_sentence'].apply(lambda word: len(word.split()))
df.head(5)

Unnamed: 0,title,author,school,sentence_lowered,original_publication_date,sentence_length,century,normalized_sentence,normalized_words_count
0,plato - complete works,plato,plato,"what's new, socrates, to make you leave your ...",-350,125,-4,new socrates make leave usual haunts lyceum sp...,12
1,plato - complete works,plato,plato,surely you are not prosecuting anyone before t...,-350,69,-4,surely prosecuting anyone king archon,5
2,plato - complete works,plato,plato,the athenians do not call this a prosecution b...,-350,74,-4,athenians call prosecution indictment euthyphro,5
3,plato - complete works,plato,plato,what is this you say?,-350,21,-4,,0
4,plato - complete works,plato,plato,"someone must have indicted you, for you are no...",-350,101,-4,someone indicted going tell indicted someone else,7


In [42]:
sid = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    scores = sid.polarity_scores(text)
    return scores['compound']

df['sentiment_score'] = df['sentence_lowered'].apply(lambda x: get_sentiment_score(x))
df

Unnamed: 0,title,author,school,sentence_lowered,original_publication_date,sentence_length,century,normalized_sentence,normalized_words_count,sentiment_score
0,plato - complete works,plato,plato,"what's new, socrates, to make you leave your ...",-350,125,-4,new socrates make leave usual haunts lyceum sp...,12,-0.2960
1,plato - complete works,plato,plato,surely you are not prosecuting anyone before t...,-350,69,-4,surely prosecuting anyone king archon,5,0.4404
2,plato - complete works,plato,plato,the athenians do not call this a prosecution b...,-350,74,-4,athenians call prosecution indictment euthyphro,5,-0.3346
3,plato - complete works,plato,plato,what is this you say?,-350,21,-4,,0,0.0000
4,plato - complete works,plato,plato,"someone must have indicted you, for you are no...",-350,101,-4,someone indicted going tell indicted someone else,7,0.0000
...,...,...,...,...,...,...,...,...,...,...
360803,"women, race, and class",davis,feminism,but the socialization of housework including m...,1981,142,20,socialization housework including meal prepara...,13,0.8462
360804,"women, race, and class",davis,feminism,the only significant steps toward endingdomest...,1981,117,20,significant steps toward endingdomestic slaver...,10,-0.6124
360805,"women, race, and class",davis,feminism,"working women, therefore, have a special and v...",1981,90,20,working women special vital interest struggle ...,7,0.6808
360806,"women, race, and class",davis,feminism,"moreover, under capitalism, campaigns for jobs...",1981,199,20,moreover capitalism campaigns jobs equal basis...,18,0.4939


In [43]:
lexicon = Empath()

In [44]:
lexicon.analyze(df['sentence_lowered'][1], normalize=True)

{'help': 0.0,
 'office': 0.0,
 'dance': 0.0,
 'money': 0.0,
 'wedding': 0.0,
 'domestic_work': 0.0,
 'sleep': 0.0,
 'medical_emergency': 0.0,
 'cold': 0.0,
 'hate': 0.0,
 'cheerfulness': 0.0,
 'aggression': 0.0,
 'occupation': 0.0,
 'envy': 0.0,
 'anticipation': 0.0,
 'family': 0.0,
 'vacation': 0.0,
 'crime': 0.0,
 'attractive': 0.0,
 'masculine': 0.0,
 'prison': 0.0,
 'health': 0.0,
 'pride': 0.0,
 'dispute': 0.0,
 'nervousness': 0.0,
 'government': 0.0,
 'weakness': 0.0,
 'horror': 0.0,
 'swearing_terms': 0.0,
 'leisure': 0.0,
 'suffering': 0.0,
 'royalty': 0.07692307692307693,
 'wealthy': 0.0,
 'tourism': 0.0,
 'furniture': 0.0,
 'school': 0.0,
 'magic': 0.0,
 'beach': 0.0,
 'journalism': 0.0,
 'morning': 0.0,
 'banking': 0.0,
 'social_media': 0.0,
 'exercise': 0.0,
 'night': 0.0,
 'kill': 0.0,
 'blue_collar_job': 0.0,
 'art': 0.0,
 'ridicule': 0.0,
 'play': 0.0,
 'computer': 0.0,
 'college': 0.0,
 'optimism': 0.07692307692307693,
 'stealing': 0.0,
 'real_estate': 0.0,
 'home': 0.0

In [45]:
df['sentence_words_count'] = df['sentence_lowered'].apply(lambda word: len(word.split()))
df

Unnamed: 0,title,author,school,sentence_lowered,original_publication_date,sentence_length,century,normalized_sentence,normalized_words_count,sentiment_score,sentence_words_count
0,plato - complete works,plato,plato,"what's new, socrates, to make you leave your ...",-350,125,-4,new socrates make leave usual haunts lyceum sp...,12,-0.2960,23
1,plato - complete works,plato,plato,surely you are not prosecuting anyone before t...,-350,69,-4,surely prosecuting anyone king archon,5,0.4404,13
2,plato - complete works,plato,plato,the athenians do not call this a prosecution b...,-350,74,-4,athenians call prosecution indictment euthyphro,5,-0.3346,12
3,plato - complete works,plato,plato,what is this you say?,-350,21,-4,,0,0.0000,5
4,plato - complete works,plato,plato,"someone must have indicted you, for you are no...",-350,101,-4,someone indicted going tell indicted someone else,7,0.0000,19
...,...,...,...,...,...,...,...,...,...,...,...
360803,"women, race, and class",davis,feminism,but the socialization of housework including m...,1981,142,20,socialization housework including meal prepara...,13,0.8462,22
360804,"women, race, and class",davis,feminism,the only significant steps toward endingdomest...,1981,117,20,significant steps toward endingdomestic slaver...,10,-0.6124,17
360805,"women, race, and class",davis,feminism,"working women, therefore, have a special and v...",1981,90,20,working women special vital interest struggle ...,7,0.6808,14
360806,"women, race, and class",davis,feminism,"moreover, under capitalism, campaigns for jobs...",1981,199,20,moreover capitalism campaigns jobs equal basis...,18,0.4939,28


In [46]:
'''aux = test['lexical_categories'][1]
aux = aux.replace('"', '%')
aux = aux.replace("'", '"')
aux = aux.replace("%", "'")
aux'''

'aux = test[\'lexical_categories\'][1]\naux = aux.replace(\'"\', \'%\')\naux = aux.replace("\'", \'"\')\naux = aux.replace("%", "\'")\naux'

In [47]:
'''import json
dictionary = json.loads(aux)
dictionary'''

'import json\ndictionary = json.loads(aux)\ndictionary'

In [48]:
df = df[['title', 'author', 'school', 'century', 'sentence_lowered', 'sentence_words_count', 'normalized_sentence', 'sentiment_score']]
df

Unnamed: 0,title,author,school,century,sentence_lowered,sentence_words_count,normalized_sentence,sentiment_score
0,plato - complete works,plato,plato,-4,"what's new, socrates, to make you leave your ...",23,new socrates make leave usual haunts lyceum sp...,-0.2960
1,plato - complete works,plato,plato,-4,surely you are not prosecuting anyone before t...,13,surely prosecuting anyone king archon,0.4404
2,plato - complete works,plato,plato,-4,the athenians do not call this a prosecution b...,12,athenians call prosecution indictment euthyphro,-0.3346
3,plato - complete works,plato,plato,-4,what is this you say?,5,,0.0000
4,plato - complete works,plato,plato,-4,"someone must have indicted you, for you are no...",19,someone indicted going tell indicted someone else,0.0000
...,...,...,...,...,...,...,...,...
360803,"women, race, and class",davis,feminism,20,but the socialization of housework including m...,22,socialization housework including meal prepara...,0.8462
360804,"women, race, and class",davis,feminism,20,the only significant steps toward endingdomest...,17,significant steps toward endingdomestic slaver...,-0.6124
360805,"women, race, and class",davis,feminism,20,"working women, therefore, have a special and v...",14,working women special vital interest struggle ...,0.6808
360806,"women, race, and class",davis,feminism,20,"moreover, under capitalism, campaigns for jobs...",28,moreover capitalism campaigns jobs equal basis...,0.4939


In [56]:
lexicon = Empath()
df['lexical_categories'] = df['sentence_lowered'].apply(lambda sentence: {key: value for key, value in lexicon.analyze(sentence, normalize=False).items() if value != 0})
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lexical_categories'] = df['sentence_lowered'].apply(lambda sentence: {key: value for key, value in lexicon.analyze(sentence, normalize=False).items() if value != 0})


Unnamed: 0,title,author,school,century,sentence_lowered,sentence_words_count,normalized_sentence,sentiment_score,lexical_categories
0,plato - complete works,plato,plato,-4,"what's new, socrates, to make you leave your ...",23,new socrates make leave usual haunts lyceum sp...,-0.2960,"{'family': 1.0, 'vacation': 1.0, 'leisure': 1...."
1,plato - complete works,plato,plato,-4,surely you are not prosecuting anyone before t...,13,surely prosecuting anyone king archon,0.4404,"{'royalty': 1.0, 'optimism': 1.0, 'medieval': ..."
2,plato - complete works,plato,plato,-4,the athenians do not call this a prosecution b...,12,athenians call prosecution indictment euthyphro,-0.3346,"{'crime': 1.0, 'dispute': 1.0, 'stealing': 1.0..."
3,plato - complete works,plato,plato,-4,what is this you say?,5,,0.0000,{}
4,plato - complete works,plato,plato,-4,"someone must have indicted you, for you are no...",19,someone indicted going tell indicted someone else,0.0000,"{'communication': 1.0, 'order': 1.0, 'speaking..."
...,...,...,...,...,...,...,...,...,...
360803,"women, race, and class",davis,feminism,20,but the socialization of housework including m...,22,socialization housework including meal prepara...,0.8462,"{'help': 1.0, 'money': 1.0, 'domestic_work': 2..."
360804,"women, race, and class",davis,feminism,20,the only significant steps toward endingdomest...,17,significant steps toward endingdomestic slaver...,-0.6124,"{'crime': 1.0, 'dispute': 1.0, 'government': 1..."
360805,"women, race, and class",davis,feminism,20,"working women, therefore, have a special and v...",14,working women special vital interest struggle ...,0.6808,"{'occupation': 1.0, 'weakness': 1.0, 'leisure'..."
360806,"women, race, and class",davis,feminism,20,"moreover, under capitalism, campaigns for jobs...",28,moreover capitalism campaigns jobs equal basis...,0.4939,"{'family': 1.0, 'play': 1.0, 'heroic': 1.0, 'f..."


In [57]:
df.to_csv('data_prep.csv', index=False)

In [62]:
df['school'].unique()

array(['plato', 'aristotle', 'empiricism', 'rationalism', 'analytic',
       'continental', 'phenomenology', 'german_idealism', 'communism',
       'capitalism', 'stoicism', 'nietzsche', 'feminism'], dtype=object)