In [42]:
from my_functions import text_reader
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE
import numpy as np
from sklearn.preprocessing import Normalizer, StandardScaler
from textblob import TextBlob

In [43]:
def clean_sentences(lines):
    """Remove numbers and punctuation, and standardize case

    Keyword Arguments:
    lines: string of text"""

    # import
    import re
    from nltk.corpus import stopwords

    # create set of stop words
    stop = set(stopwords.words('english'))

    lower_characters = lines.lower()
    approved_words = []
    white_list = set('abcdefghijklmnopqrstuvwxyz ')

    for word in lower_characters.split():
        if word not in stop:
            clean_word = re.sub(r'[^a-z ]+', '', word)
            approved_words.append(clean_word)
    return approved_words

In [44]:
def clean_remove_stops(lines):
    """Remove numbers and punctuation, and standardize case

    Keyword Arguments:
    lines: string of text"""

    # import
    import re
    from nltk.corpus import stopwords

    # create set of stop words
    stop = set(stopwords.words('english'))

    lower_characters = lines.lower()
    approved_words = []
    white_list = set('abcdefghijklmnopqrstuvwxyz ')

    for word in lower_characters.split():
        if word not in stop:
            clean_word = re.sub(r'[^a-z ]+', '', word)
            approved_words.append(clean_word)
    return " ".join(approved_words)

In [45]:
def print_top_words (model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [46]:
df = pd.read_csv("/Users/BaileyDanielson/Documents/Python/NLP_Practice/final_df.csv")

In [47]:
df.head()

Unnamed: 0,Sentence,Section,Book_Title,Author,Index,Polarity,Subjectivity,Word_Count
0,Coetzee In the Heart of the Country.,0,In the Heart of the Country,J.M. Coetzee,0,0.0,0.0,7
1,Today my father brought home his new bride.,1,In the Heart of the Country,J.M. Coetzee,1,0.136364,0.454545,8
2,They came clip-clop across the flats in a dog...,1,In the Heart of the Country,J.M. Coetzee,2,-0.225,0.5,25
3,Or perhaps they were drawn by two plumed donk...,1,In the Heart of the Country,J.M. Coetzee,3,0.0,1.0,13
4,My father wore his black swallowtail coat and...,1,In the Heart of the Country,J.M. Coetzee,4,-0.115079,0.239683,24


# get list of sentences

In [48]:
# for one book only: 
IHC_sentences = list(df[df["Book_Title"] == "In the Heart of the Country"]["Sentence"])
IHC_sentences[10]

'And then, for a third, there is the new wife, who lies late abed.'

In [49]:
SL_sentences = list(df[df["Book_Title"] == "Shadow Lines"]["Sentence"])
SL_sentences[10]

'It wasn’t easy, for to me he looked old, impossibly old, and I could not remember him looking anything other than old – though, in  fact, at that time he could not have been much older than twenty-nine.'

In [50]:
sentences = list(df["Sentence"])

In [51]:
sentences[10]

'And then, for a third, there is the new wife, who lies late abed.'

In [52]:
clean_sentences = [clean_remove_stops(sent) for sent in sentences]
clean_sentences[10]

'then third new wife lies late abed'

In [53]:
vectorizer = CountVectorizer()

In [54]:
sent_vector = vectorizer.fit_transform(clean_sentences)

In [55]:
np.shape(sent_vector)

(9558, 12059)

In [56]:
nmf = NMF(n_components=2, random_state=42)

dtm_nmf = nmf.fit_transform(sent_vector)
dtm_nmf = Normalizer(copy=False).fit_transform(dtm_nmf)

In [57]:
print_top_words(nmf, vectorizer.get_feature_names(), 100)


Topic #0:
would one like could back me see house time us room day it her perhaps old tridib way come go know ila say father away eyes tell him even never little look around much still think though head may man nothing face long must knew grandmother something people last them went then going two mother first told black came years later hands find road left world without hendrik end that whether words every now ever night door behind take hand looked get calcutta life didnt place used woman bed always days seemed moment another make hair often made myself home

Topic #1:
said ila dont know its me grandmother go may yes right tridib thats head it robi well im you voice now no come going nick that course father there oh little youre hand mrs told time here must then smiling cant remember ill mother turned theres first looking please old price look on wasnt nothing her tell much hes got face calcutta thing think us something didnt mayadebi later him see laughing take malik back story like

In [58]:
IHC_clean = [clean_remove_stops(sent) for sent in IHC_sentences]

In [59]:
SL_clean = [clean_remove_stops(sent) for sent in SL_sentences]

In [60]:
IHC_vector = vectorizer.fit_transform(IHC_clean)

In [61]:
SL_vector = vectorizer.fit_transform(SL_clean)

In [62]:
nmf1 = NMF(n_components=5, random_state=42)

dtm1_nmf = nmf1.fit_transform(IHC_vector)
dtm1_nmf = Normalizer(copy=False).fit_transform(dtm1_nmf)

In [63]:
print_top_words(nmf1, vectorizer.get_feature_names(), 30)


Topic #0:
conakry lutfullah opposite lay jumps screens gazette statesman celebratory hid discovering kindness kissed barking curls label hostel floundering averted famine complaining stern calm edition halfhour officials ducked country alphabet alan

Topic #1:
inhabit step froze heater rushed bore provoke label edition sipping greying sombre screens been fragrance halfhour eighteenthcentury celebratory kindness inching bachao argument allowed forming indians storm arms pale backboneless betrayed

Topic #2:
stiff house been recalled scheme barking income church doubtful ducked dust sombre greying hundreds forks linger flown pale stay hotelier pealed soft fraud fuselage fuss schoolwork defeated saying medium pouting

Topic #3:
leading arms jumps halfhour santoshpur conakry sparkled intimacy jutting conceived inching saris idea step alphabet argument favourite bore say stay pealed leads edition feast majority squirmed saying indeterminate stern retired

Topic #4:
france allowed ducked st

In [64]:
nmf2 = NMF(n_components=3, random_state=42)

dtm2_nmf = nmf2.fit_transform(SL_vector)
dtm2_nmf = Normalizer(copy=False).fit_transform(dtm2_nmf)

In [65]:
print_top_words(nmf2, vectorizer.get_feature_names(), 30)


Topic #0:
could like back see tridib one us ila house her room time me tell way may around it old little much grandmother told father later go eyes know look went

Topic #1:
said dont its know go yes grandmother right thats ila im well robi no voice head me may you it now course come going oh that youre there father hand

Topic #2:
would say go often look grandmother people ask try knew going come think it sometimes him away wonder turn get house always whether every hurry hands find know first happened



In [66]:
sample = "would say go often look grandmother people ask try knew going come think it sometimes him away wonder turn get house always whether every hurry hands find know first happened"