In [96]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import string
import spacy

In [2]:
# Import the corpus
from nltk.corpus import brown, stopwords

brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [3]:
# Create a DataFrame
df = pd.DataFrame()

In [4]:
# Populate the DataFrame with the fileids
df['ID'] = brown.fileids()

In [5]:
df['ID'].head()

0    ca01
1    ca02
2    ca03
3    ca04
4    ca05
Name: ID, dtype: object

In [6]:
# Populate the DataFrame with the text

text = [] # Create an empty list where the text can be stored
for each in df['ID']:
    txt = brown.words(fileids = [each]) # This returns the text based on the fileid 
    txt = [i.lower() for i in txt]
    txt = ' '.join(txt)
    text.append(txt) # Add the text to the list   

df['text'] = text # Create a new column in the DataFrame

In [7]:
df.head()

Unnamed: 0,ID,text
0,ca01,the fulton county grand jury said friday an in...
1,ca02,"austin , texas -- committee approval of gov. p..."
2,ca03,several defendants in the summerdale police bu...
3,ca04,oslo the most positive element to emerge from ...
4,ca05,east providence should organize its civil defe...


In [8]:
# Populate the DataFrame with the genre of the text

categories = []
for each in df['ID']:
    cat = brown.categories(fileids = [each])
    cat = ' '.join(cat)
    categories.append(cat)

df['genre'] = categories

df.head() # Take a look at the DataFrame to see what we have.

Unnamed: 0,ID,text,genre
0,ca01,the fulton county grand jury said friday an in...,news
1,ca02,"austin , texas -- committee approval of gov. p...",news
2,ca03,several defendants in the summerdale police bu...,news
3,ca04,oslo the most positive element to emerge from ...,news
4,ca05,east providence should organize its civil defe...,news


# Clustering

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'),
                            max_df=0.5,
                            min_df=0.1,
                            lowercase=False)

In [10]:
labels = df.genre
true_k = np.unique(labels).shape[0]

In [11]:
X = tfidf_vectorizer.fit_transform(df['text'])

In [12]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

svd = TruncatedSVD(true_k)
lsa = make_pipeline(svd,Normalizer(copy=False))

In [13]:
X = lsa.fit_transform(X)

In [14]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100)
km.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=15, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [15]:
km.labels_

array([ 7,  9,  3, 14, 10,  7, 14, 14,  3,  8, 12, 12, 12, 12, 12,  4,  4,
        4,  3,  6,  4,  4,  4, 10,  3,  3,  3,  3,  4,  4,  8, 12,  4, 10,
        9,  7, 14, 12, 12, 12, 14, 14,  3,  3,  7, 14, 10, 14, 10, 14, 14,
        8, 10,  1, 14, 10,  1, 10,  3,  9, 14, 14, 14, 14, 14, 14, 14,  1,
       14, 14, 14,  1,  1,  1,  1,  1,  1,  1,  1, 10,  1, 10,  1,  1, 10,
        6, 14,  3, 11,  8,  8,  8,  8,  8,  8, 11,  8, 11, 11, 11,  2,  8,
       11,  8,  8, 10,  2, 14,  1,  4,  3,  2,  5, 12, 12,  6,  6, 13,  2,
        2,  2,  2,  6,  2,  2,  2,  1,  1,  9,  5,  9,  3,  3,  9,  3,  3,
        3,  3,  5,  3,  3,  1,  6, 11,  9,  2,  1, 11,  1,  6,  4,  2,  1,
        3, 14, 11,  6,  6,  6, 11,  6,  2,  6, 14,  6,  9,  2,  2,  6,  4,
        9,  4,  9,  9,  3,  2, 13,  8, 12, 11,  3,  8,  6,  1, 11,  6,  7,
        9, 11,  6, 14, 14, 13,  2,  1, 14,  6, 11,  7, 11,  1, 11, 13,  1,
       11,  6,  1, 11,  5, 14, 11, 11, 11, 11,  1,  6,  6, 11,  5,  1,  4,
        1,  1, 14, 10,  6

In [16]:
from sklearn import metrics

# metric of label giving ground truth
metrics.homogeneity_score(labels, km.labels_)

0.39266299793743681

In [17]:
metrics.completeness_score(labels, km.labels_)

0.3819388602370859

In [18]:
metrics.v_measure_score(labels, km.labels_)

0.38722669280174105

In [19]:
# rand index adjusted for change
metrics.adjusted_rand_score(labels, km.labels_)

0.16796493654276209

In [20]:
df['k'] = km.labels_

pd.crosstab(df['k'], df['genre'])

genre,adventure,belles_lettres,editorial,fiction,government,hobbies,humor,learned,lore,mystery,news,religion,reviews,romance,science_fiction
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,12,3,0,17,0,0,4,0,0,14,0,0,0,24,3
1,0,13,3,1,0,3,4,9,5,0,0,0,11,0,1
2,0,1,0,0,1,9,0,4,6,0,0,1,0,0,0
3,0,0,1,0,8,9,0,5,3,0,9,0,1,0,0
4,0,3,0,1,0,1,0,0,3,2,9,0,0,1,0
5,0,2,0,0,1,3,0,33,0,0,0,0,0,0,0
6,0,13,0,0,0,3,0,2,11,0,1,0,1,0,1
7,0,1,1,0,12,0,0,4,1,0,3,0,0,0,0
8,0,1,1,3,0,0,0,0,2,0,2,10,0,1,0
9,0,2,1,0,1,3,0,6,6,0,2,0,0,0,0


In [21]:
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names()

In [22]:
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :20]:
        print(' %s' % terms[ind], end='')
    print('\n')

Cluster 0: got eyes went thought knew door looked room going told something house face think felt asked woman mother let us

Cluster 1: music art love miss us young performance mother century form book though york experience stage american mr sense english often

Cluster 2: water surface used feet system small state head shown air area body earth light side pressure ground half size 10

Cluster 3: per 000 year industry cost program development system company million 1960 car market service equipment business cent government rate production

Cluster 4: mrs mr house president miss car white room home year family club woman police door wife school office children mother

Cluster 5: surface used system pressure shown number volume type water systems per information material method study range low small obtained possible

Cluster 6: south war north southern john river american company england wrote west english east states 000 century city year united york

Cluster 7: state states federal c

# Unsupervised Learning

In [23]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets

text_train, text_test, categories_train, categories_test = train_test_split(
    text, categories, test_size=0.25, random_state=42, stratify=categories)

In [24]:
pd.DataFrame(categories_test)[0].value_counts()

learned            20
belles_lettres     19
lore               12
news               11
hobbies             9
government          8
fiction             7
romance             7
editorial           7
adventure           7
mystery             6
religion            4
reviews             4
science_fiction     2
humor               2
Name: 0, dtype: int64

In [25]:
# Use sklearn.tfidfvectorizer to vectorize the data

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=False, #convert everything to lower case 
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
train_tfidf=vectorizer.fit_transform(text_train)
print("Number of features: %d" % train_tfidf.get_shape()[1])

test_tfidf = vectorizer.transform(text_test)
print("Number of features: %d" % test_tfidf.get_shape()[1])

Number of features: 18959
Number of features: 18959


In [28]:
# Run a supervised model on the vectorized data

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
fit = rfc.fit(train_tfidf, categories_train)
predict = rfc.predict(test_tfidf)
score = rfc.score(test_tfidf, categories_test)

In [29]:
print('Random Forest Classifier Score: {}'.format(score))

Random Forest Classifier Score: 0.44


In [31]:
# Cross-validate

vector_text = vectorizer.fit_transform(text)

from sklearn.model_selection import cross_val_score

cross_val_rfc = cross_val_score(rfc, vector_text, categories)

In [32]:
print('Cross Validated Scores: {}'.format(cross_val_rfc))
print("Accuracy: %0.2f (+/- %0.2f)" % (cross_val_rfc.mean(), cross_val_rfc.std() * 2))

Cross Validated Scores: [ 0.31360947  0.25443787  0.29012346]
Accuracy: 0.29 (+/- 0.05)


These results are pretty awful.  I am going to analyze the data to see where the models are making mistakes and then adjust the data hopefully make the models more accurate.  

In [33]:
# Examine how many titles are in each genre

print(df['genre'].value_counts())

learned            80
belles_lettres     75
lore               48
news               44
hobbies            36
government         30
fiction            29
romance            29
adventure          29
editorial          27
mystery            24
religion           17
reviews            17
humor               9
science_fiction     6
Name: genre, dtype: int64


In [37]:
# We will drop the bottom four genres because there are not enough titles to adequately
# train the models to recognize them

df1 = df # create a new DataFrame


drop_list = ['science_fiction', 'humor', 'reviews', 'religion']
for each in drop_list:
    df1 = df1[df1.genre != each]

print(df1['genre'].value_counts())

learned           80
belles_lettres    75
lore              48
news              44
hobbies           36
government        30
fiction           29
romance           29
adventure         29
editorial         27
mystery           24
Name: genre, dtype: int64


In [39]:
# Create variables for the text and genre in the new DataFrame
text1 = df1['text']
categories1 = df1['genre']

In [40]:
# Split the data into training and test sets

text_train1, text_test1, categories_train1, categories_test1 = train_test_split(
    text1, categories1, test_size=0.25, random_state=42, stratify=categories1)

In [41]:
#Applying the vectorizer
train_tfidf1 = vectorizer.fit_transform(text_train1)
print("Number of features: %d" % train_tfidf1.get_shape()[1])

test_tfidf1 = vectorizer.transform(text_test1)
print("Number of features: %d" % test_tfidf1.get_shape()[1])

Number of features: 17616
Number of features: 17616


In [42]:
# Run a supervised model on the vectorized data

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
fit1 = rfc.fit(train_tfidf1, categories_train1)
predict1 = rfc.predict(test_tfidf1)
score1 = rfc.score(test_tfidf1, categories_test1)

In [43]:
print('Random Forest Classifier Score: {}'.format(score1))

Random Forest Classifier Score: 0.40707964601769914


In [44]:
# Cross-validate

vector_text1 = vectorizer.fit_transform(text1)

cross_val_rfc1 = cross_val_score(rfc, vector_text1, categories1)

In [45]:
print('Cross Validated Scores: {}'.format(cross_val_rfc1))
print("Accuracy: %0.2f (+/- %0.2f)" % (cross_val_rfc1.mean(), cross_val_rfc1.std() * 2))

Cross Validated Scores: [ 0.34210526  0.30263158  0.27891156]
Accuracy: 0.31 (+/- 0.05)


It appears the model has gotten fractionally better, but it is still pretty awful.  After further examination of the clustered data above, it appears that adventure, fiction, mystery, and romance are frequently clustered together.  We will combine those categories under the genre 'fiction' and rerun the model and hope for increased accuracy.

In [304]:
# Create a new DataFrame

df2 = df1

# Rename all adventure, mystery, and romance titles
replace_list = ['adventure', 'mystery', 'romance']
for each in replace_list:
    df2 = df2.replace(each, 'fiction')

print(df2['genre'].value_counts())

fiction           111
learned            80
belles_lettres     75
lore               48
news               44
hobbies            36
government         30
editorial          27
Name: genre, dtype: int64


In [48]:
# Create variables for the text and genre in the new DataFrame
text2 = df2['text']
categories2 = df2['genre']

In [49]:
# Split the data into training and test sets

text_train2, text_test2, categories_train2, categories_test2 = train_test_split(
    text2, categories2, test_size=0.25, random_state=42, stratify=categories2)

In [50]:
#Applying the vectorizer
train_tfidf2 = vectorizer.fit_transform(text_train2)
print("Number of features: %d" % train_tfidf2.get_shape()[1])

test_tfidf2 = vectorizer.transform(text_test2)
print("Number of features: %d" % test_tfidf2.get_shape()[1])

Number of features: 17461
Number of features: 17461


In [51]:
# Run a supervised model on the vectorized data

fit2 = rfc.fit(train_tfidf2, categories_train2)
predict2 = rfc.predict(test_tfidf2)
score2 = rfc.score(test_tfidf2, categories_test2)

In [52]:
print('Random Forest Classifier Score: {}'.format(score2))

Random Forest Classifier Score: 0.48672566371681414


In [53]:
# Cross-validate

vector_text2 = vectorizer.fit_transform(text2)

cross_val_rfc2 = cross_val_score(rfc, vector_text2, categories2)

In [54]:
print('Cross Validated Scores: {}'.format(cross_val_rfc2))
print("Accuracy: %0.2f (+/- %0.2f)" % (cross_val_rfc2.mean(), cross_val_rfc2.std() * 2))

Cross Validated Scores: [ 0.53642384  0.43046358  0.4966443 ]
Accuracy: 0.49 (+/- 0.09)


Combining adventure, fiction, mystery, and romance resulted in greatly increased model, however the model is still under fifty percent accurate.  More improvements need to be made.

In [55]:
# Lets try adding a stemmer and tokenizer to the vectorizer.

from nltk.stem import PorterStemmer
import string

stemmer = PorterStemmer()

def tokenize_and_stem(text):
    tokens = nltk.tokenize.word_tokenize(text)
    
    # strip out punctuation and make lowercase
    tokens = [token.lower().strip(string.punctuation)
             for token in tokens if token.isalnum()]
    
    # now stem the tokens
    tokens = [stemmer.stem(token) for token in tokens]
    
    return tokens

In [57]:
# Add the tokenize_and_stem function to the vectorizer

vectorizer2 = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=False, #convert everything to lower case 
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True, #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                             tokenizer = tokenize_and_stem
                            )

In [58]:
#Applying the vectorizer
train_tfidf3 = vectorizer2.fit_transform(text_train2)
print("Number of features: %d" % train_tfidf2.get_shape()[1])

test_tfidf3 = vectorizer2.transform(text_test2)
print("Number of features: %d" % test_tfidf2.get_shape()[1])

Number of features: 17461
Number of features: 17461


In [59]:
# Run a supervised model on the vectorized data

fit3 = rfc.fit(train_tfidf3, categories_train2)
predict3 = rfc.predict(test_tfidf3)
score3 = rfc.score(test_tfidf3, categories_test2)

In [60]:
print('Random Forest Classifier Score: {}'.format(score3))

Random Forest Classifier Score: 0.4778761061946903


In [61]:
# Cross-validate

vector_text3 = vectorizer2.fit_transform(text2)

cross_val_rfc3 = cross_val_score(rfc, vector_text3, categories2)

In [62]:
print('Cross Validated Scores: {}'.format(cross_val_rfc3))
print("Accuracy: %0.2f (+/- %0.2f)" % (cross_val_rfc3.mean(), cross_val_rfc3.std() * 2))

Cross Validated Scores: [ 0.53642384  0.40397351  0.48322148]
Accuracy: 0.47 (+/- 0.11)


Adding the tokenize_and_stem function to the vectorizer actually made the model worse.  Looks like I need to find another solution.

In [63]:
# Lets try different supervised learning models and see if we can find a more accurate model
# RandomForestClassifier

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [64]:
# Run a support vector machine classifier

svc = SVC(kernel = 'linear')

fit_svc = svc.fit(train_tfidf2, categories_train2)
predict_svc = svc.predict(test_tfidf2)
score_svc = svc.score(test_tfidf2, categories_test2)

In [65]:
print('Support Vector Classifier Score: {}'.format(score_svc))

Support Vector Classifier Score: 0.6371681415929203


In [67]:
# Cross-validate

vector_text_svc = vectorizer.fit_transform(text2)

cross_val_svc = cross_val_score(svc, vector_text2, categories2)

In [68]:
print('Cross Validated Scores: {}'.format(cross_val_svc))
print("Accuracy: %0.2f (+/- %0.2f)" % (cross_val_svc.mean(), cross_val_svc.std() * 2))

Cross Validated Scores: [ 0.62913907  0.53642384  0.55704698]
Accuracy: 0.57 (+/- 0.08)


In [69]:
# Run a Logistic Regression

lr = LogisticRegression()
fit_lr = lr.fit(train_tfidf2, categories_train2)
predict_lr = lr.predict(test_tfidf2)
score_lr = lr.score(test_tfidf2, categories_test2)

In [70]:
print('Logistic Regression Score: {}'.format(score_lr))

Logistic Regression Score: 0.4778761061946903


In [71]:
# Cross-validate

vector_text_lr = vectorizer.fit_transform(text2)

cross_val_lr = cross_val_score(lr, vector_text2, categories2)

In [72]:
print('Cross Validated Scores: {}'.format(cross_val_lr))
print("Accuracy: %0.2f (+/- %0.2f)" % (cross_val_lr.mean(), cross_val_lr.std() * 2))

Cross Validated Scores: [ 0.52980132  0.42384106  0.46308725]
Accuracy: 0.47 (+/- 0.09)


In [74]:
# Run a K Nearest Neighbors Classifier

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
fit_knn = knn.fit(train_tfidf2, categories_train2)
predict_knn = knn.predict(test_tfidf2)
score_knn = knn.score(test_tfidf2, categories_test2)

In [75]:
print('KNN Classifier Score: {}'.format(score_knn))

KNN Classifier Score: 0.6460176991150443


In [76]:
# Cross-validate

vector_text_knn = vectorizer.fit_transform(text2)

cross_val_knn = cross_val_score(knn, vector_text2, categories2)

In [77]:
print('Cross Validated Scores: {}'.format(cross_val_knn))
print("Accuracy: %0.2f (+/- %0.2f)" % (cross_val_knn.mean(), cross_val_knn.std() * 2))

Cross Validated Scores: [ 0.60927152  0.52980132  0.52348993]
Accuracy: 0.55 (+/- 0.08)


The supervised model that outputted the best results using the vectorized data was the KNN Classifier with a score of 0.646 and a cross-validated score of 0.55 (+/- 0.08).  However, this is still a very poorly performing model.  In order to try to continue to improve the performance of the models, I am going to add more features using the bag-of-words method.  

In [88]:
# Try using a bag-of-words vectorizer--CountVectorizer()

from sklearn.feature_extraction.text import CountVectorizer

count_vec = CountVectorizer(stop_words = 'english',
                           max_df = 0.5,
                           min_df = 2,
                           max_features = 20)

#Applying the vectorizer
train_countvec = count_vec.fit_transform(text_train2)
print("Number of features: %d" % train_countvec.get_shape()[1])

test_countvec = count_vec.transform(text_test2)
print("Number of features: %d" % test_countvec.get_shape()[1])

Number of features: 20
Number of features: 20


In [119]:
text2.head()

0    the fulton county grand jury said friday an in...
1    austin , texas -- committee approval of gov. p...
2    several defendants in the summerdale police bu...
3    oslo the most positive element to emerge from ...
4    east providence should organize its civil defe...
Name: text, dtype: object

In [124]:
rar = nlp(text2[2])

several defendants in the summerdale police burglary trial made statements indicating their guilt at the time of their arrest , judge james b. parsons was told in criminal court yesterday . the disclosure by charles bellows , chief defense counsel , startled observers and was viewed as the prelude to a quarrel between the six attorneys representing the eight former policemen now on trial . bellows made the disclosure when he asked judge parsons to grant his client , alan clements , 30 , a separate trial . bellows made the request while the all-woman jury was out of the courtroom . fears prejudicial aspects `` the statements may be highly prejudicial to my client '' , bellows told the court . `` some of the defendants strongly indicated they knew they were receiving stolen property . it is impossible to get a fair trial when some of the defendants made statements involving themselves and others '' . judge parsons leaned over the bench and inquired , `` you mean some of the defendants ma

In [221]:
# Utility function to create a list of the 10 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(11)]



In [130]:
rar = nlp(df2['text'][0])
bag_of_words(rar)

['``',
 'say',
 'jury',
 'have',
 'county',
 'fulton',
 'election',
 'department',
 'state',
 'fund',
 'city',
 'vote',
 'bond',
 'resolution',
 'mayor',
 'georgia',
 'highway',
 'atlanta',
 'william',
 'court']

In [242]:
rar_list = []
for each in df2['text']:
    rar_list.append(each)
    
rar_list[0]



In [243]:
rar_list1 = []
for each in rar_list:
    t = nlp(each)
    rar_list1.append(t)

TypeError: Argument 'string' has incorrect type (expected str, got int)

In [151]:
common_words = []
for each in rar_list1:
    w = bag_of_words(each)
    for words in w:
        common_words.append(words)

In [154]:
common_words = set(common_words)
len(common_words)

3157

In [156]:
df2['genre'].value_counts()

fiction           111
learned            80
belles_lettres     75
lore               48
news               44
hobbies            36
government         30
editorial          27
Name: genre, dtype: int64

In [312]:
# Create DataFrames that isolate texts be genre

# fiction
df_fic = df2
df_fic = df_fic[df_fic.genre == 'fiction' ]

# learned
df_learn = df2
df_learn = df_learn[df_learn.genre == 'learned']

# belles_lettres
df_bell = df2
df_bell = df_bell[df_bell.genre == 'belles_lettres']

# lore
df_lore = df2
df_lore = df_lore[df_lore.genre == 'lore']

# news
df_news = df2
df_news = df_news[df_news.genre == 'news']

# hobbies
df_hob = df2
df_hob = df_hob[df_hob.genre == 'hobbies']

# government
df_gov = df2
df_gov = df_gov[df_gov.genre == 'government']

# editorial
df_ed = df2
df_ed = df_ed[df_ed.genre == 'editorial']

In [313]:
# Remove the `` from the text because it pops up as one of the most common words for each
# genre.
# Combine all the texts from each genre. Parse each newly created text.

df_fic['text'] = df_fic['text'].str.replace('``', '')
fiction = ''.join(df_fic['text'].tolist())
fiction = nlp(fiction)

df_learn['text'] = df_learn['text'].str.replace('``', '')
learned = ''.join(df_learn['text'].tolist())
learned = nlp(learned)

df_bell['text'] = df_bell['text'].str.replace('``', '')
belles = ''.join(df_bell['text'].tolist())
belles = nlp(belles)

#lore
df_lore['text'] = df_lore['text'].str.replace('``', '')
lore = ''.join(df_lore['text'].tolist())
lore = nlp(lore)

# news
df_news['text'] = df_news['text'].str.replace('``', '')
news = ''.join(df_news['text'].tolist())
news = nlp(news)

#hob
df_hob['text'] = df_hob['text'].str.replace('``', '')
hobbies = ''.join(df_hob['text'].tolist())
hobbies = nlp(hobbies)

df_gov['text'] = df_gov['text'].str.replace('``', '')
government= ''.join(df_gov['text'].tolist())
government = nlp(government)

df_ed['text'] = df_ed['text'].str.replace('``', '')
editorial = ''.join(df_ed['text'].tolist())
editorial = nlp(editorial)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/ind

In [314]:
# Run all the parsed texts through the bag-of-words function to find the most common words
# for each genre.

fiction_bow = bag_of_words(fiction)
learned_bow = bag_of_words(learned)
belles_bow = bag_of_words(belles)
lore_bow = bag_of_words(lore)
news_bow = bag_of_words(news)
hobbies_bow = bag_of_words(hobbies)
government_bow = bag_of_words(government)
editorial_bow = bag_of_words(editorial)

In [315]:
# Combine the bags to create a set of unique words

common_words = set(fiction_bow + learned_bow + belles_bow + lore_bow + news_bow + hobbies_bow + government_bow + editorial_bow)

In [316]:
# Parse each of the texts individually

# Fiction
rar_list = []
for each in df_fic['text']:
    rar_list.append(each)
rar_list1 = []
for each in rar_list:
    t = nlp(each)
    rar_list1.append(t)
df_fic['text'] = rar_list1

# Learned
star_list = []
for each in df_learn['text']:
    star_list.append(each)
sun_list = []
for each in star_list:
    t = nlp(each)
    sun_list.append(t)
df_learn['text'] = sun_list

# Belles Lettres
star_list1 = []
for each in df_bell['text']:
    star_list1.append(each)
sun_list1 = []
for each in star_list1:
    t = nlp(each)
    sun_list1.append(t)
df_bell['text'] = sun_list1

# Lore
star_list2 = []
for each in df_lore['text']:
    star_list2.append(each)
sun_list2 = []
for each in star_list2:
    t = nlp(each)
    sun_list2.append(t)
df_lore['text'] = sun_list2

# News
star_list3 = []
for each in df_news['text']:
    star_list3.append(each)
sun_list3 = []
for each in star_list3:
    t = nlp(each)
    sun_list3.append(t)
df_news['text'] = sun_list3

# Hobbies
star_list4 = []
for each in df_hob['text']:
    star_list4.append(each)
sun_list4 = []
for each in star_list4:
    t = nlp(each)
    sun_list4.append(t)
df_hob['text'] = sun_list4

# Government
star_list5 = []
for each in df_gov['text']:
    star_list5.append(each)
sun_list5 = []
for each in star_list5:
    t = nlp(each)
    sun_list5.append(t)
df_gov['text'] = sun_list5

# Editorials
star_list6 = []
for each in df_ed['text']:
    star_list6.append(each)
sun_list6 = []
for each in star_list6:
    t = nlp(each)
    sun_list6.append(t)
df_ed['text'] = sun_list6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stabl

In [333]:
df_new = pd.DataFrame()

In [334]:
df_new

In [335]:
df_new = pd.concat([df_fic, df_learn, df_bell, df_lore, df_news, df_hob, df_gov, df_ed])

In [339]:
df_new.head(20)

Unnamed: 0,ID,text,genre,k
374,ck01,"(thirty, -, three, scotty, did, not, go, back,...",fiction,0
375,ck02,"(where, their, sharp, edges, seemed, restless,...",fiction,13
376,ck03,"(mickie, sat, over, his, second, whisky, -, on...",fiction,0
377,ck04,"(the, bishop, looked, at, him, coldly, and, sa...",fiction,8
378,ck05,"(payne, dismounted, in, madison, place, and, h...",fiction,0
379,ck06,"(with, a, sneer, ,, the, man, spread, his, leg...",fiction,8
380,ck07,"(if, the, crummy, bastard, could, write, !, !,...",fiction,0
381,ck08,"(rousseau, is, so, persuasive, that, voltaire,...",fiction,1
382,ck09,"(it, was, the, first, time, any, of, us, had, ...",fiction,13
383,ck10,"(that, summer, the, gambling, houses, were, cl...",fiction,8


In [340]:
# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df_bow = pd.DataFrame(columns=common_words)
    df_bow['text'] = df_new['text']
    df_bow.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, corp in enumerate(df_bow['text']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in corp
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df_bow.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df_bow


In [None]:
for word in words:
    df_bow.loc[i, word] += 1

In [341]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(common_words)
word_counts.head()

Processing row 0


Unnamed: 0,year,mr,not,world,1,think,point,new,mrs,president,...,place,school,have,system,like,look,come,people,2,text
374,1578,696,1926,701,536,1005,599,1457,526,437,...,769,681,5745,528,1300,958,1433,792,826,"(thirty, -, three, scotty, did, not, go, back,..."
375,1578,696,1926,701,536,1005,599,1457,526,437,...,769,681,5745,528,1300,958,1433,792,826,"(where, their, sharp, edges, seemed, restless,..."
376,1578,696,1926,701,536,1005,599,1457,526,437,...,769,681,5745,528,1300,958,1433,792,826,"(mickie, sat, over, his, second, whisky, -, on..."
377,1578,696,1926,701,536,1005,599,1457,526,437,...,769,681,5745,528,1300,958,1433,792,826,"(the, bishop, looked, at, him, coldly, and, sa..."
378,1578,696,1926,701,536,1005,599,1457,526,437,...,769,681,5745,528,1300,958,1433,792,826,"(payne, dismounted, in, madison, place, and, h..."


384