In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from collections import Counter
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [2]:
stopwords = stopwords.words('english')
stemmer = SnowballStemmer("english")

# Utility functions
def lemma_tokens(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [3]:
# loading files
filenames = ['against_the_gods', 'battle_through_the_heavens', 'desolate_era', 'emperors_domination', 'martial_god_asura', 'martial_world', 'overgeared', 'praise_the_orc', 'sovereign_of_the_three_realms', 'wu_dong_qian_kun']
raw_files = []

for filename in filenames:
    with open('../dataset/' + filename + '.txt', encoding='utf-8') as myfile:
        raw_files.append(myfile.read())

In [4]:
#use extend so it's a big flat list of vocab
totalvocab_lemma = []
totalvocab_tokenized = []
for doc in raw_files:
    allwords_lemma = lemma_tokens(doc) #for each item in doc, tokenize
    totalvocab_lemma.extend(allwords_lemma) #extend the list to one flat array
    
    allwords_tokenized = tokenize_only(doc)
    totalvocab_tokenized.extend(allwords_tokenized)

In [5]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_lemma)
print ('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

there are 302940 items in vocab_frame


In [6]:
vocab_frame.head()

Unnamed: 0,words
﻿yun,﻿yun
che,che
's,'s
conscious,consciousness
gradual,gradually


In [7]:
vectorizer = TfidfVectorizer(max_df=0.75, # drop words that occur in more than 3/4 of the sentence
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=False, #don't convert everything to lower case
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True, #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                             tokenizer=lemma_tokens,
                             ngram_range=(1,3)                             
                            )

# Apply the vectorizer
tfidf_matrix = vectorizer.fit_transform(raw_files)
print(tfidf_matrix.shape)

(10, 11356)


In [8]:
terms = vectorizer.get_feature_names()

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

# K Means

In [129]:
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, SpectralClustering

num_clusters = 6

km = KMeans(n_clusters=num_clusters)

km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [115]:
from sklearn.externals import joblib

#uncomment the below to save your model 
#since I've already run my model I am loading from the pickle

joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [116]:
novels = { 'title': filenames, 'text': raw_files, 'cluster': clusters }

frame = pd.DataFrame(novels, index = [clusters] , columns = ['title', 'text', 'cluster'])

In [46]:
frame['cluster'].value_counts()

3    3
2    2
1    2
0    2
4    1
Name: cluster, dtype: int64

In [117]:
print("Top terms per cluster:")
print('-'*40)

#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    if i != 0:
        print('\n')
    print("Cluster %d words: " % i, end='')
    
    for j, ind in enumerate(order_centroids[i, :10]): #replace 6 with n words per cluster
        if (j == 0):
            print('%s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end='')
        else:
            print(', %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end='')
    print()
    
    print("Cluster %d titles: " % i, end='')
    for j, title in enumerate(frame.loc[i]['title'].values.tolist()):
        if (j == 0):
            print('%s' % title, end='')
        else:
            print(', %s' % title, end='')

Top terms per cluster:
----------------------------------------
Cluster 0 words: quest, warrior, users, blacksmith, greatsword, wolves, game, granting, lord, axe
Cluster 0 titles: overgeared, praise_the_orc

Cluster 1 words: xiao, yan, s, xia, er, clan, t, qi, xiao, cloud
Cluster 1 titles: against_the_gods, battle_through_the_heavens

Cluster 2 words: lin, ming, dong, xiao, wang, tan, yan, seven, lin, yuan
Cluster 2 titles: martial_world, wu_dong_qian_kun

Cluster 3 words: feng, zheng, eastern, yue, qi, pills, grass, alliance, recipe, healing
Cluster 3 titles: martial_god_asura, sovereign_of_the_three_realms

Cluster 4 words: ji, snow, xiantian, clan, lord, cui, immortal, looked, diagram, sword
Cluster 4 titles: desolate_era

Cluster 5 words: li, sects, protector, disciple, saint, emperor, immortal, du, ancient, incense
Cluster 5 titles: emperors_domination

In [79]:
X_train, X_test = train_test_split(tfidf_matrix, test_size =0.4)

# Mean Shift

In [81]:
bandwidth = estimate_bandwidth(X_train.toarray())
ms = MeanShift()
ms.fit(tfidf_matrix.toarray())
clusters = ms.labels_.tolist()
labels = ms.labels_
cluster_centers = ms.cluster_centers_
n_clusters_ = len(np.unique(labels))
print("Number of estimated clusters: {}".format(n_clusters_))

Number of estimated clusters: 1


# Spectral Clustering

In [131]:
n_clusters=5

# Declare and fit the model.
sc = SpectralClustering(n_clusters=n_clusters)
sc.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [132]:
novels = { 'title': filenames, 'text': raw_files, 'cluster': clusters }

frame = pd.DataFrame(novels, index = [clusters] , columns = ['title', 'text', 'cluster'])

In [138]:
vocab_frame.head(10)

Unnamed: 0,words
﻿yun,﻿yun
che,che
's,'s
conscious,consciousness
gradual,gradually
awaken,awakened
what,what
's,'s
go,going
on,on


In [120]:
print("Top terms per cluster:")
print('-'*40)

#sort cluster centers by proximity to centroid
order_centroids = sc.affinity_matrix_.argsort()[:, ::-1] 

for i in range(num_clusters):
    if i != 0:
        print('\n')
    print("Cluster %d words: " % i, end='')
    
    for j, ind in enumerate(order_centroids[i, :10]): #replace 6 with n words per cluster
        if (j == 0):
            print('%s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end='')
        else:
            print(', %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0], end='')
    print()
    
    print("Cluster %d titles: " % i, end='')
    for j, title in enumerate(frame.loc[i]['title'].values.tolist()):
        if (j == 0):
            print('%s' % title, end='')
        else:
            print(', %s' % title, end='')

Top terms per cluster:
----------------------------------------
Cluster 0 words: 'a, 'd, 'm, 'd, 'i, 'm, 'm, 'm, 'm, 'm
Cluster 0 titles: overgeared, praise_the_orc

Cluster 1 words: 'd, 'a, 'm, 'm, 'm, 'd, 'm, 'i, 'm, 'm
Cluster 1 titles: against_the_gods, battle_through_the_heavens

Cluster 2 words: 'd, 'm, 'm, 'i, 'm, 'a, 'd, 'm, 'm, 'm
Cluster 2 titles: martial_world, wu_dong_qian_kun

Cluster 3 words: 'i, 'm, 'm, 'd, 'a, 'm, 'm, 'm, 'd, 'm
Cluster 3 titles: martial_god_asura, sovereign_of_the_three_realms

Cluster 4 words: 'm, 'm, 'i, 'd, 'm, 'm, 'a, 'm, 'd, 'm
Cluster 4 titles: desolate_era

Cluster 5 words: 'm, 'm, 'd, 'm, 'a, 'i, 'd, 'm, 'm, 'm
Cluster 5 titles: emperors_domination

In [99]:
order_centroids = sc.affinity_matrix_.argsort()[:, ::-1]

for i in range(num_clusters):
    print("Cluster %d words: " % i, end='')
    for j, ind in enumerate(order_centroids[i, :10]): #replace 6 with n words per cluster
        

array([[0, 1, 9, 2, 3, 5, 8, 6, 7, 4],
       [1, 0, 9, 8, 5, 2, 6, 3, 7, 4],
       [2, 7, 8, 3, 6, 0, 1, 5, 4, 9],
       [3, 8, 4, 2, 0, 7, 5, 6, 1, 9],
       [4, 8, 3, 2, 7, 6, 0, 5, 1, 9],
       [5, 9, 1, 8, 0, 3, 2, 6, 7, 4],
       [6, 7, 8, 2, 5, 1, 3, 0, 4, 9],
       [7, 6, 2, 8, 3, 5, 1, 4, 0, 9],
       [8, 4, 1, 3, 7, 2, 6, 5, 0, 9],
       [9, 5, 0, 1, 8, 2, 7, 6, 4, 3]], dtype=int64)

In [105]:
order_centroids[0, :10]

array([0, 1, 9, 2, 3, 5, 8, 6, 7, 4], dtype=int64)

In [106]:
terms[0]

"'a"

In [112]:
sc.affinity_matrix_

array([[1.        , 0.53173923, 0.14831525, 0.14583543, 0.13864201,
        0.14442935, 0.14075505, 0.13886153, 0.14254869, 0.16894208],
       [0.53173923, 1.        , 0.14461183, 0.14005377, 0.13739586,
        0.15046942, 0.14217691, 0.13986649, 0.15949888, 0.16752942],
       [0.14831525, 0.14461183, 1.        , 0.1534091 , 0.14080902,
        0.14244372, 0.14985849, 0.18817661, 0.15472162, 0.13948326],
       [0.14583543, 0.14005377, 0.1534091 , 1.        , 0.1559068 ,
        0.1424716 , 0.1418383 , 0.14263596, 0.15862083, 0.13665866],
       [0.13864201, 0.13739586, 0.14080902, 0.1559068 , 1.        ,
        0.13782362, 0.13879806, 0.13964768, 0.20998865, 0.13728369],
       [0.14442935, 0.15046942, 0.14244372, 0.1424716 , 0.13782362,
        1.        , 0.14224166, 0.14178897, 0.14487185, 0.4820613 ],
       [0.14075505, 0.14217691, 0.14985849, 0.1418383 , 0.13879806,
        0.14224166, 1.        , 0.25023642, 0.15119474, 0.13771769],
       [0.13886153, 0.13986649, 0.1881766

In [18]:
# lr = LogisticRegression()
# train = lr.fit(X_train_tfidf, y_train_tfidf)
# print('Training set score:', lr.score(X_train_tfidf, y_train_tfidf))
# print('Test set score:', lr.score(X_test_tfidf, y_test_tfidf))

In [19]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 7543 to 950.
svd= TruncatedSVD(850)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
sents_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(3):
    print('Component {}:'.format(i))
    print(sents_by_component.loc[:,i].sort_values(ascending=False)[0:10])

NameError: name 'X_train_tfidf' is not defined

In [None]:
sents_by_component.head()

In [None]:
X_train_svd, X_test_svd, y_train_svd, y_test_svd = train_test_split(sents_by_component, y_train, test_size=0.4, random_state=0)

In [None]:
# random forest with tf idf
rfc = ensemble.RandomForestClassifier()

train = rfc.fit(X_train_tfidf, y_train_tfidf)
print('Training set score:', rfc.score(X_train_tfidf, y_train_tfidf))
print('Test set score:', rfc.score(X_test_tfidf, y_test_tfidf))

In [None]:
# random forest with SVD
train = rfc.fit(X_train_svd, y_train_svd)
print('Training set score:', rfc.score(X_train_svd, y_train_svd))
print('Test set score:', rfc.score(X_test_svd, y_test_svd))