In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from nltk.corpus import stopwords
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [84]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    return [re.sub(r'--', '', word) for word in text]

def raw_text_cleaner(text):
    text = re.sub(r'──────', ' ', text)
    text = re.sub(r'--',' ',text)
    text = re.sub("\[", "", text)
    text = re.sub("\]", "", text)
    text = re.sub("\\\\\\\\", "", text)
    text = re.sub("\\\\", "", text)
 
    # Get rid of extra whitespace.
    text = ' '.join(text.split())
    
    return text

def load_spacy(list_of_docs, max_length=2000000):
    
    # load spacy
    print('Running Spacy...')
    nlp = spacy.load('en')
    nlp.max_length = max_length
    
    # set empty list; holds processed list of docs
    nlp_docs = []
    
    for i, doc in enumerate(list_of_docs):
        print('Processing file {}'.format(i+1))
        doc = raw_text_cleaner(doc)
        nlp_docs.append(nlp(doc))
    print('Done processing')
    return nlp_docs

# put the tokens of a doc in a list
def convert_to_tokens(doc):
  token_list = []
  for token in doc:
    if not token.is_stop and not token.is_punct:
      token_list.append(token.lemma_.lower())
  return token_list

In [3]:
# loading files
filenames = ['against_the_gods', 'battle_through_the_heavens', 'desolate_era', 'emperors_domination', 'martial_god_asura', 'martial_world', 'overgeared', 'praise_the_orc', 'sovereign_of_the_three_realms', 'wu_dong_qian_kun']
raw_files = []

for filename in filenames:
    with open('../dataset/' + filename + '.txt', encoding='utf-8') as myfile:
        raw_files.append(myfile.read())

In [4]:
# check length of files
for file in raw_files:
    print(len(file))

183613
135767
198176
137377
132773
201182
167151
170911
196830
155296


In [59]:
# run spacy
processed_files = load_spacy(raw_files)

Running Spacy...
Processing file 1
Processing file 2
Processing file 3
Processing file 4
Processing file 5
Processing file 6
Processing file 7
Processing file 8
Processing file 9
Processing file 10
Done processing


In [60]:
# get sentences and compile
all_sentences = []
all_sentences_label = []
for i, file in enumerate(processed_files):
    for sentence in file.sents:
        all_sentences.append(str(sentence))
        all_sentences_label.append(filenames[i])

In [164]:
print('There are', len(all_sentences), 'sentences')

There are 24287 sentences


In [161]:
vectorizer = TfidfVectorizer(max_df=0.75, # drop words that occur in more than 3/4 the sentence
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=False, #don't convert everything to lower case
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

# Apply the vectorizer
vec_all_sents = vectorizer.fit_transform(all_sentences)

In [128]:
print("Number of features: %d" % vec_all_sents.get_shape()[1])

Number of features: 7543


In [162]:
# splitting into training and test sets
# keeps sentence structure
X_train, X_test, y_train, y_test = train_test_split(all_sentences, all_sentences_label, test_size=0.4, random_state=0)
# scores of tfidf
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(vec_all_sents, all_sentences_label, test_size=0.4, random_state=0)

# force output into compressed sparse row if it isn't already; readable format
X_train_tfidf_csr = X_train_tfidf.tocsr()

In [148]:
n = X_train_tfidf_csr.shape[0]
terms = vectorizer.get_feature_names()

# create empty list of dictionary, per sentence
sents_tfidf = [{} for _ in range(0,n)]

# for each sentence, list feature words and tf-idf score
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    sents_tfidf[i][terms[j]] = X_train_tfidf_csr[i, j]

OG sentence: If you selected an orc, can you really endure it?
tf-idf vec: {'orc': 0.38768266100540216, 'selected': 0.615361908167517, 'endure': 0.5663338509112565, 'really': 0.38768266100540216}


In [165]:
# log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in this corpus.
print('OG sentence:', X_train[3])
print('tf-idf vec:', sents_tfidf[3])

OG sentence: If you selected an orc, can you really endure it?
tf-idf vec: {'orc': 0.38768266100540216, 'selected': 0.615361908167517, 'endure': 0.5663338509112565, 'really': 0.38768266100540216}


In [118]:
X_train[8][:102]

"The tiger's body flying through the air was immensely large, causing even Ian to flinch for a moment. "

In [93]:
X_train[8][:]

'The tiger\'s body flying through the air was immensely large, causing even Ian to flinch for a moment. "'

In [119]:
# lr = LogisticRegression()
# train = lr.fit(X_train_tfidf, y_train_tfidf)
# print('Training set score:', lr.score(X_train_tfidf, y_train_tfidf))
# print('Test set score:', lr.score(X_test_tfidf, y_test_tfidf))

Training set score: 0.8110214686175304
Test set score: 0.6886473182237719


In [182]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 7543 to 950.
svd= TruncatedSVD(850)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
sents_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(3):
    print('Component {}:'.format(i))
    print(sents_by_component.loc[:,i].sort_values(ascending=False)[0:10])

Percent variance captured by all components: 55.60112867653755
Component 0:
Lin Xiao remarked. "                                                                                                0.561253
Lin Xiao said.                                                                                                      0.502035
This was Lin Dong's father, Lin Xiao.                                                                               0.483478
This was because Lin Xiao was injured once again...                                                                 0.468415
This was because Lin Xiao was injured once again...                                                                 0.468415
Lin Xiao extended his palm and signalled to Lin Dong:                                                               0.437992
Looking at Lin Dong with doting eyes, she understood that Lin Xiao had invested everything he had into Lin Dong.    0.429803
Lin Xiao looked Lin Dong in the eye and asked him

In [193]:
sents_by_component.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,840,841,842,843,844,845,846,847,848,849
Jiang Chen held many profound and flowery conversations with the visitors before finally agreeing to treat the princess in three days.,0.055208,0.007985,0.002144,0.018281,0.024521,0.042389,0.025492,0.02537,-0.037478,0.013844,...,-0.016571,0.029853,-0.007111,0.009331,-0.014426,-0.002391,0.016366,-0.01661,0.001388,0.002426
"Another disciple of the sect scornfully laughed: ""Heh, the Cleansing Incense Ancient Sect is only a second-rate establishment.",0.041384,-0.007311,0.00253,0.029153,0.002118,0.031205,0.000226,0.000844,0.000365,0.00775,...,0.016054,-0.00188,0.006004,-0.01076,0.016136,-0.012841,0.000258,0.006339,0.008038,-0.006582
It was a defense type magic that could be acquired with a black magician reached level 230.,0.07713,0.00655,0.014536,0.153935,0.206566,-0.087973,-0.01619,-0.023685,-0.00481,-0.003911,...,0.017051,0.0162,-0.020635,0.008843,-0.004224,0.011871,-0.037168,0.033563,0.021943,-0.030618
"If you selected an orc, can you really endure it?",0.07368,0.008999,0.00627,0.048429,0.046273,0.068411,0.027307,0.010658,-0.025217,0.025685,...,-0.034595,-0.042489,-0.005446,0.008788,0.019061,-0.025807,0.029852,0.02535,0.002656,-0.017841
"At the time, I thought that it was just a baseless rumour, but now I realize that it's actually true.",0.103623,0.016841,0.005413,0.030889,0.037032,0.024744,0.043649,0.022207,-0.013411,0.043325,...,0.006308,0.020512,-0.015124,0.009032,0.009049,-0.001825,-0.00582,-0.008339,0.01501,0.008272


In [174]:
X_train_svd, X_test_svd, y_train_svd, y_test_svd = train_test_split(sents_by_component, y_train, test_size=0.4, random_state=0)

In [176]:
# random forest with tf idf
rfc = ensemble.RandomForestClassifier()

train = rfc.fit(X_train_tfidf, y_train_tfidf)
print('Training set score:', rfc.score(X_train_tfidf, y_train_tfidf))
print('Test set score:', rfc.score(X_test_tfidf, y_test_tfidf))

Training set score: 0.9626681306615427
Test set score: 0.6429233144621719


In [177]:
# random forest with SVD
train = rfc.fit(X_train_svd, y_train_svd)
print('Training set score:', rfc.score(X_train_svd, y_train_svd))
print('Test set score:', rfc.score(X_test_svd, y_test_svd))

Training set score: 0.9687750200160128
Test set score: 0.4570252187339166
