In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import patsy
from sklearn import cross_validation
from sklearn import linear_model
from sklearn import metrics
from sklearn import tree
from sklearn import neighbors
from sklearn import svm
from sklearn import ensemble
from sklearn import cluster

从台词预测是出自悲剧/喜剧/历史剧.参考:
* [Building a Genre Classifier for Shakespeare Speeches](http://www.adampalay.com/notebooks/shakespeare-genre-classifier.html)
* [Text Mining the Complete Works of William Shakespeare](http://www.r-bloggers.com/text-mining-the-complete-works-of-william-shakespeare/)

In [3]:
# 从搜集到的 xml 转成了 dataframe, 输出了 csv 
lines_all = pd.read_csv('lines_all.csv')

In [4]:
lines_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100731 entries, 0 to 100730
Data columns (total 4 columns):
Unnamed: 0      100731 non-null int64
genre           100731 non-null object
plays_name      100731 non-null object
speech_lines    100731 non-null object
dtypes: int64(1), object(3)
memory usage: 3.8+ MB


In [9]:
lines_all.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [10]:
lines_all.head()

Unnamed: 0,genre,plays_name,speech_lines
0,comedy,All’s Well That Ends Well,"In delivering my son from me, I bury a second ..."
1,comedy,All’s Well That Ends Well,"And I in going, madam, weep o’er my father’s d..."
2,comedy,All’s Well That Ends Well,"You shall find of the King a husband, madam; y..."
3,comedy,All’s Well That Ends Well,What hope is there of his Majesty’s amendment?
4,comedy,All’s Well That Ends Well,"He hath abandon’d his physicians, madam, under..."


参考 `sklearn` 中如何处理文本数据
* http://scikit-learn.org/stable/modules/feature_extraction.html
* http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

另外需要处理文本数据的话, 可以用到 `nltk` 这个包, 参考
* [nltk 官网](http://www.nltk.org/)
* [Natural Language Processing with Python](http://www.nltk.org/book/)
* [demo](http://text-processing.com/demo/)

In [11]:
X_all = lines_all.speech_lines
Y_all = lines_all.genre

In [12]:
# split train, test
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X_all, Y_all, train_size=0.8)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [27]:
count_vec = CountVectorizer()
X_train_counts = count_vec.fit_transform(X_train)
X_train_counts.shape

(80584, 21229)

In [28]:
# 一个巨大的 matrix
count_vec.vocabulary_.get(u'love')

10969

In [29]:
# use tf-idf
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(80584, 21229)

In [30]:
# multinomial variant in naive bayes classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, Y_train)

In [31]:
# 对 test data 处理为 tfidf, 进行 predict
X_test_counts = count_vec.transform(X_test)
X_test_counts.shape

(20147, 21229)

In [32]:
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [33]:
Y_predicted = clf.predict(X_test_tfidf)

In [34]:
np.mean(Y_predicted == Y_test)

0.54052712562664418

In [35]:
# 准确率比较低, 换linear SVM, 被认为是 best text classification algorithm
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier().fit(X_train_tfidf, Y_train)

In [36]:
Y_predicted_svm = clf.predict(X_test_tfidf)

In [37]:
np.mean(Y_predicted_svm == Y_test)

0.52841614136099668

In [38]:
# 额 并没有提高, 反而略有降低, 或许与某些参数设置有关
clf.score(X_test_tfidf, Y_test)

0.52841614136099668