In [2]:
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import patsy
from sklearn import cross_validation
from sklearn import linear_model
from sklearn import metrics
from sklearn import tree
from sklearn import neighbors
from sklearn import svm
from sklearn import ensemble
from sklearn import cluster

根据台词预测其出自悲剧、喜剧还是历史剧。参考:
* [Building a Genre Classifier for Shakespeare Speeches](http://www.adampalay.com/notebooks/shakespeare-genre-classifier.html)
* [Text Mining the Complete Works of William Shakespeare](http://www.r-bloggers.com/text-mining-the-complete-works-of-william-shakespeare/)

In [4]:
# The collected XML was converted to a DataFrame beforehand and exported as CSV; load that CSV here.
lines_all = pd.read_csv('lines_all.csv')

In [5]:
# Summarize the loaded frame: columns, non-null counts and dtypes.
lines_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100731 entries, 0 to 100730
Data columns (total 4 columns):
Unnamed: 0      100731 non-null int64
genre           100731 non-null object
plays_name      100731 non-null object
speech_lines    100731 non-null object
dtypes: int64(1), object(3)
memory usage: 3.8+ MB


In [6]:
# Drop the index column that was written into the CSV.
# Re-assigning (instead of inplace=True) and ignoring a missing column keeps
# this cell idempotent: re-running it no longer fails once the column is gone.
lines_all = lines_all.drop(['Unnamed: 0'], axis=1, errors='ignore')

In [7]:
# Preview the first rows of the frame.
lines_all.head()

Unnamed: 0,genre,plays_name,speech_lines
0,comedy,All’s Well That Ends Well,"In delivering my son from me, I bury a second ..."
1,comedy,All’s Well That Ends Well,"And I in going, madam, weep o’er my father’s d..."
2,comedy,All’s Well That Ends Well,"You shall find of the King a husband, madam; y..."
3,comedy,All’s Well That Ends Well,What hope is there of his Majesty’s amendment?
4,comedy,All’s Well That Ends Well,"He hath abandon’d his physicians, madam, under..."


参考 `sklearn` 中如何处理文本数据
* http://scikit-learn.org/stable/modules/feature_extraction.html
* http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

另外, 处理文本数据时还可以使用 `nltk` 这个包, 参考:
* [nltk 官网](http://www.nltk.org/)
* [Natural Language Processing with Python](http://www.nltk.org/book/)
* [demo](http://text-processing.com/demo/)

In [8]:
# Features are the raw speech texts; the target is the genre label of the play.
X_all, Y_all = lines_all['speech_lines'], lines_all['genre']

In [9]:
# Split into train (80%) and test sets.
# The seed is fixed so that the split -- and every score computed from it --
# is reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    X_all, Y_all, train_size=0.8, random_state=42)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [11]:
# Fit the vectorizer on the training speeches only and build their token-count matrix.
count_vec = CountVectorizer()
X_train_counts = count_vec.fit_transform(X_train)
X_train_counts.shape

(80584, 21390)

In [12]:
# The count matrix is huge; look up the entry stored for the token 'love' in the fitted vocabulary.
count_vec.vocabulary_.get(u'love')

11079

In [13]:
# Re-weight the raw training counts with tf-idf; the transformer is fitted on the training counts.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(80584, 21390)

In [14]:
# Multinomial variant of the naive Bayes classifier, trained on the tf-idf training features.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, Y_train)

In [15]:
# Convert the test data to tf-idf before predicting. First step: token counts from the
# vectorizer that was fitted on the training set (transform only, no re-fit).
X_test_counts = count_vec.transform(X_test)
X_test_counts.shape

(20147, 21390)

In [16]:
# Second step: apply the already-fitted tf-idf transformer to the test counts.
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [17]:
# Predict the genre of every test speech with the naive Bayes model.
Y_predicted = clf.predict(X_test_tfidf)

In [18]:
# Accuracy: fraction of test speeches whose predicted genre equals the true genre.
metrics.accuracy_score(Y_test, Y_predicted)

0.53605995929915129

In [19]:
# Accuracy is rather low; switch to a linear SVM trained with SGD, which is
# often regarded as the best text classification algorithm.
# random_state is fixed so the stochastic fit is reproducible across runs.
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(random_state=42).fit(X_train_tfidf, Y_train)

In [20]:
# Predict the genre of every test speech with the SGD-trained classifier.
Y_predicted_svm = clf.predict(X_test_tfidf)

In [21]:
# Accuracy of the SGD-trained classifier on the test set.
metrics.accuracy_score(Y_test, Y_predicted_svm)

0.53387601131682139

In [22]:
# Hmm, that did not improve the accuracy.
import nltk
from nltk.stem import PorterStemmer
import string

In [23]:
# Use stemming
stemmer = PorterStemmer()
def tokenize_and_stem(text):
    """Tokenize ``text`` into lowercase, punctuation-stripped, stemmed tokens.

    Meant to be passed as the ``tokenizer`` callable of a ``CountVectorizer``.

    Parameters
    ----------
    text : str
        The raw text of one speech.

    Returns
    -------
    list of str
        Stemmed tokens. Tokens that are empty or still contain
        non-alphanumeric characters after stripping are dropped.
    """
    stemmed = []
    for token in nltk.tokenize.word_tokenize(text):
        # Strip surrounding punctuation BEFORE the alphanumeric filter.
        # Filtering first made the strip a no-op and discarded every token
        # that merely carried leading/trailing punctuation.
        token = token.lower().strip(string.punctuation)
        # ''.isalnum() is False, so tokens that were pure punctuation vanish here.
        if token.isalnum():
            stemmed.append(stemmer.stem(token))
    return stemmed

In [24]:
# Second vectorizer that delegates tokenization to the stemming tokenizer defined above.
count_vec2 = CountVectorizer(tokenizer = tokenize_and_stem)

In [1]:
# The nltk data has to be downloaded first.
# NOTE(review): the NameError recorded as this cell's output comes from running it
# before the cell that defines count_vec2 (execution count 1) -- re-run the cells in order.
X_train_tokenized = count_vec2.fit_transform(X_train)

# Train naive Bayes directly on the stemmed token counts (no tf-idf step here).
clf = MultinomialNB()
clf.fit(X_train_tokenized, Y_train)

NameError: name 'count_vec2' is not defined

In [35]:
# Grid search to determine the hyper-parameters.
# sklearn.grid_search was removed in scikit-learn 0.20; prefer model_selection
# and fall back to the legacy module on old installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Keys use the pipeline's "<step name>__<parameter>" convention.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-1, 1e-3),
}

In [26]:
from sklearn.pipeline import Pipeline

# Chain vectorizer -> tf-idf -> SGD classifier (hinge loss, L2 penalty) so that the
# grid search can tune parameters of all three steps through their step names.
# NOTE(review): `n_iter` was renamed to `max_iter` in later scikit-learn releases --
# confirm against the installed version.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, n_iter=5, random_state=42)),])

In [36]:
# Grid search over the whole pipeline; n_jobs=-1 asks for all available cores.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [37]:
# Run the search on the first 10000 training samples only (positional slice).
gs_clf = gs_clf.fit(X_train.iloc[:10000], Y_train.iloc[:10000])

In [38]:
# Read the winning configuration from best_params_/best_score_ instead of scanning
# grid_scores_: that attribute was removed in scikit-learn 0.20, while these two
# exist in old and new versions. This also avoids rebinding IPython's `_`.
best_parameters, score = gs_clf.best_params_, gs_clf.best_score_

In [39]:
# Report the selected value of every tuned parameter, in alphabetical order.
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_parameters[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 2)


In [40]:
# Predict on the raw test texts; the fitted search object wraps the full pipeline,
# so no manual vectorization is done here.
Y_predicted_gs = gs_clf.predict(X_test)

In [41]:
# Accuracy of the grid-searched pipeline on the test set.
metrics.accuracy_score(Y_test, Y_predicted_gs)

0.46289770189110041