In [1]:
import numpy as np
import re
from nltk.corpus import stopwords
# English stop-word list from NLTK; tokenizer() drops every token found in it.
stop=stopwords.words('english')
def tokenizer(text):
    """Turn a raw review string into a list of lower-cased tokens.

    HTML tags are removed, emoticons such as ``:-)`` or ``=D`` are collected
    (with the optional ``-`` nose dropped) and appended after the cleaned
    text, and every token found in the module-level ``stop`` collection is
    discarded.

    Parameters
    ----------
    text : str
        Raw document text, possibly containing HTML markup.

    Returns
    -------
    list of str
        The remaining tokens in order of appearance, emoticons at the end.
    """
    # Raw strings keep the regex escapes (\W, \), \() from being parsed as
    # invalid Python string escapes, which newer interpreters warn about.
    text=re.sub(r'<[^>]*>','',text)
    emoticons=re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    # NOTE(review): no separator is inserted before the emoticons, so when the
    # cleaned text ends in a word character the first emoticon is glued to the
    # last word. Kept as-is so the hashed features stay unchanged.
    text=re.sub(r'[\W]+',' ',text.lower())+' '.join(emoticons).replace('-','')
    # Build the lookup set once per call instead of scanning `stop` per token.
    stop_words=frozenset(stop)
    return [w for w in text.split() if w not in stop_words]

def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV file.

    The first line is skipped as a header. Every other non-blank line is
    expected to end with ``,<digit>``: the final character is parsed as the
    integer label and everything before the separating comma is returned as
    the text, unparsed (surrounding quotes are kept).

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        The raw text field and its single-digit label.

    Raises
    ------
    ValueError
        If the last character of a line is not a digit.
    """
    with open(path,'r',encoding='utf-8') as handle:
        next(handle,None)  # skip the header; tolerate an empty file
        for line in handle:
            # Strip the line terminator first so a final line without a
            # trailing newline is sliced the same way as every other line.
            line=line.rstrip('\r\n')
            if not line:
                continue
            yield line[:-2],int(line[-1])

In [2]:
# Peek at the first (text, label) pair to check how the file is being parsed.
next(stream_docs(path='movie_data.csv'))

('"In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />""Murder in Greenwich"" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich f

In [7]:
def get_minibatch(doc_stream,size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator of (text, label)
        Iterator such as the one returned by ``stream_docs``.
    size : int
        Maximum number of documents to collect.

    Returns
    -------
    tuple
        ``(docs, y)`` -- parallel lists of texts and labels. When the stream
        runs dry part-way through, the documents gathered so far are returned
        instead of being thrown away; ``(None, None)`` is returned only when
        the stream was already exhausted.
    """
    docs,y=[],[]
    try:
        for _ in range(size):
            text,label=next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # Keep a partial final batch; signal exhaustion only when nothing
        # at all could be read.
        if not docs:
            return None,None
    return docs,y

In [8]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

In [16]:
# Hashing needs no fitted vocabulary, so documents can be vectorised batch by batch.
vect=HashingVectorizer(decode_error='ignore',n_features=2**21,preprocessor=None,tokenizer=tokenizer)
# loss='log_loss' is the current name of the logistic-regression loss (formerly 'log')
# and max_iter replaces the removed n_iter argument; requires scikit-learn >= 1.1.
clf=SGDClassifier(loss='log_loss',random_state=1,max_iter=1)
# Generator over the (text, label) pairs of the review file.
doc_stream=stream_docs(path='movie_data.csv')

In [17]:
import pyprind
pbar=pyprind.ProgBar(45)  # one progress tick per mini-batch
classes=np.array([0,1])   # the full label set is handed to every partial_fit call
for _ in range(45):
    # Request the next mini-batch of 1000 documents from the shared stream.
    X_train,y_train=get_minibatch(doc_stream,size=1000)
    if not X_train:
        # get_minibatch returned no documents: stop early
        break
    X_train=vect.transform(X_train)
    clf.partial_fit(X_train,y_train,classes=classes)
    pbar.update()



0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:24


In [18]:
# Take the next 4999 documents from the stream and score the classifier on them.
X_test,y_test=get_minibatch(doc_stream,size=4999)
X_test=vect.transform(X_test)
print('Accuracy:%.3f' % clf.score(X_test,y_test))

Accuracy:0.867


In [19]:
# Update the classifier with the evaluation batch as well.
clf=clf.partial_fit(X_test,y_test)



In [30]:
# NOTE(review): X_test was passed to clf.partial_fit in an earlier cell, so this
# score is measured on data the model has already been trained on.
clf.score(X_test,y_test)

0.8809761952390478

In [22]:
# Topic modelling (LDA) with scikit-learn
import pandas as pd
# Read the review table and keep only the rows that have a review text.
df=pd.read_csv('movie_data.csv',encoding='utf-8').dropna(subset=['review'])

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts: English stop words removed, terms occurring in more than
# 10% of the documents ignored, vocabulary capped at 5000 terms.
count=CountVectorizer(stop_words='english',max_df=0.1,max_features=5000)
X=count.fit_transform(df['review'].values)

In [24]:
from sklearn.decomposition import LatentDirichletAllocation
# n_components is the current name of the topic-count argument (formerly n_topics).
lda=LatentDirichletAllocation(n_components=10,random_state=123,learning_method='batch')
# Fit the model on the count matrix X and keep its transformed representation.
X_topics=lda.fit_transform(X)



In [25]:
# Shape of the fitted topic-term matrix.
lda.components_.shape

(10, 5000)

In [28]:
n_top_words=5
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement (scikit-learn >= 1.0).
feature_names=count.get_feature_names_out()
for topic_idx,topic in enumerate(lda.components_):
    print('Topic %d:' % (topic_idx+1))
    # argsort is ascending, so the reversed tail slice gives the indices of
    # the n_top_words largest weights, largest first.
    print(' '.join([feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]))

Topic 1:
worst minutes script awful stupid
Topic 2:
family mother father children girl
Topic 3:
american war dvd music tv
Topic 4:
human audience cinema art sense
Topic 5:
police guy car dead murder
Topic 6:
horror house sex blood girl
Topic 7:
role performance comedy actor performances
Topic 8:
series episode war episodes tv
Topic 9:
book version original read novel
Topic 10:
action fight guy guys cool


In [31]:
import pickle
import os

In [43]:
# Target directory for the serialised objects: movieclassifier/pkl_objects
dest = os.path.join('movieclassifier','pkl_objects')
# Create the directory (and its parent) if missing; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(dest,exist_ok=True)

In [44]:
# Serialise the stop-word list in binary mode with pickle protocol 4; the
# with-block guarantees the file is flushed and closed.
with open(os.path.join(dest,'stopwords.pkl'),'wb') as f:
    pickle.dump(stop,f,protocol=4)

In [45]:
# Persist the trained classifier with pickle protocol 4; the with-block
# guarantees the file is flushed and closed.
with open(os.path.join(dest,'classifier.pkl'),'wb') as f:
    pickle.dump(clf,f,protocol=4)

In [62]:
%%writefile movieclassifier/vectorizer.py
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle

cur_dir=os.path.dirname(__file__)
# NOTE: pickle.load can execute arbitrary code; only load a stopwords.pkl that
# you produced yourself. The with-block closes the file handle after loading.
with open(os.path.join(cur_dir,'pkl_objects','stopwords.pkl'),'rb') as f:
    stop=pickle.load(f)
def tokenizer(text):
    """Turn a raw review string into a list of lower-cased tokens.

    HTML tags are removed, emoticons such as ``:-)`` or ``=D`` are collected
    (with the optional ``-`` nose dropped) and appended after the cleaned
    text, and every token found in the module-level ``stop`` collection is
    discarded.

    Parameters
    ----------
    text : str
        Raw document text, possibly containing HTML markup.

    Returns
    -------
    list of str
        The remaining tokens in order of appearance, emoticons at the end.
    """
    # Raw strings keep the regex escapes (\W, \), \() from being parsed as
    # invalid Python string escapes, which newer interpreters warn about.
    text=re.sub(r'<[^>]*>','',text)
    emoticons=re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    # NOTE(review): no separator is inserted before the emoticons, so when the
    # cleaned text ends in a word character the first emoticon is glued to the
    # last word. Kept as-is so the hashed features stay unchanged.
    text=re.sub(r'[\W]+',' ',text.lower())+' '.join(emoticons).replace('-','')
    # Build the lookup set once per call instead of scanning `stop` per token.
    stop_words=frozenset(stop)
    return [w for w in text.split() if w not in stop_words]

# Same HashingVectorizer arguments as the vectorizer built in the notebook before training.
vect=HashingVectorizer(decode_error='ignore',n_features=2**21,preprocessor=None,tokenizer=tokenizer)

Writing movieclassifier/vectorizer.py


In [49]:
# str.join glues the strings of an iterable together:  'sep'.join(seq)
seq1=['hello','good','boy','doiido']
for sep in (' ',':'):
    print(sep.join(seq1))

hello good boy doiido
hello:good:boy:doiido


In [54]:
# Joining a plain string treats every character as a separate item.
seq2='hello good boy doiido'
print(' '.join(seq2),':'.join(seq2),sep='\n')

h e l l o   g o o d   b o y   d o i i d o
h:e:l:l:o: :g:o:o:d: :b:o:y: :d:o:i:i:d:o


In [55]:
# Tuples of strings are joined exactly like lists.
seq3=('hello','good','boy','boiido')
for sep in (' ',':'):
    print(sep.join(seq3))

hello good boy boiido
hello:good:boy:boiido


In [56]:
# Iterating a dict yields its keys, so join concatenates the keys.
seq4={'hello':1,'good':2,'boy':3,'doiido':4}
print(' '.join(seq4),':'.join(seq4),sep='\n')

hello good boy doiido
hello:good:boy:doiido


In [57]:
# os.path.join(): combine several path components into a single path;
# components that precede an absolute component are discarded.
import os
os.path.join('/hello/','good/boy','doiido')

'/hello/good/boy\\doiido'

In [59]:
# Print the 9x9 multiplication table, one output line per right-hand factor.
rows=[]
for x in range(1,10):
    cells=['%s*%s=%-2s' % (y,x,x*y) for y in range(1,x+1)]
    rows.append(' '.join(cells))
print('\n'.join(rows))

1*1=1 
1*2=2  2*2=4 
1*3=3  2*3=6  3*3=9 
1*4=4  2*4=8  3*4=12 4*4=16
1*5=5  2*5=10 3*5=15 4*5=20 5*5=25
1*6=6  2*6=12 3*6=18 4*6=24 5*6=30 6*6=36
1*7=7  2*7=14 3*7=21 4*7=28 5*7=35 6*7=42 7*7=49
1*8=8  2*8=16 3*8=24 4*8=32 5*8=40 6*8=48 7*8=56 8*8=64
1*9=9  2*9=18 3*9=27 4*9=36 5*9=45 6*9=54 7*9=63 8*9=72 9*9=81
