<a href="https://colab.research.google.com/github/YI-CHENG-SHIH645/ML/blob/master/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !wget "https://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz"
# !tar -xzvf "review_polarity.tar.gz" -C "."

In [None]:
!python -m nltk.downloader all

In [None]:
!wget "https://github.com/YI-CHENG-SHIH645/ML-in-Business_practice/raw/master/utils.py"

In [39]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.svm import SVC, LinearSVC
import nltk
from nltk.corpus import movie_reviews
import string
import re
import pandas as pd
import numpy as np
from scipy import sparse
from utils import display_side_by_side

In [5]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [40]:
corpus = [
    'good',
    'good',
    'good bad',
    'bad',
    'bad',
    'bad',
    '',
    'bad',
    '',
    'good'
]
vectorizer = CountVectorizer()
vectorizer.fit_transform(corpus).toarray()

array([[0, 1],
       [0, 1],
       [1, 1],
       [1, 0],
       [1, 0],
       [1, 0],
       [0, 0],
       [1, 0],
       [0, 0],
       [0, 1]])

In [41]:
p_words_given_pos = (0.667*0.5*0.375)
p_words_given_neg = (0.2*0.4*0.3125)
p_words_given_neut = (0.4*0.6*0.3125)
p_words = p_words_given_pos + p_words_given_neg + p_words_given_neut

# [[0, 1]] 機率
print(round(p_words_given_neg/p_words, 3))
print(round(p_words_given_neut/p_words, 3))
print(round(p_words_given_pos/p_words, 3))

0.111
0.333
0.556


In [42]:
X = vectorizer.fit_transform(corpus).toarray()
y = ['Pos', 'Pos', 'Pos', 'Pos', 'Neg', 'Neg', 'Neg', 'Neut', 'Neut', 'Neut']
y = LabelEncoder().fit_transform(y)
print(y)

[2 2 2 2 0 0 0 1 1 1]


In [50]:
# https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
model = BernoulliNB().fit(X, y)
model.predict_proba([[0, 1]])

array([[0.10465116, 0.31395349, 0.58139535]])

In [51]:
X = [movie_reviews.raw(fileid) for fileid in movie_reviews.fileids()]
y = [movie_reviews.categories(fileid)[0] for fileid in movie_reviews.fileids()]
X[0], y[0], len(X), len(y)

('plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\'t snag this one correctly . \nthey seem to have taken this pretty neat concept , but executed it terribly . \nso what are the problems with the movie ? \nwell , its main problem is that it\'s simply too jumbled . \nit starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience memb

In [52]:
nltk.corpus.stopwords.words('english')[:5]

['i', 'me', 'my', 'myself', 'we']

In [53]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [54]:
tknzr = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
tknzr.tokenize(s1)

[':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']

In [55]:
lemmatizer = nltk.WordNetLemmatizer()
stop_words = nltk.corpus.stopwords.words('english')

# tokenize, remove punct, lowercase, stop words, lemmatize
# 沒做的： spell checking, abbreviation, remove rare words(keep 20% ~ 80% freq words)
def tokenize(document):
    document = re.sub(r'\d+', '', document)
    tknzr = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
    for word in tknzr.tokenize(document):
        if word in stop_words or all(w in string.punctuation for w in word):
            continue
        word = lemmatizer.lemmatize(word)
        yield word

In [56]:
X_cleaned = [" ".join(list(tokenize(doc))) for doc in X]
y = LabelEncoder().fit_transform(y)

In [57]:
vectorizer = CountVectorizer()
X_wordcountvec = vectorizer.fit_transform(X_cleaned)
X_wordcountvec = sparse.csr_matrix(np.clip(X_wordcountvec.toarray(), 0, 1))

In [58]:
X_wordcountvec.shape

(2000, 35022)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X_wordcountvec, y, test_size=0.3,
                                                    stratify=y, random_state=3)

In [61]:
models = [BernoulliNB().fit(X_train, y_train),
          LogisticRegression().fit(X_train, y_train),
          LinearSVC().fit(X_train, y_train),
          SVC().fit(X_train, y_train)]

In [62]:
def metrics(model):
    cf = confusion_matrix(y_test, model.predict(X_test))
    tn, fp, fn, tp = cf.ravel()
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    acc = (tp + tn) / (tp + tn + fp + fn)
    return pd.DataFrame(cf, index=['negative', 'positive'], columns=['negative', 'positive'])\
                        .rename_axis('truth', axis=0).rename_axis('pred', axis=1), precision, recall, acc

In [63]:
res = [metrics(model) for model in models]
res = list(zip(*res))

In [64]:
display_side_by_side(res[0],
                     names=[m.__class__.__name__ for m in models],
                     descriptions=[f'acc: {acc:.3f} <br> precision: {precision:.3f} <br> recall: {recall:.3f}'
                      for precision, recall, acc in zip(res[1], res[2], res[3])])

pred,negative,positive,Unnamed: 3_level_0
truth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
pred,negative,positive,Unnamed: 3_level_2
truth,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3
pred,negative,positive,Unnamed: 3_level_4
truth,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5
pred,negative,positive,Unnamed: 3_level_6
truth,Unnamed: 1_level_7,Unnamed: 2_level_7,Unnamed: 3_level_7
negative,258,42,
positive,91,209,
negative,252,48,
positive,43,257,
negative,250,50,
positive,44,256,
negative,243,57,
positive,46,254,
BernoulliNB,LogisticRegression,LinearSVC,SVC
pred  negative  positive  truth  negative  258  42  positive  91  209,pred  negative  positive  truth  negative  252  48  positive  43  257,pred  negative  positive  truth  negative  250  50  positive  44  256,pred  negative  positive  truth  negative  243  57  positive  46  254

pred,negative,positive
truth,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,258,42
positive,91,209

pred,negative,positive
truth,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,252,48
positive,43,257

pred,negative,positive
truth,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,250,50
positive,44,256

pred,negative,positive
truth,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,243,57
positive,46,254
