# A. Category prediction with Multinomial Naive Bayesian

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#Category map : "key(name of dataset):values"
category_map = {"talk.politics.misc":"Politices",
                "rec.autos":"Autos",
                "rec.sport.hockey":"Hockey",
                "sci.electronics":"Electronics",
                "sci.med":"Medicine"}

In [3]:
#load training data
training_data = fetch_20newsgroups(subset="train", categories=category_map.keys(), shuffle=True, random_state=5)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
#create count vectorizer object
count_vectorizer = CountVectorizer()
train_tc = count_vectorizer.fit_transform(training_data.data)
print("\nDimensions of training data:", train_tc.shape)


Dimensions of training data: (2844, 40321)


In [5]:
#creat tfidf vectoriczer object
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_tc)

In [6]:
#test data
input_data = ['You need to be careful with cars when you are driving on slippery roads', 
              'A lot of devices can be operated wirelessly',
              'Players need to be careful when they are close to goal posts',
              'Political debates help us understand the perspectives of both sides']

In [7]:
#MultinomialNB classifier
classifier = MultinomialNB().fit(train_tfidf, training_data.target)
input_tc = count_vectorizer.transform(input_data) #preprocessing test data to frequency table
input_tfidf = tfidf.transform(input_tc) #preprocessing test data to tfidf matrix
predictions = classifier.predict(input_tc)

In [30]:
input_tc

<4x40321 sparse matrix of type '<class 'numpy.int64'>'
	with 40 stored elements in Compressed Sparse Row format>

In [31]:
input_tfidf

<4x40321 sparse matrix of type '<class 'numpy.float64'>'
	with 40 stored elements in Compressed Sparse Row format>

In [33]:
input_data

['You need to be careful with cars when you are driving on slippery roads',
 'A lot of devices can be operated wirelessly',
 'Players need to be careful when they are close to goal posts',
 'Political debates help us understand the perspectives of both sides']

In [34]:
predictions

array([0, 2, 1, 4])

In [32]:
training_data.target_names

['rec.autos',
 'rec.sport.hockey',
 'sci.electronics',
 'sci.med',
 'talk.politics.misc']

In [8]:
#result
for sent, category in zip(input_data, predictions) : 
    print("\nInput:",sent,"\nPredicted category:",category_map[training_data.target_names[category]])


Input: You need to be careful with cars when you are driving on slippery roads 
Predicted category: Autos

Input: A lot of devices can be operated wirelessly 
Predicted category: Electronics

Input: Players need to be careful when they are close to goal posts 
Predicted category: Hockey

Input: Political debates help us understand the perspectives of both sides 
Predicted category: Politices


<br><hr><br>

# B. Gender classifier with Multinomial Naive Bayesian

In [9]:
import random
from nltk import NaiveBayesClassifier #이번엔 다른 모듈을 사용한다
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names

In [10]:
def extract_features(word, N=2) : #N의 default는 2, 입력값에 따라 다르게
    last_n_letters = word[-N:]
    return {"feature":last_n_letters.lower()}

In [11]:
male_list = [(name,"male") for name in names.words("male.txt")]
female_list = [(name,"female") for name in names.words("female.txt")]
data = (male_list + female_list)

In [36]:
data

[('Dasi', 'female'),
 ('Marius', 'male'),
 ('Marlene', 'female'),
 ('Margaret', 'female'),
 ('Nickie', 'female'),
 ('Lovell', 'male'),
 ('Jermaine', 'male'),
 ('Sonja', 'female'),
 ('Mayer', 'male'),
 ('Ed', 'male'),
 ('Daffy', 'female'),
 ('Tamra', 'female'),
 ('Benson', 'male'),
 ('Bertrand', 'male'),
 ('Luelle', 'female'),
 ('Gigi', 'female'),
 ('Nilson', 'male'),
 ('Golda', 'female'),
 ('Donica', 'female'),
 ('Juliana', 'female'),
 ('Laurens', 'male'),
 ('Ange', 'female'),
 ('Moises', 'male'),
 ('Melisa', 'female'),
 ('Emmie', 'female'),
 ('Nitin', 'female'),
 ('Rivkah', 'female'),
 ('Tyson', 'male'),
 ('Fianna', 'female'),
 ('Gustave', 'male'),
 ('Byron', 'male'),
 ('Harvard', 'male'),
 ('Lyssa', 'female'),
 ('Bunni', 'female'),
 ('Guglielma', 'female'),
 ('Tami', 'female'),
 ('Genevieve', 'female'),
 ('Devan', 'female'),
 ('Kiele', 'female'),
 ('Godfrey', 'male'),
 ('Cathryn', 'female'),
 ('Eunice', 'female'),
 ('Maurise', 'female'),
 ('Estele', 'female'),
 ('Phillip', 'male'),
 

In [12]:
random.seed(5)
random.shuffle(data)

In [13]:
#test data
input_names = ["Alexander", "Danielle", "David", "Cheryl"]

In [15]:
#split data into training * validation set to find N
num_train = int(0.8*len(data))

#finding N 
for i in range(1,6) : 
    print("\nNumber of end letters:",i)
    features = [(extract_features(n, i), gender) for (n, gender) in data] #n: 이름, gender: 해당 이름의 성별
    
    train_data, valid_data = features[:num_train], features[num_train:]
    classifier = NaiveBayesClassifier.train(train_data)

    #accuracy calculated with validation dataset
    accuracy = round(100 * nltk_accuracy(classifier, valid_data), 2)
    print("Accuracy = " + str(accuracy) + "%")

    #check result on test data
    for name in input_names : 
        print(name,"==>", classifier.classify(extract_features(name,i)))


Number of end letters: 1
Accuracy = 74.7%
Alexander ==> male
Danielle ==> female
David ==> male
Cheryl ==> male

Number of end letters: 2
Accuracy = 78.79%
Alexander ==> male
Danielle ==> female
David ==> male
Cheryl ==> female

Number of end letters: 3
Accuracy = 77.22%
Alexander ==> male
Danielle ==> female
David ==> male
Cheryl ==> female

Number of end letters: 4
Accuracy = 69.98%
Alexander ==> male
Danielle ==> female
David ==> male
Cheryl ==> female

Number of end letters: 5
Accuracy = 64.63%
Alexander ==> male
Danielle ==> female
David ==> male
Cheryl ==> female


<br><hr><br>

# Sentimental Analysis with Multinomial Naive Bayesian

In [16]:
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy

In [17]:
def extract_features(words) : 
    return dict([(word, True) for word in words])

In [39]:
movie_reviews

<bound method CategorizedCorpusReader.fileids of <CategorizedPlaintextCorpusReader in '/home/ubuntu/nltk_data/corpora/movie_reviews'>>

In [19]:
fileids_pos = movie_reviews.fileids("pos")
fileids_neg = movie_reviews.fileids("neg")

In [38]:
fileids_pos

['pos/cv000_29590.txt',
 'pos/cv001_18431.txt',
 'pos/cv002_15918.txt',
 'pos/cv003_11664.txt',
 'pos/cv004_11636.txt',
 'pos/cv005_29443.txt',
 'pos/cv006_15448.txt',
 'pos/cv007_4968.txt',
 'pos/cv008_29435.txt',
 'pos/cv009_29592.txt',
 'pos/cv010_29198.txt',
 'pos/cv011_12166.txt',
 'pos/cv012_29576.txt',
 'pos/cv013_10159.txt',
 'pos/cv014_13924.txt',
 'pos/cv015_29439.txt',
 'pos/cv016_4659.txt',
 'pos/cv017_22464.txt',
 'pos/cv018_20137.txt',
 'pos/cv019_14482.txt',
 'pos/cv020_8825.txt',
 'pos/cv021_15838.txt',
 'pos/cv022_12864.txt',
 'pos/cv023_12672.txt',
 'pos/cv024_6778.txt',
 'pos/cv025_3108.txt',
 'pos/cv026_29325.txt',
 'pos/cv027_25219.txt',
 'pos/cv028_26746.txt',
 'pos/cv029_18643.txt',
 'pos/cv030_21593.txt',
 'pos/cv031_18452.txt',
 'pos/cv032_22550.txt',
 'pos/cv033_24444.txt',
 'pos/cv034_29647.txt',
 'pos/cv035_3954.txt',
 'pos/cv036_16831.txt',
 'pos/cv037_18510.txt',
 'pos/cv038_9749.txt',
 'pos/cv039_6170.txt',
 'pos/cv040_8276.txt',
 'pos/cv041_21113.txt',
 

In [20]:
#extract features from reviews
features_pos = [(extract_features(movie_reviews.words(fileids=[f])), "Positive") for f in fileids_pos]
features_neg = [(extract_features(movie_reviews.words(fileids=[f])), "Negative") for f in fileids_neg]

In [41]:
print(movie_reviews.words(fileids=["pos/cv010_29198.txt"]))
print(movie_reviews.words(fileids=["pos/cv036_16831.txt"]))
print(movie_reviews.words(fileids=["pos/cv042_10982.txt"]))

['after', 'watching', '"', 'rat', 'race', '"', 'last', ...]
['dora', '(', 'fernanda', 'montenegro', ')', 'sits', ...]
['will', 'hunting', '(', 'matt', 'damon', ')', 'is', ...]


In [21]:
#split into training & test dataset
threshold = 0.8
num_pos = int(threshold * len(features_pos))
num_neg = int(threshold * len(features_neg))

features_train = features_pos[:num_pos] + features_neg[:num_neg]
features_test = features_pos[num_pos:] + features_neg[num_neg:]

In [22]:
print("\nNumber of training datapoints:", len(features_train))
print("\nNumber of test datapoints:", len(features_test))


Number of training datapoints: 1600

Number of test datapoints: 400


In [24]:
#Multinomial Naive Bayes classifier
classifier = NaiveBayesClassifier.train(features_train)
print("\nAccuracy of the classifier:", nltk_accuracy(classifier, features_test))


Accuracy of the classifier: 0.735


In [42]:
classifier.most_informative_features()

[('outstanding', True),
 ('insulting', True),
 ('vulnerable', True),
 ('ludicrous', True),
 ('uninvolving', True),
 ('avoids', True),
 ('astounding', True),
 ('fascination', True),
 ('anna', True),
 ('seagal', True),
 ('affecting', True),
 ('animators', True),
 ('symbol', True),
 ('darker', True),
 ('idiotic', True),
 ('annual', True),
 ('represent', True),
 ('illogical', True),
 ('palpable', True),
 ('strengths', True),
 ('hatred', True),
 ('bothered', True),
 ('offbeat', True),
 ('mulan', True),
 ('seamless', True),
 ('lighthearted', True),
 ('naval', True),
 ('fairness', True),
 ('doubts', True),
 ('moody', True),
 ('hudson', True),
 ('studies', True),
 ('religion', True),
 ('magnificent', True),
 ('chuckle', True),
 ('gaining', True),
 ('unwittingly', True),
 ('unimaginative', True),
 ('detract', True),
 ('frances', True),
 ('excruciatingly', True),
 ('treasure', True),
 ('taxi', True),
 ('slip', True),
 ('embodies', True),
 ('mpaa', True),
 ('winslet', True),
 ('tad', True),
 ('da

In [25]:
#show the most 15 informative words to classifying pos & neg
N = 15
print("\nTop " + str(N) + " most informative words:")
for i, item in enumerate(classifier.most_informative_features()) : 
    print(str(i+1) + ". " + item[0])
    if i == N-1 : 
        break


Top 15 most informative words:
1. outstanding
2. insulting
3. vulnerable
4. ludicrous
5. uninvolving
6. avoids
7. astounding
8. fascination
9. anna
10. seagal
11. affecting
12. animators
13. symbol
14. darker
15. idiotic


In [26]:
input_reviews = ["The coustumes in this movie were great", 
                "I Think the story was terrible and the characters were very weak", 
                "People say that the director of the movie is amazing", 
                "This is such an idiotic movie. I will not recommend it to anyone"]

In [43]:
input_reviews[0].split()

['The', 'coustumes', 'in', 'this', 'movie', 'were', 'great']

In [44]:
extract_features(input_reviews[0].split())

{'The': True,
 'coustumes': True,
 'great': True,
 'in': True,
 'movie': True,
 'this': True,
 'were': True}

In [54]:
probabilities = classifier.prob_classify(extract_features(input_reviews[0].split()))
probabilities

<ProbDist with 2 samples>

In [52]:
predicted_sentiment = probabilities.max()
predicted_sentiment

'Positive'

In [55]:
probabilities.prob(predicted_sentiment)

0.5471062575091977

In [29]:
print("\nMovie review predictions:")
for review in input_reviews : 
    print("\nReview:", review)
    probabilities = classifier.prob_classify(extract_features(review.split()))
    predicted_sentiment = probabilities.max()
    print("Predicted sentiment:", predicted_sentiment) 
    print("Probabilities:", round(probabilities.prob(predicted_sentiment), 2))


Movie review predictions:

Review: The coustumes in this movie were great
Predicted sentiment: Positive
Probabilities: 0.55

Review: I Think the story was terrible and the characters were very weak
Predicted sentiment: Negative
Probabilities: 0.79

Review: People say that the director of the movie is amazing
Predicted sentiment: Positive
Probabilities: 0.6

Review: This is such an idiotic movie. I will not recommend it to anyone
Predicted sentiment: Negative
Probabilities: 0.88
