In [1]:
import numpy as np
import pandas as pd
import modSpellChecker as sc
from contractions import CONTRACTION_MAP
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import gensim
from gensim.models import Word2Vec
from gensim import models
from pattern.en import tag
from nltk.corpus import wordnet as wn
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from gensim import corpora, models
from normalization import normalize_corpus
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB



In [2]:
dataset = pd.read_csv('imdb_indonesian_movies.csv')
dataset[1:5]

Unnamed: 0,judul_film,ringkasan_sinopsis,genre
1,5 cewek jagoan,Film aksi laga tahun 1980 dari Indonesia yang ...,Aksi
2,Aladin dan lampu wasiat,Aladin (Rano Karno) adalah remaja yatim yang h...,Aksi
3,Blood Warriors,Seorang mantan Kelautan menemukan seorang tema...,Aksi
4,Buffalo Boys,Dua saudara bernama Jamar dan Suwo kembali ke ...,Aksi


In [3]:
feature = dataset.iloc[:,1]
label = dataset.iloc[:,2]
print(feature[1:5])
print(" ")
print("----------------------------")
print(label[1:5])

1    Film aksi laga tahun 1980 dari Indonesia yang ...
2    Aladin (Rano Karno) adalah remaja yatim yang h...
3    Seorang mantan Kelautan menemukan seorang tema...
4    Dua saudara bernama Jamar dan Suwo kembali ke ...
Name: ringkasan_sinopsis, dtype: object
 
----------------------------
1    Aksi
2    Aksi
3    Aksi
4    Aksi
Name: genre, dtype: object


In [4]:
#fungsi2 untuk normalisasi dataset
character = ['z','y','x','w','v','u','t','s','r','q','p','o','n','m','l','k','j','i','h','g','f','e','d','c','b','a',
             ',','.',';',':','-','...','?','!','(',')','[',']','{','}','<','>','"','/','\'','#','-','@']
def repeatcharNormalize(text):
    for i in range(len(character)):
        charac_long = 5
        while charac_long>=2:
            char=character[i]*charac_long
            text=text.replace(char,character[i])
            charac_long-=1
        return text

def spellNormalize(text):
    spellCheck = []
    for i in text:
        if i not in character:
            j=sc.correction(i)
            spellCheck.append(j)
        else:
            spellCheck.append(i)
        return spellCheck  
    
def tokenize_text(text):
    tokens=nltk.word_tokenize(text)
    tokens=[token.strip() for token in tokens]
    return tokens

def expand_contractions(text, contraction_mapping):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                    flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                            if contraction_mapping.get(match)\
                            else contraction_mapping.get(match.lower())
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction
    
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'","", expanded_text)
    return expanded_text

def stemmer_text(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)
    return text

In [5]:
def remove_special_characters(text):
    tokens = tokenize_text(text)
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('',token) for token in tokens])
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

factory = StopWordRemoverFactory()
stopword_list = factory.get_stop_words()

def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

def normalize_corpus(corpus, tokenize=False):
    normalized_corpus = []
    for text in corpus:
        text = expand_contractions(text, CONTRACTION_MAP)
        text = stemmer_text(text)
        text = remove_special_characters(text)
        text = repeatcharNormalize(text)
        text = remove_stopwords(text)
        
        normalized_corpus.append(text)
        if tokenize:
            text = tokenize_text(text)
            text = spellNormalize(text)
            normalized_corpus.append(text)
    return normalized_corpus

In [6]:
#Fungsi untuk mengekstraksi feature

def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2',
                                  smooth_idf=True,
                                  use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix

def tfidf_extractor(corpus, ngram_range=(1,1)):
    vectorizer = TfidfVectorizer(min_df=1,
                                norm='l2',
                                smooth_idf=True,
                                use_idf=True,
                                ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

def bow_extractor(corpus, ngram_range=(1,1)):
    vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary:
            nwords = nwords+1.
            feature_vector = np.add(feature_vector, model[word])
            
        if nwords:
            feature_vectore = np.divide(feature_vector, nwords)
        return feature_vector
    
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
               for tokenized_sentence in corpus]
    return np.array(features)

def tfidf_wtd_avg_word_vectors(words, tfidf_vector, tfidf_vocabulary, model, num_features):
    word_tfidfs = [tfidf_vector[0, tfidf_vocabulary.get(word)]
                  if tfidf_vocabulary.get(word)
                  else 0 for word in words]
    word_tfidf_map = {word:tfidf_val for word, tfidf_val in zip(words, word_tfidfs)}
    feature_vector = np.zeros((num_features,),dtype="float64")
    vocabulary = set(model.wv.index2word)
    wts = 0.
    for word in words:
        if word in vocabulary:
            word_vector = model[word]
            weighted_word_vector = word_tfidf_map[word] * word_vector
            wts = wts + word_tfidf_map[word]
            feature_vector = np.add(feature_vector, weighted_word_vector)
    
    if wts:
        feature_vector = np.divide(feature_vector, wts)
        
    return feature_vector

def tfidf_weighted_averaged_word_vectorizer(corpus, tfidf_vectors,
                                          tfidf_vocabulary, model, num_features):
    docs_tfidfs = [(doc, doc_tfidf)
                  for doc, doc_tfidf
                  in zip(corpus, tfidf_vectors)]
    features = [tfidf_wtd_avg_word_vectors(tokenized_sentence, tfidf, tfidf_vocabulary,
                                          model, num_features)
               for tokenized_sentence, tfidf in docs_tfidfs]
    
    return np.array(features)

In [7]:
#training dataset
def prepare_datasets(corpus, labels, test_data_proportion=0.3):
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels,
                                                       test_size=0.33, random_state=42)
    return train_X, test_X, train_Y, test_Y

def remove_empty_docs(corpus, labels):
    filtered_corpus = []
    filtered_labels = []
    for doc, label in zip(corpus, labels):
        if doc.strip():
            filtered_corpus.append(doc)
            filtered_labels.append(label)
    return filtered_corpus, filtered_labels

train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(feature,
                                                                       label,
                                                                       test_data_proportion=0.3)
#memanggil fungsi normalisasi dataset yang telah ditraining
norm_train_corpus = normalize_corpus(train_corpus)
norm_test_corpus = normalize_corpus(test_corpus)
''.strip()

''

In [9]:
bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
bow_test_features = bow_vectorizer.transform(norm_test_corpus)

tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

tokenized_train = [nltk.word_tokenize(text)
                  for text in norm_train_corpus]

tokenized_test = [nltk.word_tokenize(text)
                 for text in norm_test_corpus]

model = gensim.models.Word2Vec(tokenized_train,
                              size=500,
                              window=100,
                              min_count=30,
                              sample=1e-3)

  "C extension not loaded, training will be slow. "


In [10]:
from sklearn import metrics
import numpy as np

def get_metrics(true_labels, predicted_labels):
    
    print('Accuracy:', np.round(
                        metrics.accuracy_score(true_labels,
                                              predicted_labels),
                        2))
    print('Precision:', np.round(
                        metrics.precision_score(true_labels,
                                              predicted_labels,
                                               average='weighted'),
                        2))
    print('Recall:', np.round(
                        metrics.recall_score(true_labels,
                                              predicted_labels,
                                               average='weighted'),
                        2))
    print('F1 Score:', np.round(
                        metrics.f1_score(true_labels,
                                              predicted_labels,
                                               average='weighted'),
                        2))

In [11]:
def train_predict_evaluate_model (classifier, 
                                 train_features, 
                                 train_labels,
                                test_features, 
                                 test_labels):
    classifier.fit(train_features, train_labels)
    predictions = classifier.predict(test_features)
    get_metrics(true_labels = test_labels,
               predicted_labels=predictions)
    return predictions

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

mnb = MultinomialNB()
svm = SGDClassifier(loss='hinge', n_iter=100)

mnb_bow_predictions = train_predict_evaluate_model(classifier=mnb,
                                                  train_features=bow_train_features,
                                                  train_labels=train_labels,
                                                  test_features=bow_test_features,
                                                  test_labels=test_labels)
print('------------------------------------------')
svm_bow_predictions = train_predict_evaluate_model(classifier=svm,
                                                  train_features=bow_train_features,
                                                  train_labels=train_labels,
                                                  test_features=bow_test_features,
                                                  test_labels=test_labels)
print('------------------------------------------')
mnb_tfidf_predictions = train_predict_evaluate_model(classifier=mnb,
                                                  train_features=tfidf_train_features,
                                                  train_labels=train_labels,
                                                  test_features=tfidf_test_features,
                                                  test_labels=test_labels)
print('------------------------------------------')
svm_tfidf_predictions = train_predict_evaluate_model(classifier=svm,
                                                  train_features=tfidf_train_features,
                                                  train_labels=train_labels,
                                                  test_features=tfidf_test_features,
                                                  test_labels=test_labels)
print('------------------------------------------')

Accuracy: 0.48
Precision: 0.49
Recall: 0.48
F1 Score: 0.47
------------------------------------------
Accuracy: 0.34
Precision: 0.42
Recall: 0.34
F1 Score: 0.37
------------------------------------------
Accuracy: 0.45
Precision: 0.49
Recall: 0.45
F1 Score: 0.43
------------------------------------------


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy: 0.42
Precision: 0.43
Recall: 0.42
F1 Score: 0.43
------------------------------------------


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [17]:
from sklearn.pipeline import make_pipeline

pipe= make_pipeline(tfidf_vectorizer,mnb)
pipe.fit(train_corpus,train_labels)

print(pipe.score(test_corpus,test_labels))

tempFeature=normalize_corpus(feature)

0.4307228915662651


KeyboardInterrupt: 

In [25]:
tempData=pd.DataFrame(np.column_stack([feature,label]),columns=('Feature','Label'))
tempData
tempRest = [pipe,tempData]
line = np.array(['percintaan'])
pipe.predict(line)
joblibFile = "MNBClassifier.pkl"
from sklearn.externals import joblib
joblib.dump(tempRest,joblibFile)

['MNBClassifier.pkl']