In [42]:
from sklearn.datasets import load_files
from sklearn import feature_extraction
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import (linear_model, naive_bayes, ensemble)

In [33]:
data_directory = 'movie_reviews_polarity'
movie_sentiment_data = load_files(data_directory,shuffle=True)

print('{} files loaded'.format(len(movie_sentiment_data.data)))
print('Thay contain the following classes: {}'.format(movie_sentiment_data.target_names))

2000 files loaded
Thay contain the following classes: ['neg', 'pos']


### Podemos extraer la siguiente informacion del load_files:

 1. target_names: nombre de las clases que corresponde al nombre de carpeta
 2. target: id de las clases
 3. filenames: obtenemos la ruta completa del archivo.

In [34]:
def extract_features(corpus):
    sa_stop_words = nltk.corpus.stopwords.words("english")
    
    # words that might invert a sentence's meaning
    white_list = [
        'what', 'but', 'if', 'because', 'as', 'until', 'against', 'up', 'down', 'in',
        'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
        'there', 'why', 'how', 'all', 'any', 'most', 'other', 'some', 'such', 'no', 'nor',
        'not', 'only', 'own', 'same', 'so', 'than', 'too', 'can', 'will', 'just', 'don', 'should'
    ]
    
    # take these out of the standard NLTK stop word list
    sa_stop_words = [sw for sw in sa_stop_words if sw not in white_list]
    
    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase = True, 
        tokenizer=nltk.word_tokenize, # Use the NLTK tokenizer.
        #stop_words='english', # Remove stop words.
        min_df=2,  # Minimun document frequency, i.e. the word must appear more than once.
        ngram_range = (1,2), # Define ngrams value. In this case we have 1-gram and 2-gram
        stop_words = sa_stop_words
    )
    
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(processed_corpus)
    
    #print(count_vectorizer.vocabulary_)
    return processed_corpus

In [35]:
movie_tfidf = extract_features(movie_sentiment_data.data)

X_train, X_test, y_train, y_test = train_test_split(
    movie_tfidf, movie_sentiment_data.target, test_size = 0.30, random_state = 42)

  'stop_words.' % sorted(inconsistent))


In [36]:
clf1 = LogisticRegression()
clf1.fit(X_train,y_train)
print('Logistic Regression performance {}'.format(clf1.score(X_test, y_test)))

Logistic Regression performance 0.8166666666666667




In [38]:
clf2 = linear_model.SGDClassifier()
clf2.fit(X_train,y_train)
print('SGDCClassifier performance {}'.format(clf2.score(X_test, y_test)))

SGDCClassifier performance 0.87




In [39]:
clf3 = naive_bayes.MultinomialNB()
clf3.fit(X_train,y_train)
print('MultinomialNB performance {}'.format(clf3.score(X_test, y_test)))

MultinomialNB performance 0.785


In [40]:
clf4 = naive_bayes.BernoulliNB()
clf4.fit(X_train,y_train)
print('BernoulliNB performance {}'.format(clf4.score(X_test, y_test)))

BernoulliNB performance 0.8


In [43]:
voting_model = ensemble.VotingClassifier(
    estimators = [('lr',clf1),('sgd',clf2),('mnb',clf3),('bnb',clf4)],
    voting = 'hard'
)

voting_model.fit(X_train,y_train)
print('Voting classifier performance:{}'.format(voting_model.score(X_test,y_test)))

Voting classifier performance:0.8116666666666666


