In [1]:
import codecs
import glob

import numpy
import pandas as pd
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# (glob pattern, class label) pairs; every file matched by a pattern is
# tagged with the label paired with it.
SOURCES = [
    ('imdb1//pos//*.txt', 'GOOD'),
    ('imdb1//neg//*.txt', 'BAD'),
]

In [2]:
def read_files(path):
    """Yield ``(file_name, text)`` for every file matching the glob ``path``.

    Files are visited in sorted name order so repeated runs see the same
    sequence (``glob.glob`` itself returns matches in arbitrary order).
    Each file is decoded as UTF-8 with undecodable bytes dropped, and every
    newline character in the text is replaced by a single space.

    Parameters
    ----------
    path : str
        Glob pattern selecting the files to read.

    Yields
    ------
    tuple of (str, str)
        The matched file name and its flattened text.
    """
    for file_name in sorted(glob.glob(path)):
        # newline='' turns off newline translation, so only '\n' characters
        # are replaced below and any other line-ending bytes pass through.
        with open(file_name, 'r', encoding='utf8', errors='ignore', newline='') as f:
            text = f.read()
        # Yield after the file is closed so no handle stays open while the
        # consumer processes the item.
        yield file_name, text.replace('\n', ' ')

In [3]:
def build_data_frame(path, classification):
    """Load every file matched by ``path`` into a labelled DataFrame.

    The result has one row per file, indexed by file name, with a ``text``
    column holding the file contents as returned by ``read_files`` and a
    ``class`` column holding ``classification`` for every row.
    """
    # Materialise the generator once so names and texts stay aligned.
    named_texts = list(read_files(path))
    records = [{'text': body, 'class': classification} for _, body in named_texts]
    file_names = [name for name, _ in named_texts]
    return DataFrame(records, index=file_names)

In [4]:
# Fixed seed so the shuffle below gives the same row order on every run.
RANDOM_SEED = 42

# Build one frame per source and join them with a single concat call.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies the accumulated rows on every iteration.
frames = [build_data_frame(path, classification) for path, classification in SOURCES]
# pd.concat raises on an empty list, so keep the empty two-column frame as
# the fallback for an empty SOURCES.
data = pd.concat(frames) if frames else DataFrame({'text': [], 'class': []})

# Shuffle the rows: the sources are loaded one class after another, so
# without this the classes would sit in two contiguous runs.
data = data.sample(frac=1, random_state=RANDOM_SEED)

In [5]:
# Text-classification pipeline, applied in order:
#   vect  - counts word bigrams only (ngram_range=(2, 2)) on lowercased text
#   tfidf - re-weights the counts with smoothed inverse document frequency
#   clf   - multinomial naive Bayes with additive smoothing alpha=1
pipline = Pipeline(
    steps=[
        ('vect', CountVectorizer(ngram_range=(2, 2), lowercase=True)),
        ('tfidf', TfidfTransformer(use_idf=True, smooth_idf=True)),
        ('clf', MultinomialNB(alpha=1)),
    ]
)

In [6]:
# Pin the label order so every fold yields a 2x2 confusion matrix with the
# same layout (rows = true class, columns = predicted class). Without an
# explicit `labels`, a fold containing a single class would produce a 1x1
# matrix that `+=` silently broadcasts into all four cells.
class_labels = ['BAD', 'GOOD']

k_fold = KFold(n_splits=10)
scores = []
confusion = numpy.zeros((len(class_labels), len(class_labels)), dtype=int)
for train_indices, test_indices in k_fold.split(data):
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values.astype(str)

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values.astype(str)

    # The pipeline is refit on each fold; after the loop it holds the model
    # trained on the last fold's training split.
    pipline.fit(train_text, train_y)
    predictions = pipline.predict(test_text)

    confusion += confusion_matrix(test_y, predictions, labels=class_labels)
    # F1 is computed with 'GOOD' as the positive class.
    score = f1_score(test_y, predictions, pos_label='GOOD')
    scores.append(score)

In [7]:
# Summarise the cross-validation run: corpus size, mean per-fold F1
# (rounded to two decimals) and the confusion matrix summed over all folds.
mean_f1 = sum(scores) / len(scores)
print('Total documents classified: ', len(data))
print('score: ', round(mean_f1, 2))
print('Confusion matrix:')
print(confusion)

Total documents classified:  2000
score:  0.84
Confusion matrix:
[[780 220]
 [111 889]]


In [8]:
# Probe the fitted pipeline with a one-word document.
# NOTE(review): the 'vect' step is built with ngram_range=(2, 2), so a single
# token presumably produces no bigram features and the prediction is then not
# driven by the word itself -- confirm, and consider a multi-word probe.
pipline.predict(['good'])

array(['BAD'], dtype='<U4')

In [None]:
import numpy as np

# Two 2x3 integer arrays. The original referenced an undefined alias `geek`,
# which raised NameError; the alias imported above is `np`.
in_arr1 = np.array([[2, -7, 5], [-6, 2, 0]])
in_arr2 = np.array([[5, 8, -5], [3, 6, 9]])