In [38]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import string
import fasttext
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [2]:
# Normalize text
def normalize_text(texts, stops=None):
    # Lower case
    texts = [x.lower() for x in texts]

    # EOL
    texts = [re.compile(r"\\n").sub(' ',x) for x in texts]
    
    # Replace punctuation with space
    texts = [x.translate(x.maketrans(string.punctuation, ' '*len(string.punctuation))) for x in texts]
    
    # Remove numbers
    texts = [''.join(c for c in x if c not in '0123456789') for x in texts]
    
    if stops:
        # Remove stopwords
        texts = [' '.join([word for word in x.split() if word not in (stops)]) for x in texts]

#     # split into list
#     texts = [x.split() for x in texts]
    
    return(texts)

In [3]:
def load_Imdb_data():
    train = pd.read_csv('data/Large Movie Review Dataset/train.csv')
    test = pd.read_csv('data/Large Movie Review Dataset/test.csv')
    train_X = train['text'].values.tolist()
    train_y = train['label'].values.tolist()
    test_X = test['text'].values.tolist()
    test_y = test['label'].values.tolist()
    return train,test

In [4]:
train_data, test_data = load_Imdb_data()
print('IMDB train data size: %d' % len(train_data))
print(train_data.groupby('label').size())
print('IMDB test data size: %d' % len(test_data))
print(test_data.groupby('label').size())

IMDB train data size: 25000
label
0    12500
1    12500
dtype: int64
IMDB test data size: 25000
label
0    12500
1    12500
dtype: int64


In [5]:
train_X = train_data['text'].values.tolist()
train_y = train_data['label'].values.tolist()
test_X = test_data['text'].values.tolist()
test_y = test_data['label'].values.tolist()

In [6]:
stops = stopwords.words('english')
normalized_train_X = normalize_text(train_X, stops)
normalized_test_X = normalize_text(test_X, stops)

In [7]:
train_prefix_y = ['__label__'+ str(y) for y in train_y]
test_prefix_y = ['__label__'+ str(y) for y in test_y]

In [8]:
train_prefix_data=pd.DataFrame({'train_y': train_prefix_y,'text': normalized_train_X})
test_prefix_data=pd.DataFrame({'train_y': test_prefix_y,'text': normalized_test_X})

In [9]:
train_prefix_data.to_csv('train_prefix.txt', sep='\t', index = False, header = False)
test_prefix_data.to_csv('test_prefix.txt', sep='\t', index = False, header = False)

In [10]:
model = fasttext.train_supervised('train_prefix.txt')

In [13]:
print(normalized_test_X[0],test_prefix_y[0])
print(model.predict(test_X[0]))

went saw movie last night coaxed friends mine admit reluctant see knew ashton kutcher able comedy wrong kutcher played character jake fischer well kevin costner played ben randall professionalism sign good movie toy emotions one exactly entire theater sold overcome laughter first half movie moved tears second half exiting theater saw many women tears many full grown men well trying desperately let anyone see crying movie great suggest go see judge __label__1
(('__label__1',), array([0.81123638]))


In [14]:
validation = model.test('test_prefix.txt')

In [27]:
def print_results(N, p, r):
    print("N\t\t" + str(N))
    print("precision\t{:.3f}".format(p))
    print("recall\t\t{:.3f}".format(r))
print_results(*validation)

N		25000
precision	0.877
recall		0.877


In [32]:
y_predict = [model.predict(text)[0][0] for text in normalized_test_X]

In [35]:
accuracy_score(test_prefix_y, y_predict)

0.87744

In [37]:
confusion_matrix(test_prefix_y, y_predict)

array([[11211,  1289],
       [ 1775, 10725]], dtype=int64)

In [39]:
print(classification_report(test_prefix_y, y_predict, target_names=['__label__0','__label__1']))

              precision    recall  f1-score   support

  __label__0       0.86      0.90      0.88     12500
  __label__1       0.89      0.86      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

