## LOAD THE DATA

In [5]:

import csv

# To process with another file, change the file name
with open('movie_reviews.csv', 'r', encoding="utf8") as csv_file:
    read = csv.reader(csv_file, delimiter = ',', quotechar = '"')
    
    # Initialize reviews and data
    # The reviews are the observations, labels are ... straighforward
    reviews = []
    labels = []
    
    # Fill in reviews and data
    for row in read: 
        if row[0] and row[1]:
            reviews.append(row[0])
            label = 1 if row[1].lower() == 'positive' else -1
            labels.append(label)

## Data preprocessing with NLTK


In [6]:
# Remove punctuation, lower all characters
# exclude = {',' ,'+', '<', ':', '/', ']', '(', ')', '{', '"', '_', '?', '@', '}', ...}
import string
exclude = set(string.punctuation)
for review in reviews:
    review = ''.join([w for w in review.lower() if w not in exclude])

In [7]:
# Remove stop words based on the given list - To be changed depending on the needs
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
for review in reviews:
    review = ''.join([w for w in review if w not in stopwords])

In [8]:
# Steeming -> Reduce words to their initial mining
from nltk.stem.porter import *
stemmer = PorterStemmer()
for review in reviews:
    review = stemmer.stem(review)

## Tfidf Mtrix

In [41]:
# Features extraction with TF - IDF : get the matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

m = TfidfVectorizer()
tfidf_matrix = m.fit_transform(reviews)

print("Size of the tfidf matrix: ", tfidf_matrix.size)

Size of the tfidf matrix:  267478


In [42]:
# Split the tf-idf matrix into two data sets to process the cross validation : training and test set
from sklearn.cross_validation import train_test_split

data_train, data_test, label_train, label_test = train_test_split(tfidf_matrix, labels, test_size = 0.4, random_state = 42)

## Select the appropriate classification (Neural Network, SVM, Bayesian classifier, Decision Trees, ...) to train the model
## Here : Logistic regression and Random Forests

In [43]:
# Test 1 : Logistic Regression Classifier

# Model initialization 
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

# Fit the model to the training dataset
y_score = clf.fit(data_train, label_train).predict_proba(data_test)

# Perform classification on the test dataset
label_predicted = clf.predict(data_test)

In [44]:
# Evaluate the prediction
from sklearn.metrics import classification_report, accuracy_score
classification_report(label_test, label_predicted)

print('The accuracy score is {:.2%}'.format(accuracy_score(label_test, label_predicted)))

The accuracy score is 80.10%


## Get the results : Recall, precision & F1-score

In [51]:
# ROC curve and Area Under the Curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

fpr, tpr, thresholds = roc_curve(label_test, label_predicted)
roc_auc = auc(fpr, tpr)

print("Area Under the Curve: ", roc_auc)

#Plot of the AUC
plt.clf
plt.plot(fpr, tpr, label = 'ROC Curve (Area = {:.2%})'.format(roc_auc))
plt.plot([0,1], [0,1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc = "lower right")
plt.show()

Area Under the Curve:  0.801053118023


## Test 2 : Random Forest

In [62]:
from sklearn.ensemble import RandomForestClassifier

rf = []
n_est = [5,10,20,50,100,200,400]
label_predicted = []
for i, n in enumerate(n_est):
    rf.append(RandomForestClassifier(n_estimators=n))
    rf[i] = rf[i].fit(data_train, label_train)
    label_predicted.append(rf[i].predict(data_test))
    classification_report(label_test, label_predicted[i])
    print('The accuracy score for {} is {:.2%}'.format(n, accuracy_score(label_test, label_predicted[i])))

The accuracy score for 5 is 61.86%
The accuracy score for 10 is 66.84%
The accuracy score for 20 is 73.09%
The accuracy score for 50 is 73.98%
The accuracy score for 100 is 77.68%
The accuracy score for 200 is 76.66%
The accuracy score for 400 is 78.95%


## Test 3 : Decision Trees

In [68]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf = clf.fit(data_train, label_train)
label_predicted_3 = clf.predict(data_test)

classification_report(label_test, label_predicted_3)

print('The accuracy score is {:.2%}'.format(accuracy_score(label_test, label_predicted_3)))

The accuracy score is 67.47%


## Test 4 : Extra Tree Classifier

In [69]:
from sklearn.ensemble import ExtraTreesClassifier

clf = ExtraTreesClassifier()
clf = clf.fit(data_train, label_train)

label_predicted_4 = clf.predict(data_test)

classification_report(label_test, label_predicted_4)
print('The accuracy score is {:.2%}'.format(accuracy_score(label_test, label_predicted_4)))

The accuracy score is 68.49%
