# Techniques of Artificial Intelligence
## Project - Sentiment Analysis on an imdb dataset
### Gaspard BERNARD 
### Adrien HANS

### Import of the wanted libraries : 

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import re
import pandas as pd
import numpy as numpy
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.metrics import accuracy_scores


#Plotting : 
import matplotlib.pyplot as plt


### Preprocessing : 
* #### Listing trainning and testing sets

In [2]:
reviews_train = []
for line in open('movie_data/full_train.txt', 'r', encoding="utf8"):
    
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open('movie_data/full_test.txt', 'r', encoding="utf8"):
    
    reviews_test.append(line.strip())

*Affichage d'une critique avant Clean* :

In [3]:
print(reviews_test[3])

I saw this film in a sneak preview, and it is delightful. The cinematography is unusually creative, the acting is good, and the story is fabulous. If this movie does not do well, it won't be because it doesn't deserve to. Before this film, I didn't realize how charming Shia Lebouf could be. He does a marvelous, self-contained, job as the lead. There's something incredibly sweet about him, and it makes the movie even better. The other actors do a good job as well, and the film contains moments of really high suspense, more than one might expect from a movie about golf. Sports movies are a dime a dozen, but this one stands out. <br /><br />This is one I'd recommend to anyone.


* #### Removing unwanted characters : 

In [5]:
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

*Affichage de la même critique après clean* : 

In [6]:
print(reviews_test_clean[3])

i saw this film in a sneak preview and it is delightful the cinematography is unusually creative the acting is good and the story is fabulous if this movie does not do well it wont be because it doesnt deserve to before this film i didnt realize how charming shia lebouf could be he does a marvelous self contained job as the lead theres something incredibly sweet about him and it makes the movie even better the other actors do a good job as well and the film contains moments of really high suspense more than one might expect from a movie about golf sports movies are a dime a dozen but this one stands out  this is one id recommend to anyone


* #### Vectorizing the training set :

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(lowercase=True, binary=True)

cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)

* #### Vectorizing the final test set :
# !!!!  (non utilisé pour l'instant)

In [8]:
X_test_final = cv.transform(reviews_test_clean)

* #### Organizing the data :

In [9]:
target = [1 if i < 12500 else 0 for i in range(25000)]
# Definition of X_train et y_train : 
X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size = 0.2
)

# Functions : 

In [10]:
def sigmoid(x):
    return 1/(1 + numpy.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)


def classif(x):
    return 1 * (x > 0.5)

def relu(x):
    return x * (x > 0)

def relu_derivative(x):
    return 1. * (x > 0)

## Neural Network : 

* #### Initialisation des paramètres : 

In [11]:
x=X_train
n1,n2=x.shape
binarizer_ = LabelBinarizer().fit(y_train)
y = binarizer_.transform(y_train)
yy = binarizer_.transform(y_test)

#On établit un nombre max d'epochs et de neurones dans la couche cachée : 
N_Epoch_Max = 300
N_neurones_Max = 150

#Nbr min : 
N_Epoch_Min = 50
N_neurones_Min = 50

# Step : 
Step_Epoch = 50
Step_neurones = 50

* #### Entrainement et validation : 

In [12]:
#Boucle sur le nombre de neurones : 

for nbr_neurones in range(N_neurones_Min, N_neurones_Max, Step_neurones): 
    w1 = numpy.random.randn(n2,nbr_neurones)
    w2 = numpy.random.randn(nbr_neurones,1)
    dw1 = numpy.random.randn(n2,nbr_neurones)
    dw2 = numpy.random.randn(nbr_neurones,1)
    l2 = 0
    
    #Initializing the L_nbr_epochs :
    L_nbr_epochs=[]
    L_Error=[]
    
    for nbr_epochs in range(N_Epoch_Min, N_Epoch_Max, Step_Epoch): 
        
        for k in range(nbr_epochs):
            #feedforward pour les couches l1 et l2 (relu l1 et sigmoid l2)
            l1 = relu(x.dot(w1))
            l2 = sigmoid(l1.dot(w2))


            #backprop
            dw2 = l1.T.dot(2*(y - l2) * sigmoid_derivative(l2))
            dw1 = x.T.dot(numpy.dot(2*(y - l2) * sigmoid_derivative(l2), w2.T) * relu_derivative(l1))

            #vitesse de descente
            w1 += 0.005*dw1
            w2 += 0.005*dw2
            
        #validation du modèle
        l1 = sigmoid(X_test.dot(w1))
        l2 = sigmoid(l1.dot(w2))
        error = abs(yy-classif(l2))
        print("Error : ", numpy.mean(error), " nbr epochs : ", nbr_epochs, " nbr_neurones : ", nbr_neurones)
        
        #Plotting the results : 
        L_nbr_epochs.append(nbr_epochs)
        L_Error.append(numpy.mean(error))
    
    #Plot pour chaque nbr_neurones : 
    plt.plot(L_nbr_epochs, L_Error)
    plt.xlabel("number of epochs")
    plt.ylabel("Error")
    plt.title("Error and the number of epochs for  %i  neurones in the hidden layer" %nbr_neurones)
    plt.show()

  


Error :  0.512  nbr epochs :  50  nbr_neurones :  50
Error :  0.2376  nbr epochs :  100  nbr_neurones :  50


KeyboardInterrupt: 

# Demonstration : 

* ### Reading the demo test : 

In [None]:
reviews_demo_test = []
for line in open('movie_data/demo_test.txt', 'r', encoding="utf8"):
    reviews_demo_test.append(line.strip())

* ### Printing the demo_test reviews : 

In [None]:
print(reviews_demo_test)

* ### Removing unwanted characters : 

In [None]:
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    
    return reviews

reviews_demo_test_clean = preprocess_reviews(reviews_demo_test)

* ### Printing the demo_test reviews after cleaning : 

In [None]:
print(reviews_demo_test_clean[0])

* ### Vectorizing the demo_test : 

In [None]:
X_demo_test = cv.transform(reviews_demo_test_clean)

* ### Function to assert if a review is positive or negative

In [None]:
def NegPos(x):
    x=classif(x)
    if x==0 :
        return 'Negative review'
    else : 
        return 'Positive review'

* ### Training : 

Parameters : 

In [None]:
final_nbr_neurones = 100
final_nbr_epochs = 200

In [None]:
x=X_train
n1,n2=x.shape
binarizer_ = LabelBinarizer().fit(y_train)
y = binarizer_.transform(y_train)

#100 neurones dans la couche cachée, n2 features
w1 = numpy.random.randn(n2,final_nbr_neurones)
w2 = numpy.random.randn(final_nbr_neurones,1)
dw1 = numpy.random.randn(n2,final_nbr_neurones)
dw2 = numpy.random.randn(final_nbr_neurones,1)
l2 = 0

#nombre epochs
for k in range(final_nbr_epochs):

            #feedforward pour les couches l1 et l2 (relu l1 et sigmoid l2)
            l1 = relu(x.dot(w1))
            l2 = sigmoid(l1.dot(w2))


            #backprop
            dw2 = l1.T.dot(2*(y - l2) * sigmoid_derivative(l2))
            dw1 = x.T.dot(numpy.dot(2*(y - l2) * sigmoid_derivative(l2), w2.T) * relu_derivative(l1))

            #vitesse de descente
            w1 += 0.005*dw1
            w2 += 0.005*dw2

* ### Prediction

In [None]:
l1 = sigmoid(X_demo_test.dot(w1))
l2 = sigmoid(l1.dot(w2))

for i in range (len(reviews_demo_test_clean)):
    print ('the review number', i+1, 'is a', NegPos(l2[i]))