In [20]:
#MODULE IMPORTATION
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

import re
import unicodedata
import pickle

from time import time
from datetime import datetime

import pandas as pd
import numpy as np
import contractions
import inflect

from collections import defaultdict

from nltk import pos_tag
from nltk import punkt
from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline

##### UNCOMMENT THIS SECTION IF FIRT TIME RUNNING
# import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')
#####

# Set seed for random results base calculation
np.random.seed(500)

In [21]:
#DEFINITION OF SOME OF THE RESOURCES USED THROUGHOUT THIS NOTEBOOK
DATA_PATH = 'Dataset/dataset.csv'
TARGET_PATH = 'Dataset/labels.csv'
PIPELINE_PATH = 'pipeline.pkl'

#DEFINITION OF IMPORTATION, MERGING AND SHUFFLING FUNCTIONS OF DATASETS
def import_dataset(dataset_path, importation_message=None, sep='\t', names=None):
    """
    imports a dataset from a given path
    returns the dataframe containing the imported dataset
    """
    print("\n{}".format(importation_message))
    df = pd.read_csv(dataset_path, sep=sep, header=None, names=names, encoding='utf-8')
    print('Size : {}'.format(df.shape))
    print('Head of imported dataset :')
    display(df.head())
    
    return df
    
def merge_datasets(df1, df2):
    """
    merges datasets contained within dataframes df1 and df2
    returns a new dataframe containing the merged datasets
    """
    
    df = df1.join(df2)
    
    print('Size : {}'.format(df.shape))
    print('Head of merged dataset :')
    display(df.head())
    
    return df

def shuffle_dataset(df):
    """
    shuffles dataset entries and reset indexes
    returns a new dataframe containing the shuffled dataset with the reset indexes
    """
    shuffled_df = shuffle(df)
    shuffled_df.reset_index(inplace = True, drop = True)
    
    print('Head of shuffled dataset :')
    display(shuffled_df.head())
    
    return shuffled_df

#Importation of opinion dataset
df_avis = import_dataset(DATA_PATH, importation_message="\nDataframe des avis", sep='\t', names=['Avis'])

#Importation of scores dataset
df_score = import_dataset(TARGET_PATH, importation_message='\nDataframe des scores', sep='\t', names=['Score'])

#Merging of both datasets
df = merge_datasets(df_avis, df_score)

#Shuffling of the merged dataset
df2 = shuffle_dataset(df)



Dataframe des avis
Size : (10000, 1)
Head of imported dataset :


Unnamed: 0,Avis
0,Obviously made to show famous 1950s stripper M...
1,This film was more effective in persuading me ...
2,Unless you are already familiar with the pop s...
3,From around the time Europe began fighting Wor...
4,Im not surprised that even cowgirls get the bl...




Dataframe des scores
Size : (10000, 1)
Head of imported dataset :


Unnamed: 0,Score
0,-1
1,-1
2,-1
3,-1
4,-1


Size : (10000, 2)
Head of merged dataset :


Unnamed: 0,Avis,Score
0,Obviously made to show famous 1950s stripper M...,-1
1,This film was more effective in persuading me ...,-1
2,Unless you are already familiar with the pop s...,-1
3,From around the time Europe began fighting Wor...,-1
4,Im not surprised that even cowgirls get the bl...,-1


Head of shuffled dataset :


Unnamed: 0,Avis,Score
0,After having read two or three negative review...,1
1,I recently (May 2008) discovered that this chi...,1
2,"Pathetic is the word. Bad acting, pathetic scr...",-1
3,Spencer Tracy and Katherine Hepburn would roll...,-1
4,This in my opinion is one of the best action m...,1


In [22]:
#DEFINITION OF SOME OF THE CONSTANTS USED THROUGHOUT THIS NOTEBOOK
STOP_WORDS = set(stopwords.words('english'))
STOP_WORDS_EXCEPTIONS = set(('not',))

#POS-TAG dictionary that will be used during the lemmatization process
POS_TAG_MAP = defaultdict(lambda : wn.NOUN)
POS_TAG_MAP['J'] = wn.ADJ
POS_TAG_MAP['V'] = wn.VERB
POS_TAG_MAP['R'] = wn.ADV

#parameters used by by the cross validation score function
CV_SEED = 7 #seed used for random selection of partitions during cross validation
CV_SCORING = 'accuracy'

#parameters used by the training/set generator
TTS_VALIDATION_SIZE = 0.3 #30% of dataset used for training
TTS_TEST_SIZE = 1 - TTS_VALIDATION_SIZE #70% of dataset used for testing
TTS_SEED = 30 #seed used for random selection of training/test sets

#parameters used by the gridsearch function
GRDSR_SCORING = 'accuracy'

#

#DEFINITION OF PREPROCESSING FUNCTIONS
def replace_contractions(document):
    """
    replaces contracted expressions in a document
    
    returns document with no contracted expressions
    """
    return contractions.fix(document)

def remove_urls(document):
    """
    removes all urls in the document
    
    return a document without any urls
    """
    return re.sub(r'https?://(www\.)?[-\w@:%.\+~#=]{2,256}\.[a-z]{2,6}\b([-\w@:%_\+.~#?&/=;]*)', '', document)

def clean_sentences(document):
    '''
    cleans all sentences within a document, such that
    the end of a sentence and the beginning of a new one is separated by a period (or many)
    followed by a whitespace
    This cleaning is required because upon removing punctuation,
    some words get concatenated and create new meaningless terms
    
    example of a dirty document: "This is a dirty sentence.Another dirty sentence begins"
    cleaned version: "This is a cleaned sentence. Another cleaned sentence begins"
    
    This pattern repeats with a sentence ending with a lowercase/uppercase letter and
    another one beginning with a lowercase/uppercase letter
    The beginning sentence could also end with a digit and the next sentence could begin with
    a digit. Hence we get three different patterns:
    word.*word
    word.*digit
    digit.*word
    
    returns a document with cleaned sentences
    '''
    word_word = r'([a-zA-Z]+\.*)\.([a-zA-Z]+)'
    word_digit = r'([a-zA-Z]+\.*)\.(\d+)'
    digit_word = r'(\d+\.*)\.([a-zA-Z]+)'
    patterns = [
        word_word, #word.word pattern
        word_digit, #word.digit pattern
        digit_word, #digit.word pattern
    ]
    
    for pattern in patterns:
        if re.search(pattern, document):
            document = re.sub(pattern, r'\1. \2', document)
    
    return document

def remove_non_ascii(tokens):
    '''
    normalizes the tokens
    encodes tokens as ASCII characters from tokens
    and decodes as utf-8
    
    returns a list of normalized and encoded as ascii tokens
    '''
    return [unicodedata.normalize('NFKD', token)
           .encode('ascii', 'ignore')
           .decode('utf-8', 'ignore')
           for token in tokens]

def split_on_characterset(tokens, regex):
    '''
    splits a token in tokens upon matching with the characterset defined by the regex
    appends the tokens obtained from splitting the token to the tokens list
    
    returns a list of all tokens obtained after splitting problematic tokens
    '''
    new_tokens = []
    for token in tokens:
        if re.search(regex, token) :
            new_tokens += re.split(regex, token)
        else:
            new_tokens.append(token)
    return new_tokens

def to_lowercase(tokens):
    """returns a list of tokens in lowercase"""
    return [token.lower() for token in tokens]

def replace_numbers(tokens):
    """
    replaces tokens representing whole numeric values
    by their equivalent letter values
    
    returns a list of transformed tokens
    """
    engine = inflect.engine()
    new_tokens = []
    for token in tokens:
        new_token = token
        if token.isdigit():
            new_token = engine.number_to_words(token)
        new_tokens.append(new_token)
    return new_tokens

def remove_punctuation(tokens):
    """
    removes tokens not in \w and \s classes of characters.
    by extension, all punctuation characters will be removed
    
    returns a list of tokens only in \w and \s
    """
    new_tokens = []
    for token in tokens:
        new_token = re.sub(r'[^\w\s]', '', token)
        if new_token != '':
            new_tokens.append(new_token)
    return new_tokens

def remove_stopwords(tokens, stopwords, exceptions):
    '''
    removes all stopwords (a set) from tokens (a list)
    except those in exceptions (a set)
    
    returns a list of tokens that are not stopwords
    '''
    stop = stopwords - exceptions
    return [token for token in tokens if token not in stop]

def lemmatize(tokens, lemmatizer, pos_tag_map):
    '''
    lematizes all tokens using a lemmatizer and a POS-Tagging map
    
    returns the list of lemmatized tokens
    '''
    return [lemmatizer.lemmatize(token, pos_tag_map[tag[0]]) for token, tag in pos_tag(tokens)]
    
def normalize(tokens):
    '''
    normalizes all the tokens by using all preprocessing
    functions taking a list of tokens as input
    
    returns the list of normalized tokens
    '''
    tokens = remove_non_ascii(tokens)
    tokens = to_lowercase(tokens)
    tokens = split_on_characterset(tokens, r'[/\\~_-]')
    tokens = replace_numbers(tokens)
    tokens = remove_punctuation(tokens)
    tokens = remove_stopwords(tokens, STOP_WORDS, STOP_WORDS_EXCEPTIONS)
    tokens = lemmatize(tokens, WordNetLemmatizer(), POS_TAG_MAP)
    return tokens

def preprocess(document):
    '''
    preprocesses and tokenizes the document
    normalizes the document's tokens
    and finally joins the normalized tokens of a document
    to prepare it for vectorization
    
    returns a preprocessed document, ready for vectorization
    '''
    
    document = replace_contractions(document)
    document = remove_urls(document)
    document = clean_sentences(document)
    tokens = word_tokenize(document)
    tokens = normalize(tokens)
    document = ''.join([" " + token for token in tokens]).strip() 
    return document

def preprocess_dataset(corpus):
    '''
    preprocesses all documents in a corpus designating a dataset
    returns a corpus with preprocessed documents
    and ready for vectorization
    '''
    return [preprocess(document) for document in corpus]

In [23]:
#PREPROCESSING DATASET
df_transformed = df2.copy() #creating a new copy of the dataset that will be preprocessed
df_transformed['Avis'] = preprocess_dataset(df_transformed['Avis']) #preprocessing of opinions column
display(df_transformed['Avis'].head())

0    read two three negative review main page imdb ...
1    recently may two thousand and eight discover c...
2    pathetic word bad act pathetic script cheezy d...
3    spencer tracy katherine hepburn would roll gra...
4    opinion one best action movie 1970s not featur...
Name: Avis, dtype: object

In [74]:
#VECTORIZATION

#Splitting the dataset prior to vectorization, to prevent memory related errors during processing
df_transformed1 = df_transformed.iloc[:5000]

#vectorization of the opinions column
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df_transformed1['Avis'])

In [49]:
#CROSS VALIDATION USING ACCURACY METRIC

#choosing the data (opinions) and target (score) columns in the dataset
X = vectors.toarray()
y = df_transformed1['Score']

#dictionary containing the models to cross validate using their default parameters
models = {
    'LogisticRegression': LogisticRegression(),
    'SGDClassifier': SGDClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'GaussianNB': GaussianNB(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': SVC(),
    'LinearSVC': LinearSVC()
}

#configuring the parameters used by the cross validation function
k_fold = KFold(n_splits=10, shuffle=True, random_state=CV_SEED)

#cross validation using accuracy metric
#for each defined model
for name, model in models.items():
    start_time = time()
    print('Cross validation started at {}'.format(datetime.now()))
    cv_score = cross_val_score(model, X, y, cv=k_fold, scoring=CV_SCORING)
    output = """
    Time taken to complete cross validation of {}: {} seconds
    Accuracy scores over 10 evaluations: {}
    Mean score: {}
    Standard deviation of scores: {}
    """.format(name, time() - start_time, cv_score, cv_score.mean(), cv_score.std())
    print(output)

In [44]:
#RESULTS OF THE CROSS VALIDATION ON BOTH PARTITIONS

#FIRST PARTITION [0:5000]
########################
# Cross validation started at 2019-04-12 13:56:55.212174
# 
#     Time taken to complete cross validation of LogisticRegression: 7.469494581222534 seconds
#     Accuracy scores over 10 evaluations: [0.908 0.89  0.894 0.9   0.884 0.916 0.892 0.91  0.906 0.886]
#     Mean score: 0.8986000000000001
#     Standard deviation of scores: 0.01043264108459599
# 
# Cross validation started at 2019-04-12 13:57:02.682150
# 
#     Time taken to complete cross validation of SGDClassifier: 16.67750573158264 seconds
#     Accuracy scores over 10 evaluations: [0.916 0.88  0.854 0.858 0.85  0.916 0.812 0.912 0.908 0.812]
#     Mean score: 0.8718
#     Standard deviation of scores: 0.038775765627515335
#
# Cross validation started at 2019-04-12 14:07:25.906064
# 
#     Time taken to complete cross validation of DecisionTreeClassifier: 443.644727230072 seconds
#     Accuracy scores over 10 evaluations: [0.758 0.722 0.754 0.728 0.734 0.78  0.726 0.742 0.736 0.75 ]
#     Mean score: 0.743
#     Standard deviation of scores: 0.016881943016134146
#
# Cross validation started at 2019-04-12 14:14:49.551167
# 
#     Time taken to complete cross validation of RandomForestClassifier: 44.67994499206543 seconds
#     Accuracy scores over 10 evaluations: [0.77  0.778 0.774 0.766 0.79  0.784 0.804 0.764 0.78  0.772]
#     Mean score: 0.7782000000000001
#     Standard deviation of scores: 0.011469960767151744
# 
# Cross validation started at 2019-04-12 14:00:25.789652
# 
#     Time taken to complete cross validation of GaussianNB: 28.69598889350891 seconds
#     Accuracy scores over 10 evaluations: [0.676 0.642 0.69  0.68  0.638 0.7   0.66  0.682 0.68  0.672]
#     Mean score: 0.6719999999999999
#     Standard deviation of scores: 0.018846750383023584
# 
# Cross validation started at 2019-04-12 14:35:25.204313
# 
#     Time taken to complete cross validation of KNeighborsClassifier: 1241.7990498542786 seconds
#     Accuracy scores over 10 evaluations: [0.794 0.782 0.782 0.804 0.764 0.788 0.806 0.824 0.776 0.796]
#     Mean score: 0.7916
#     Standard deviation of scores: 0.01624315240339756
# 
# Cross validation started at 2019-04-12 14:56:07.003706
# 
#     Time taken to complete cross validation of LinearSVC: 7.24153470993042 seconds
#     Accuracy scores over 10 evaluations: [0.916 0.888 0.902 0.91  0.902 0.932 0.904 0.91  0.912 0.902]
#     Mean score: 0.9077999999999999
#     Standard deviation of scores: 0.010897706180660232

#SECOND PARTITION [5000:10000]
##############################
# Cross validation started at 2019-04-12 13:58:46.938275
# 
#     Time taken to complete cross validation of LogisticRegression: 7.57363748550415 seconds
#     Accuracy scores over 10 evaluations: [0.902 0.912 0.894 0.912 0.894 0.89  0.904 0.912 0.908 0.892]
#     Mean score: 0.9019999999999999
#     Standard deviation of scores: 0.008438009243891603
# 
# Cross validation started at 2019-04-12 13:58:54.512393
# 
#     Time taken to complete cross validation of SGDClassifier: 16.909173488616943 seconds
#     Accuracy scores over 10 evaluations: [0.924 0.91  0.848 0.882 0.89  0.9   0.882 0.91  0.922 0.9  ]
#     Mean score: 0.8968
#     Standard deviation of scores: 0.021469979040511445
#
# Cross validation started at 2019-04-12 14:18:27.835834
# 
#     Time taken to complete cross validation of DecisionTreeClassifier: 453.8783338069916 seconds
#     Accuracy scores over 10 evaluations: [0.706 0.712 0.696 0.77  0.718 0.74  0.74  0.76  0.706 0.736]
#     Mean score: 0.7283999999999999
#     Standard deviation of scores: 0.02342306555513178
# 
# Cross validation started at 2019-04-12 14:26:01.714540
# 
#     Time taken to complete cross validation of RandomForestClassifier: 46.442110776901245 seconds
#     Accuracy scores over 10 evaluations: [0.75  0.796 0.8   0.764 0.764 0.756 0.772 0.776 0.808 0.738]
#     Mean score: 0.7724
#     Standard deviation of scores: 0.021666564102321366
#
# Cross validation started at 2019-04-12 14:04:24.184877
# 
#     Time taken to complete cross validation of GaussianNB: 29.050445556640625 seconds
#     Accuracy scores over 10 evaluations: [0.68  0.684 0.648 0.698 0.688 0.67  0.704 0.662 0.664 0.664]
#     Mean score: 0.6761999999999999
#     Standard deviation of scores: 0.016720047846821465
#
# Cross validation started at 2019-04-12 15:00:11.372601
# 
#     Time taken to complete cross validation of KNeighborsClassifier: 1274.044404745102 seconds
#     Accuracy scores over 10 evaluations: [0.784 0.78  0.776 0.82  0.754 0.774 0.772 0.782 0.81  0.768]
#     Mean score: 0.7819999999999999
#     Standard deviation of scores: 0.018482424083436668
# 
# Cross validation started at 2019-04-12 15:21:25.417337
# 
#     Time taken to complete cross validation of LinearSVC: 7.582218647003174 seconds
#     Accuracy scores over 10 evaluations: [0.916 0.928 0.912 0.916 0.898 0.908 0.898 0.908 0.928 0.896]
#     Mean score: 0.9108000000000003
#     Standard deviation of scores: 0.010998181667894026

In [50]:
#GRIDSEARCH USING THE ACCURACY METRIC FOR PARAMETERS TUNING

#based on the cross-validation results, using KFold over 10 partitions
#the models LogisticRegression and LinearSVC are best suited for the job

#dictionary containing the candidate models that will be used 
#for parameters tuning using a GridSearchCV
candidates = {
    'LogisticRegression': models['LogisticRegression'],
    'LinearSVC': models['LinearSVC']
}

#dictionary of the hyperparameters to be tuned for each model
grid_params = {
    'LogisticRegression': [
        {'C': np.logspace(-3,3,7)},
        {'penalty': ['l1','l2']}
    ],
    'LinearSVC': [
        {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
    ]
}

#class used to encapsulate the results of the gridsearch
class GridSearchResult:
    
    def __init__(self, name, score, estimator):
        self.name = name
        self.score = score
        self.estimator = estimator
    
    def __str__(self):
        return '''
        Model: {}
        Best Accuracy Score: {}
        Best Estimator: {}
        '''.format(self.name, self.score, self.estimator)

#generation of training/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size = TTS_VALIDATION_SIZE,
    test_size = TTS_TEST_SIZE,
    random_state = TTS_SEED
)

#GridSearchCV for every candidate classifier
grid_search_results = []
for name, model in candidates.items():
    #creation of the gridsearch
    grd_sr = GridSearchCV(
        estimator = model,
        param_grid = grid_params[name],
        scoring = GRDSR_SCORING,
        cv = 5,
        n_jobs = -1,
        iid = True,
        return_train_score = True
    )
    
    #execution of the gridsearch
    start_time = time()
    print('Grid search started at {}'.format(datetime.now()))
    grd_sr.fit(X_train, y_train)
    print('\nTime taken to complete Grid search of {}: {} seconds'.format(name, time() - start_time))
    grd_sr_result = GridSearchResult(name, grd_sr.best_score_, grd_sr.best_estimator_)
    print(grd_sr_result)
    grid_search_results.append(grd_sr_result)

#Sorting the results by descending order on the score attribute of the GridSearchResult objects
#to get the best candidate with the best parameters
grid_search_results = sorted(grid_search_results, key=lambda result: result.score, reverse=True)
clf = grid_search_results[0].estimator #best candidate with the best parameters

Grid search started at 2019-04-12 15:52:13.070429

Time taken to complete Grid search of LogisticRegression: 31.332935571670532 seconds

        Model: LogisticRegression
        Best Accuracy Score: 0.8666666666666667
        Best Estimator: LogisticRegression(C=1000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)
        
Grid search started at 2019-04-12 15:52:44.404189

Time taken to complete Grid search of LinearSVC: 13.616746187210083 seconds

        Model: LinearSVC
        Best Accuracy Score: 0.8686666666666667
        Best Estimator: LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
        


In [77]:
#PIPELINE CREATION

#creating the pipeline instance
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=preprocess)),
    ('clf', clf)
])

#choosing data and target columns from initial dataset
df_pipeline = df2.iloc[:5000]
X = df_pipeline['Avis']
y = df_pipeline['Score']

#generating the training/test sets from the initial dataset
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size = TTS_VALIDATION_SIZE,
    test_size = TTS_TEST_SIZE,
    random_state = TTS_SEED
)

#learning the model using the pipeline
start_time = time()
print('Pipeline execution started at {}'.format(datetime.now()))
pipeline.fit(X_train, y_train)
print('\nTime taken to complete pipeline execution: {} seconds'.format(time() - start_time))

#predicting the targets of test data
start_time = time()
print('\nPrediction started at {}'.format(datetime.now()))
prediction_result = pipeline.predict(X_test)
print('\nTime taken to complete prediction: {} seconds'.format(time() - start_time))

#printing the accuracy, confusion matrix and classification report
#of the classifier in the pipeline
accuracy = accuracy_score(prediction_result, y_test)
conf = confusion_matrix(y_test, prediction_result)
report = classification_report(y_test, prediction_result)
print('''
Accuracy: {}
Confusion Matrix
{}

Classification Report
{}
'''.format(accuracy, conf, report))

Pipeline execution started at 2019-04-12 17:13:17.557336

Time taken to complete pipeline execution: 15.977086544036865 seconds

Prediction started at 2019-04-12 17:13:33.534615

Time taken to complete prediction: 34.844505071640015 seconds

Accuracy: 0.892
Confusion Matrix
[[1547  190]
 [ 188 1575]]

Classification Report
              precision    recall  f1-score   support

          -1       0.89      0.89      0.89      1737
           1       0.89      0.89      0.89      1763

   micro avg       0.89      0.89      0.89      3500
   macro avg       0.89      0.89      0.89      3500
weighted avg       0.89      0.89      0.89      3500




In [78]:
#PIPELINE SAVING
print('Saving the pipeline containing the trained model')
pickle.dump(pipeline, open(PIPELINE_PATH, 'wb'))

Saving the pipeline containing the trained model


In [92]:
#PIPELINE LOADING

#Loading the pipeline containing the trained classifier
clf_loaded = pickle.load(open(PIPELINE_PATH, 'rb'))

#some test data and real targets
data_df = import_dataset("imdb_reviews.csv", 'IMDB Opinions Dataset', sep='\t', names=['Avis'])
test_target = []
for i in range(2000):
    if i < 1000:
        test_target.append(-1)
    else:
        test_target.append(1)

data_df['Score'] = test_target
data_df = shuffle_dataset(data_df)

#prediction of data
print('Prediction of test data...')
prediction_results = clf_loaded.predict(data_df['Avis'])
# print('Comparison between real and predicted values')
# for i in range(len(prediction_results)):
#     print('''
#     Avis: {}
#     Real: {}
#     Predicted: {}\n
#     '''.format(data_df['Avis'][i], test_target[i], -1))


#comparison between real and predicted values
print('Accuracy: {}'.format(accuracy_score(prediction_results, test_target)))
print('Confusion Matrix: {}'.format(confusion_matrix(test_target, prediction_results)))
print('Classification report:')
print(classification_report(test_target, prediction_results))


IMDB Opinions Dataset
Size : (2000, 1)
Head of imported dataset :


Unnamed: 0,Avis
0,"Firstly, few colleges allow students to take c..."
1,"For years, I've been a big fan of Park's work ..."
2,...now please move on because that's getting o...
3,"This was shown on a premium channel, so I didn..."
4,"Before I start to tear apart this movie, mark ..."


Head of shuffled dataset :


Unnamed: 0,Avis,Score
0,It may interest people to know that this film ...,1
1,"One of the better made for TV biopics, I just ...",1
2,"Writer & director Jay Andrews, a.k.a. Jim Wyno...",-1
3,This movie probably isn't the funniest I've ev...,1
4,Bank heist / Cop thriller sounds OK right?<br ...,-1


Prediction of test data...
Accuracy: 0.469
Confusion Matrix: [[436 564]
 [498 502]]
Classification report:
              precision    recall  f1-score   support

          -1       0.47      0.44      0.45      1000
           1       0.47      0.50      0.49      1000

   micro avg       0.47      0.47      0.47      2000
   macro avg       0.47      0.47      0.47      2000
weighted avg       0.47      0.47      0.47      2000

