# Classification of movie opinions

# Preprocessing

## Group AA
- KACIOUI Arezki
- KHEFFACHE Cherif
- SHIRALI POUR Amir

## Import libs

In [1]:
import re
import nltk
import json
import numpy
import pandas
import warnings
import unicodedata
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# nltk.download("stopwords")
# nltk.download("punkt")
# nltk.download("wordnet")

## Import Classifiers

In [3]:
import sklearn
from sklearn.svm import SVC
from unidecode import unidecode
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

### One time installed tools

In [4]:
# pip install contractions
# pip install treetaggerwrapper
# pip install unidecode
# pip install treetaggerwrapper

### Constants

In [5]:
# Words that appear in NLTK's English stop-word list but must survive stop-word removal
# (presumably because negations and intensity/direction words can carry sentiment — confirm).
exceptionStopWords = ['no', 'not', 'nor', 'down', 'up', 'on', 'too', 'out']
# Effective stop-word set: the NLTK English list minus the exceptions above.
newStopWords = set(stopwords.words('english')) - set(exceptionStopWords)

In [6]:
# Load the comments: tab-separated, no header row, UTF-8 encoded.
movieComments = pandas.read_csv('data/dataset.csv', sep = '\t', header = None, encoding = "utf8")
# Attach the labels read from a second file with the same layout.
# NOTE(review): this assumes labels.csv parses to a single column whose rows line up
# one-to-one with dataset.csv — confirm against the data files.
movieComments['lables'] = pandas.read_csv('data/labels.csv', sep = '\t', header = None, encoding = "utf8")
# Give both columns explicit names (the 'lables' spelling is the column name used at runtime).
movieComments.columns = ['comments','lables']

Preprocessing steps:

1- Remove non-ASCII characters

2- Remove contractions (expand them, e.g. "don't" → "do not")

3- Convert to lowercase

4- Remove punctuation

5- Remove stopwords

6- Remove numbers

7- Lemmatization

## Remove special characters

In [7]:
def removeSpecialCaracters(movieComments):
    """Keep only ASCII letters, newlines and periods in every comment.

    Each comment is first transliterated to ASCII (accented letters lose
    their diacritics, e.g. "é" becomes "e"). Every character that is not an
    ASCII letter, a newline or a period is then replaced by a space, and
    runs of spaces are collapsed into a single space.

    Parameters:
        movieComments: pandas DataFrame with a 'comments' column of strings.
            The column is rewritten in place.

    Returns:
        The same DataFrame object, with cleaned comments.
    """
    def cleanComment(comment):
        # Transliterate BEFORE filtering: if the letter filter ran first, accented
        # letters would already have been turned into spaces and lost.
        asciiText = unicodedata.normalize('NFKD', comment).encode('ascii', 'ignore').decode('ascii')
        # Digits, punctuation (apostrophes included) and symbols become spaces.
        lettersOnly = re.sub(r'[^a-zA-Z\n.]', ' ', asciiText)
        return re.sub(' +', ' ', lettersOnly)

    # Column-wise map instead of iterrows()/.loc: faster, and it does not
    # overwrite several rows at once when index labels are duplicated.
    movieComments['comments'] = movieComments['comments'].map(cleanComment)
    return movieComments

In [8]:
# Step 1: character clean-up on the whole dataset (the frame is modified in place and returned).
movieComments = removeSpecialCaracters(movieComments)

## Remove contractions

In [9]:
def removeContractions(movieComments):
    """Expand contractions and slang in every comment.

    Each comment is passed through ``contractions.fix`` with ``slang=True``.

    Parameters:
        movieComments: pandas DataFrame with a 'comments' column of strings.
            The column is rewritten in place.

    Returns:
        The same DataFrame object, with expanded comments.
    """
    def expand(comment):
        return contractions.fix(comment, slang=True)

    # Column-wise map instead of iterrows()/.loc: faster, and it does not
    # overwrite several rows at once when index labels are duplicated.
    movieComments['comments'] = movieComments['comments'].map(expand)
    return movieComments


In [10]:
# Step 2: expand contractions in every comment (the frame is modified in place and returned).
# NOTE(review): removeSpecialCaracters has already replaced apostrophes with spaces at this
# point, so forms such as "don't" may no longer be recognisable. Consider running this step
# before the character clean-up; verify on real data.
movieComments = removeContractions(movieComments)

## Remove stop words

In [11]:
def removeStopWords(movieComments, stopWords=None):
    """Drop stop words from every comment.

    A word is dropped when its lowercase form is in the stop-word set. The
    words that remain keep their original case and are joined with single
    spaces, with no leading or trailing space.

    Parameters:
        movieComments: pandas DataFrame with a 'comments' column of strings.
            The column is rewritten in place.
        stopWords: optional collection of lowercase words to drop. Defaults
            to the module-level ``newStopWords`` set.

    Returns:
        The same DataFrame object, with stop words removed.
    """
    if stopWords is None:
        stopWords = newStopWords

    def dropStopWords(comment):
        return " ".join(word for word in comment.split() if word.lower() not in stopWords)

    # Column-wise map instead of iterrows()/.loc: faster, and it does not
    # overwrite several rows at once when index labels are duplicated.
    movieComments['comments'] = movieComments['comments'].map(dropStopWords)
    return movieComments

In [12]:
# Step 3: drop stop words; the words listed in exceptionStopWords are kept.
movieComments = removeStopWords(movieComments)

## Normalization

In [13]:
def normilizeComments(movieComments):
    """Lowercase every comment and normalise its whitespace.

    Each comment is split on whitespace, lowercased, and re-joined with
    single spaces, so leading, trailing and repeated whitespace disappears.

    Parameters:
        movieComments: pandas DataFrame with a 'comments' column of strings.
            The column is rewritten in place.

    Returns:
        The same DataFrame object, with normalised comments.
    """
    def normalise(comment):
        return " ".join(comment.lower().split())

    # Column-wise map instead of iterrows()/.loc: faster, and it does not
    # overwrite several rows at once when index labels are duplicated.
    movieComments['comments'] = movieComments['comments'].map(normalise)
    return movieComments

In [14]:
# Step 4: lowercase every comment and collapse its whitespace.
movieComments = normilizeComments(movieComments)

## Tokenization and Lemmatization

In [15]:
def tokenizedText(movieComments):
    """Tokenize every comment and lemmatize each token as a verb.

    Comments are split with NLTK's ``word_tokenize``. Every token is
    lemmatized with ``WordNetLemmatizer`` using ``pos='v'``, and the lemmas
    are joined back with single spaces.

    Parameters:
        movieComments: pandas DataFrame with a 'comments' column of strings.
            The column is rewritten in place.

    Returns:
        The same DataFrame object, with lemmatized comments.
    """
    # One lemmatizer for the whole dataset instead of one per row.
    lemmatizer = WordNetLemmatizer()

    def lemmatizeComment(comment):
        tokens = word_tokenize(comment)
        # Single-space join: no leading space and no doubled spaces between lemmas.
        return " ".join(lemmatizer.lemmatize(token, pos='v') for token in tokens)

    # Column-wise map instead of iterrows()/.loc: faster, and it does not
    # overwrite several rows at once when index labels are duplicated.
    movieComments['comments'] = movieComments['comments'].map(lemmatizeComment)
    return movieComments

In [16]:
# Step 5: tokenize every comment and lemmatize its tokens (as verbs).
movieComments = tokenizedText(movieComments)

## Convert to arff extension

In [17]:
# pip install arff

import arff

# Write the pre-processed frame (all rows, both columns) to an ARFF file:
# the relation is named 'movieComments' and the attributes take the frame's column names.
arff.dump(
    'dataset_tokenized_lemmatized.arff',
    movieComments.values,
    relation='movieComments',
    names=movieComments.columns,
)


## Vectorization

Vectorization with term counts (`CountVectorizer`); `TfidfVectorizer` is imported above but is not used in this step.

In [17]:
def vectorization_full_text(movieComments):
    """Fit a CountVectorizer on all comments and vectorize them.

    Parameters:
        movieComments: pandas DataFrame with a 'comments' column; values are
            converted to ``str`` before vectorization.

    Returns:
        A tuple ``(vectorizer, vectors, full_text)``: the fitted
        CountVectorizer, the matrix it produced for the comments, and the
        list of comment strings it was fitted on.
    """
    corpus = movieComments['comments'].astype(str).tolist()
    countVectorizer = CountVectorizer()
    # Learn the vocabulary and build the count matrix in a single call.
    documentTermCounts = countVectorizer.fit_transform(corpus)
    return countVectorizer, documentTermCounts, corpus

In [18]:
# Fit the vectorizer on every comment; keep the fitted vectorizer, the resulting
# matrix and the list of comment strings for the following steps.
vectorizer, vectors, full_text = vectorization_full_text(movieComments)

In [24]:
# Convert every vocabulary index to a built-in int, in place
# (presumably because the stored values are NumPy integers that json cannot serialise — confirm).
vectorizer.vocabulary_.update(
    {term: int(column) for term, column in vectorizer.vocabulary_.items()}
)

# Persist the term -> column-index mapping as JSON.
with open('vectorizer.vocab_.json', 'w') as output:
    json.dump(vectorizer.vocabulary_, output)

## Next step: perform the classification with scikit-learn