# NLTK Tutorial  - Rule-based sentiment analysis
From: https://www.datacamp.com/tutorial/text-analytics-beginners-nltk

In [42]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
#nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/biancacaissottidichiusano/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/biancacaissottidichiusano/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/biancacaissottidichiusano/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/biancacaissottidichiusano/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/biancacaissottidichiusano/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nlt

OSError: [Errno 28] No space left on device

In [10]:
#nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/biancacaissottidichiusano/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
#nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/biancacaissottidichiusano/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
#nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/biancacaissottidichiusano/nltk_data...


True

In [24]:
#nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/biancacaissottidichiusano/nltk_data...


True

## Preprocessing Text

(Cleaning Data)
- Identify noise
- Noise removal- by removing stop words (and, the, of, it)
- Character normalization
- Data Masking

(Linguistic Processing)

- Tokenization - breaking down text into individual words
- POS tagging
- Lemmatization - involves reducing words to their base form based on their part of the speech
- Stemming - removing suffixes ("ing", "ed")
- Named-entity recognition

In [7]:
df = pd.read_csv('https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv')

df

Unnamed: 0,reviewText,Positive
0,This is a one of the best apps acording to a b...,1
1,This is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1
...,...,...
19995,this app is fricken stupid.it froze on the kin...,0
19996,Please add me!!!!! I need neighbors! Ginger101...,1
19997,love it! this game. is awesome. wish it had m...,1
19998,I love love love this app on my side of fashio...,1


In [43]:
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

In [22]:
df['reviewText'] = df['reviewText'].apply(preprocess_text)
df

Unnamed: 0,reviewText,Positive
0,one best apps acording bunch people agree bomb...,1
1,pretty good version game free . lot different ...,1
2,really cool game . bunch level find golden egg...,1
3,"silly game frustrating , lot fun definitely re...",1
4,terrific game pad . hr fun . grandkids love . ...,1
...,...,...
19995,app fricken stupid.it froze kindle wont allow ...,0
19996,please add ! ! ! ! ! need neighbor ! ginger101...,1
19997,love ! game . awesome . wish free stuff house ...,1
19998,love love love app side fashion story fight wo...,1


In [44]:
# initialize NLTK sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# create get_sentiment function
def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    sentiment = 1 if scores['pos'] > 0 else 0
    return sentiment

# apply get_sentiment function
df['sentiment'] = df['reviewText'].apply(get_sentiment)
df

Unnamed: 0,reviewText,Positive,sentiment
0,one best apps acording bunch people agree bomb...,1,1
1,pretty good version game free . lot different ...,1,1
2,really cool game . bunch level find golden egg...,1,1
3,"silly game frustrating , lot fun definitely re...",1,1
4,terrific game pad . hr fun . grandkids love . ...,1,1
...,...,...,...
19995,app fricken stupid.it froze kindle wont allow ...,0,0
19996,please add ! ! ! ! ! need neighbor ! ginger101...,1,1
19997,love ! game . awesome . wish free stuff house ...,1,1
19998,love love love app side fashion story fight wo...,1,1


In [30]:
print(confusion_matrix(df['Positive'], df['sentiment']))

[[ 1131  3636]
 [  576 14657]]


In [31]:
# Overall accuracy of this model is 79%  ML based approach will result in better accuracy
print(classification_report(df['Positive'], df['sentiment']))

              precision    recall  f1-score   support

           0       0.66      0.24      0.35      4767
           1       0.80      0.96      0.87     15233

    accuracy                           0.79     20000
   macro avg       0.73      0.60      0.61     20000
weighted avg       0.77      0.79      0.75     20000



# Bag of Words (BoW) Model (For feature extraction)

scikit learn for bow
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [45]:
import pypandoc

#Transform docx into txt:
#docxFilename = 'CASE OF LOUKILI v. THE NETHERLANDS.docx'
#output = pypandoc.convert_file(docxFilename, 'plain', outputfile="trialCase.txt")
#assert output == ""

In [46]:
# Clean the data

# I don't know if I should put everything in a csv file
# Normalise the text - make everything lowercase 
file = pandas.read_csv('trialCase.txt', sep='\n')
file.head()

Unnamed: 0,THIRD SECTION
0,CASE OF LOUKILI v. THE NETHERLANDS
1,(Application no. 57766/19)
2,JUDGMENT
3,Art 8 • Expulsion • Family life • Revocation o...
4,long-term settled migrant of Moroccan national...


### ? Lemmatization or stemming?

In [47]:
# Applying Normalisation, Noise removal and lemmatization
file['THIRD SECTION'] = file['THIRD SECTION'].apply(preprocess_text)
file

Unnamed: 0,THIRD SECTION
0,case loukili v. netherlands
1,( application . 57766/19 )
2,judgment
3,art 8 • expulsion • family life • revocation r...
4,long-term settled migrant moroccan nationality...
...,...
954,ban would violation article 8 convention .
955,"done english , notified writing 11 april 2023 ..."
956,rule 77 §§ 2 3 rule court .
957,milan blaško pere pastor vilanova


In [52]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
    ]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()

AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names_out'

# Machine Learning-based model
These models can be trained using a wide range of ML algorithms, including decisions trees, support vector machines (SVMs), and neural networks.

ML-based approaches can be more accurate than rule-based analysis especially when dealing with more complex text data, but require a larger amount oflabeled training data and may be more computationally expensive.

## Ideas
For sentiment Analysis:
I could do a csv file with line by line (sentence split) of the judgment part and do a sentiment analysis 