# Sentiment Analysis
Amazon review dataset in which each review is classified as positive or negative  

Dataset : https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv

In [11]:
# import libraries
import pandas as pd
import nltk


from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on (already-present packages
# are reported as up-to-date rather than downloaded again).
for resource in ("vader_lexicon", "stopwords", "punkt", "wordnet", "omw-1.4"):
    nltk.download(resource)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\alperugurcan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alperugurcan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alperugurcan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alperugurcan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\alperugurcan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [12]:
# import dataset
url = "https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv"
local_csv = "amazon.csv"

# Prefer the local copy written by an earlier run so the notebook can be
# re-run without hitting the network every time; only when no local copy
# exists is the file downloaded and cached next to the notebook.
try:
    df = pd.read_csv(local_csv)
except FileNotFoundError:
    df = pd.read_csv(url)
    df.to_csv(local_csv, index=False)


In [13]:
# Check the schema of the loaded frame: column names, dtypes and non-null counts.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reviewText  20000 non-null  object
 1   Positive    20000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


In [14]:
# text cleaning and preprocessing
lemmatizer = WordNetLemmatizer()

# Build the stop-word collection once. Calling stopwords.words("english")
# inside the filter re-evaluated it for every token of every review; a
# frozenset additionally makes each membership test a hash lookup.
english_stopwords = frozenset(stopwords.words("english"))


def clean_and_preprocess_data(text):
    """Lower-case, tokenize, remove English stop words and lemmatize a text.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
        Punctuation tokens are kept as separate tokens.
    """
    # tokenize the lower-cased text
    tokens = word_tokenize(text.lower())

    # drop stop words
    # NOTE(review): any negation words contained in NLTK's stop-word list are
    # dropped here as well, which may affect the later sentiment step - confirm.
    filtered_tokens = [token for token in tokens if token not in english_stopwords]

    # lemmatize
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # join the tokens back into a string
    return " ".join(lemmatized_tokens)


df["reviewText2"] = df["reviewText"].apply(clean_and_preprocess_data)


In [15]:
# Preview the first 10 rows: raw reviewText next to the cleaned reviewText2.
df.head(10)

Unnamed: 0,reviewText,Positive,reviewText2
0,This is a one of the best apps acording to a b...,1,one best apps acording bunch people agree bomb...
1,This is a pretty good version of the game for ...,1,pretty good version game free . lot different ...
2,this is a really cool game. there are a bunch ...,1,really cool game . bunch level find golden egg...
3,"This is a silly game and can be frustrating, b...",1,"silly game frustrating , lot fun definitely re..."
4,This is a terrific game on any pad. Hrs of fun...,1,terrific game pad . hr fun . grandkids love . ...
5,This is a very entertaining game! You don't h...,1,entertaining game ! n't smart play . guess 's ...
6,this is awesome and you don't need wi ti to pl...,1,awesome n't need wi ti play trust . really fun...
7,this is awesome I bet no one even reads the re...,1,awesome bet one even read review know game goo...
8,This is basicly the free version but with ads....,1,basicly free version ad . 's actually awesome ...
9,this is by far the best free app that is avail...,1,far best free app available anywhere . helped ...


In [17]:
# sentiment analysis
analyzer = SentimentIntensityAnalyzer()


def get_sentiments(text, score_key="pos", threshold=0.0):
    """Turn the VADER polarity scores of a text into a binary label.

    Parameters
    ----------
    text : str
        Text to score.
    score_key : str, optional
        Which entry of ``analyzer.polarity_scores(text)`` to threshold:
        "neg", "neu", "pos" or "compound". Defaults to "pos".
    threshold : float, optional
        The text is labelled positive when the selected score is strictly
        greater than this value. Defaults to 0.0.

    Returns
    -------
    int
        1 if the selected score exceeds ``threshold``, otherwise 0.

    Notes
    -----
    With the defaults, a text is labelled positive as soon as its "pos"
    score is non-zero; the "neg" score is never consulted. Pass
    ``score_key="compound"`` to threshold the combined score instead.
    """
    sentiment_scores = analyzer.polarity_scores(text)

    return 1 if sentiment_scores[score_key] > threshold else 0


df["sentiment"] = df["reviewText2"].apply(get_sentiments)

In [18]:
# Spot-check the analyzer on a single hand-written sentence.
text = "I love this product"

scores = analyzer.polarity_scores(text)
print(scores)

{'neg': 0.0, 'neu': 0.323, 'pos': 0.677, 'compound': 0.6369}


In [19]:
# evaluation - test
from sklearn.metrics import confusion_matrix, classification_report

# Rows are the true labels (Positive), columns the predicted labels (sentiment).
y_true = df["Positive"]
y_pred = df["sentiment"]
cm = confusion_matrix(y_true, y_pred)

print(cm)



[[ 1131  3636]
 [  576 14657]]


In [20]:
# Per-class precision, recall and F1 of the predicted labels against the true ones.
cf = classification_report(y_true=df["Positive"], y_pred=df["sentiment"])

print(cf)


              precision    recall  f1-score   support

           0       0.66      0.24      0.35      4767
           1       0.80      0.96      0.87     15233

    accuracy                           0.79     20000
   macro avg       0.73      0.60      0.61     20000
weighted avg       0.77      0.79      0.75     20000

