# Importing Libraries

In [None]:
pip install contractions

In [None]:
# Standard library
import re  # regular expressions used for text clean-up

# Third-party
import contractions  # expands English contractions (e.g. "don't" -> "do not")
import matplotlib.pyplot as plt  # plotting
import nltk  # Natural Language Toolkit
import numpy as np
import pandas as pd
import spacy  # used for lemmatization (plural / derived forms -> base word)
from nltk.corpus import stopwords  # common words (stopwords)
from nltk.tokenize import word_tokenize  # splitting a document into words

# NLTK data files are fetched at runtime.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer tables from 'punkt_tab' instead of
# the pickled 'punkt' models; fetching both keeps word_tokenize working on
# old and new versions alike.
nltk.download('punkt_tab')

# English spaCy pipeline; assumes the 'en_core_web_sm' model is installed.
nlp = spacy.load('en_core_web_sm')

# Reading Data

In [None]:
# Location of the input CSV with the raw reviews (hosted on GitHub)
url = 'https://raw.githubusercontent.com/DanielaManate/SentimentAnalysis-TopicModeling/master/Data/Input/1.input_data.csv'
# Load the CSV straight from the URL into a DataFrame
reviews = pd.read_csv(url)
# Preview the first rows
reviews.head()

In [None]:
# Data type of every column
reviews.dtypes

In [None]:
# Summary statistics for all columns, not only the numeric ones
reviews.describe(include='all')

In [None]:
# Number of reviews for each rating value
reviews['rating'].value_counts()

In [None]:
# Same counts, ordered by rating value instead of by frequency
reviews['rating'].value_counts().sort_index()

In [None]:
# Bar chart of the rating distribution
reviews['rating'].value_counts().sort_index().plot(kind='bar')

In [None]:
# Same bar chart, keeping the object returned by .plot() so that a title and
# axis labels can be set on it
fig = reviews['rating'].value_counts().sort_index().plot(kind='bar')

fig.set_title('Frecventa rating-urilor')
fig.set_xlabel('Rating')
fig.set_ylabel('Frecventa')

In [None]:
# Cast the rating column to integer, printing the dtypes before and after
print(reviews.dtypes)
reviews['rating'] = reviews['rating'].astype(int)
print(reviews.dtypes)

In [None]:
# First two rows
reviews.head(2)

In [None]:
# Cell at row position 0, column position 1
# (presumably the 'text' column - verify against the CSV layout)
reviews.iloc[0, 1]

In [None]:
# Text of the first review, selected by column name
reviews['text'].iloc[0]

In [None]:
# Cell at row position 300, column position 1
reviews.iloc[300, 1]

In [None]:
# Row at position 300, returned as a one-row DataFrame
reviews[300:301]

In [None]:
# Number of characters in review 301 (row position 300)
len(reviews.iloc[300, 1])

In [None]:
# New column holding the length (in characters) of each review text
reviews['nr_caractere'] = reviews['text'].apply(len)
reviews.head()

In [None]:
# Summary statistics of the review lengths
reviews['nr_caractere'].describe()

In [None]:
# DataFrame with all the reviews that have rating 3
reviews[reviews['rating']==3]

In [None]:
# Row count before the filter
print(len(reviews))
# Drop the reviews with rating == 3 and rebuild a clean 0..n-1 index
reviews = reviews[reviews['rating']!=3].reset_index(drop=True)
# Row count after the filter
print(len(reviews))

In [None]:
# Add the `positive` column: 1 when the rating is > 3 (4, 5),
# 0 when the rating is < 3 (1, 2); rating 3 was removed above
reviews['positive'] = np.where(reviews['rating']>3, 1, 0)
reviews.tail(6)

In [None]:
# Summary statistics of the binary label
reviews['positive'].describe()

In [None]:
# Number of positive and negative reviews
print('Nr. de recenzii pozitive', len(reviews[reviews['positive']==1]))
print('Nr. de recenzii negative', len(reviews[reviews['positive']==0]))

In [None]:
# The same class counts obtained with value_counts
reviews['positive'].value_counts()

## Lists

In [None]:
# Two small sample lists for the list-comprehension demo below
list1 = [1, 2, 3]
list2 = [1, 5, 6]

In [None]:
# All the elements of list1 that are not in list2
[element for element in list1 if element not in list2]

# Text Normalization
*   vom converti textul in text scris cu litera mica
*   vom folosi contractions (don't -> do not)
*   vom imparti documentul in cuvinte (tokenization)
*   vom inlatura cuvintele comune (stopwords: the, a, to, etc.)
*   vom reduce cuvantul la lema (lemmatization)







In [None]:
# Use the first review as the running example
recenzie_test = reviews['text'].iloc[0]
recenzie_test

In [None]:
# Convert the text to lowercase
text_procesat = recenzie_test.lower()
text_procesat

In [None]:
# Contractions: demo on a sample string (the result is only displayed,
# text_procesat is not modified here)
contractions.fix("can't won't don't cannot doesn't")

In [None]:
# Split the review (document) into words
text_procesat_2 = nltk.word_tokenize(text_procesat)
text_procesat_2[0:5]

In [None]:
# Build a list with the common words (English stopwords)
cuv_comune = stopwords.words('english')
print(len(cuv_comune))
cuv_comune[0:10]

In [None]:
# Remove the common words from text_procesat_2,
# printing the token count before and after
print(len(text_procesat_2))
text_procesat_3 = [element for element in text_procesat_2 if element not in cuv_comune]
print(len(text_procesat_3))
text_procesat_3[0:5]

In [None]:
# Turn the list back into a sentence
text_procesat_4 = " ".join(text_procesat_3)
text_procesat_4

In [None]:
# Reduce every word to its lemma using the spaCy pipeline
text_procesat_5 = " ".join([word.lemma_ for word in nlp(text_procesat_4)])
text_procesat_5

In [None]:
# Preview the frame again before defining the full preprocessing function
reviews.head()

In [None]:
# Patterns are compiled once because the function runs on every review.
# ^ inside [] means NOT.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')   # anything that is not word char / whitespace
_NON_LETTER_RE = re.compile(r'[^a-z\s]')   # anything that is not a-z / whitespace

# English stopwords, loaded lazily on first use and then reused.
_cuv_comune_cache = None


def _cuvinte_comune():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    global _cuv_comune_cache
    if _cuv_comune_cache is None:
        _cuv_comune_cache = frozenset(stopwords.words('english'))
    return _cuv_comune_cache


def preprocesare_text(document):
    """Normalize one review text for downstream analysis.

    Steps, in order: lowercase, expand contractions, replace punctuation
    with spaces, drop every remaining character that is not a-z or
    whitespace, tokenize, remove English stopwords, and lemmatize with the
    spaCy pipeline ``nlp``.

    Args:
        document: The raw review text (a ``str``).

    Returns:
        The processed text as a single string of space-separated lemmas.
    """
    # Convert the text to lowercase
    document = document.lower()
    # Replace the short forms (contractions)
    document = contractions.fix(document)
    # Punctuation becomes a space so the words around it stay separated
    document = _PUNCTUATION_RE.sub(' ', document)
    # Drop digits, underscores and non a-z letters. Whitespace (including
    # newlines and tabs) is kept: removing it would glue together words that
    # were separated only by a line break.
    document = _NON_LETTER_RE.sub('', document)
    # Split the document into words (tokenization)
    tokens = word_tokenize(document)
    # Remove the common words (stopwords); set membership is O(1) per token
    cuv_comune = _cuvinte_comune()
    tokens = [token for token in tokens if token not in cuv_comune]
    # Convert the list of words back into a string
    document = " ".join(tokens)
    # Reduce every word to its lemma (lemmatization)
    return " ".join(word.lemma_ for word in nlp(document))

In [None]:
# Show the sample review before and after preprocessing
print('Recenzia Initiala:', recenzie_test)
print('')
print('Recenzia Procesata:', preprocesare_text(recenzie_test))

In [None]:
# First two rows before the processed column is added
reviews.head(2)

In [None]:
# Preprocess every review; each row goes through the full pipeline
# (including one spaCy call), so this cell can take a while
reviews['text_prep'] = reviews['text'].apply(preprocesare_text)
reviews.head(2)

In [None]:
# Save the frame with the added columns to CSV, without the index column
reviews.to_csv('recenzii_procesate.csv', index=False)