# Importing Libraries

In [None]:
import pandas as pd
import numpy as np

# DTM (document-term-matrix)
from sklearn.feature_extraction.text import CountVectorizer

# Splitting documents into words (tokenization)
import nltk
# 'punkt' is the tokenizer resource used by older NLTK releases; newer releases
# load the models from 'punkt_tab' instead. Requesting both keeps word_tokenize
# usable regardless of the installed NLTK version. nltk.download returns False
# (it does not raise) when a resource is unknown to the installed version.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource, quiet=True)
from nltk.tokenize import word_tokenize

# Grafice
import matplotlib.pyplot as plt

# WordCloud
from wordcloud import WordCloud

# Citirea Datelor

In [None]:
# Read the prepared reviews CSV directly from its raw GitHub URL.
url = (
    'https://raw.githubusercontent.com/DanielaManate/'
    'SentimentAnalysis-TopicModeling/master/Data/Input/2.input_data_prepped.csv'
)
reviews = pd.read_csv(filepath_or_buffer=url)

# Preview the first two rows.
reviews.head(n=2)

In [None]:
# Dataset dimensions as (rows, columns); len(reviews) would give the row count only.
# len(reviews)
reviews.shape

In [None]:
# Data type of every column in the reviews frame.
reviews.dtypes

In [None]:
# Number of missing values in each column.
reviews.isna().sum()

In [None]:
# reviews[reviews['text_prep'].isnull()]

In [None]:
# Replace missing texts with an empty string BEFORE casting to str.
# A bare astype(str) converts NaN into the literal word 'nan', which would then be
# treated as a real token when the text is vectorized or tokenized.
reviews['text_prep'] = reviews['text_prep'].fillna('').astype(str)

In [None]:
# Recount the missing values per column now that text_prep holds strings.
reviews.isna().sum()

# Document Term Matrix

In [None]:
# Vectorizer whose vocabulary is capped at the 6000 most frequent terms.
vec = CountVectorizer(max_features=6000)

# Fit the vocabulary on the prepared texts and count term occurrences per review.
dtm1 = vec.fit_transform(reviews['text_prep'])

# Expand the matrix into a DataFrame with one column per vocabulary term.
dtm = pd.DataFrame(
    data=dtm1.toarray(),
    columns=vec.get_feature_names_out(),
)

In [None]:
# First two rows of the document-term matrix.
dtm.head(2)

In [None]:
# Prepared text of the review at position 1000, used as a spot check below.
reviews['text_prep'].iloc[1000]

In [None]:
# The word 'never' appears only once in review #1000
dtm['never'].iloc[1000]

In [None]:
# The word 'robbie' is not among the top 6000 words,
# which is why there is no 'robbie' column in dtm
# dtm['robbie'].iloc[1000]

In [None]:
# First two rows of the reviews frame, before the token columns are added.
reviews.head(2)

In [None]:
# Tokenize every prepared text with NLTK, then store the token count per review.
reviews['text_prep_tokens'] = reviews['text_prep'].map(word_tokenize)
reviews['word_len_prep'] = reviews['text_prep_tokens'].map(len)
reviews.head(5)

In [None]:
# Vocabulary terms kept by the vectorizer (capped at 6000 by max_features).
top6000 = vec.get_feature_names_out()
print(top6000.size)

In [None]:
# Create a new column that keeps only the tokens present in the top-6000 vocabulary.
# The vocabulary is copied into a set first: `token in <numpy array>` scans the
# whole array for every single token, while a set lookup takes constant time and
# yields the same result.
top6000_set = set(top6000)
reviews['text_prep_lim_tokens'] = reviews['text_prep_tokens'].apply(
    lambda tokens: [token for token in tokens if token in top6000_set]
)
reviews.head(2)

In [None]:
# Number of tokens left in each review after the vocabulary filter.
reviews['word_len_prep_lim'] = reviews['text_prep_lim_tokens'].map(len)
reviews.head(n=2)

In [None]:
# Join the filtered tokens back into a single space-separated string per review.
reviews['text_prep_lim'] = reviews['text_prep_lim_tokens'].map(' '.join)
reviews.head(n=2)

# WordCloud

In [None]:
# Print the DTM dimensions and display its first row.
print(dtm.shape)
dtm.head(1)

In [None]:
# Total occurrences of each vocabulary term, summed down the rows of the DTM.
word_cloud_frecventa = dtm.sum(axis='index')
word_cloud_frecventa

In [None]:
# Word cloud of at most 100 terms, sized by their frequency across all reviews.
wordcloud = WordCloud(max_words=100, colormap='Blues', background_color='white')
wordcloud.generate_from_frequencies(frequencies=word_cloud_frecventa)

# Render the cloud as an image and hide the axes.
plt.imshow(wordcloud)
plt.axis('off');

# Word Cloud Positive - Top 20

In [None]:
# Shape of the reviews frame; its row count should match the DTM shown next.
reviews.shape

In [None]:
# Shape of the DTM, for comparison with the reviews frame.
dtm.shape

In [None]:
# DTM restricted to the positive reviews (rows where `positive` equals 1).
is_positive = reviews['positive'].eq(1)
dtm_pos = dtm.loc[is_positive].copy()
dtm_pos.shape

In [None]:
# Frequency of each vocabulary term within the positive reviews only.
word_cloud_frecventa_pos = dtm_pos.sum(axis='index')
word_cloud_frecventa_pos

In [None]:
# Word cloud of at most 20 terms, sized by their frequency in the positive reviews.
wordcloud_pos = WordCloud(max_words=20, colormap='Greens', background_color='white')
wordcloud_pos.generate_from_frequencies(frequencies=word_cloud_frecventa_pos)

# Render the cloud as an image and hide the axes.
plt.imshow(wordcloud_pos)
plt.axis('off');

In [None]:
# Words kept in the positive-review cloud together with their frequency weights.
wordcloud_pos.words_

Tema 1 (10% din nota finala)
*   Creati un wordcloud pentru top 20 de cuvinte din recenziile negative
*   File - Download - Download .py
*   Deadline: 12 Mar



# Frecventa Cuvintelor

In [None]:
# Check what kind of object holds the term frequencies before converting it.
type(word_cloud_frecventa)

In [None]:
# Turn the frequency Series into a two-column frame: the term (moved out of the
# index into the 'index' column) and its count in 'Frecventa'.
frecventa_cuvinte = (
    word_cloud_frecventa
    .to_frame(name='Frecventa')
    .reset_index()
)
frecventa_cuvinte.head(n=2)

In [None]:
# Ten most frequent terms: sort by count, highest first, and keep the first 10 rows.
frecventa_sorted = frecventa_cuvinte.sort_values(by='Frecventa', ascending=False)
top10 = frecventa_sorted.head(10)
top10

In [None]:
# Horizontal bar chart of the ten most frequent terms, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.barh(y=top10['index'], width=top10['Frecventa'])
ax.set_title('Top 10 Cuvinte din toate recenziile')
ax.set_xlabel('Frecventa')
ax.set_ylabel('Cuvant')
plt.show()

# Lungimea cuvantului

In [None]:
# Character length of every vocabulary term.
frecventa_cuvinte['lungime_cuvant'] = frecventa_cuvinte['index'].map(len)
frecventa_cuvinte.head(5)

In [None]:
# Summary statistics of the term lengths.
frecventa_cuvinte['lungime_cuvant'].describe()

In [None]:
# Terms that are at least 15 characters long.
is_long_word = frecventa_cuvinte['lungime_cuvant'].ge(15)
frecventa_cuvinte.loc[is_long_word]

In [None]:
# Summary statistics of the term frequencies.
frecventa_cuvinte['Frecventa'].describe()

In [None]:
# Terms that occur at least 5000 times across all reviews.
is_frequent_word = frecventa_cuvinte['Frecventa'].ge(5000)
frecventa_cuvinte.loc[is_frequent_word]

# Diversitate Lexicala: nr. de cuvinte unice pentru fiecare document

In [None]:
# First row of the DTM, as a reminder of its layout before the per-review counts.
dtm.head(1)

In [None]:
# Lexical diversity: for each review, count the vocabulary terms that occur at
# least once (non-zero cells in that DTM row).
unique_term_counts = np.count_nonzero(dtm, axis=1)
lexical = pd.DataFrame({'Cuvinte Unice': unique_term_counts})
lexical.head(n=2)

In [None]:
# Total number of vocabulary-term occurrences per review (row sums of the DTM).
lexical['Cuvinte Totale'] = dtm.sum(axis='columns')
lexical.head(n=2)

In [None]:
# Vocabulary-limited text of the review at position 1, to compare by eye with its counts.
reviews['text_prep_lim'].iloc[1]

In [None]:
# Summary statistics of the per-review word counts.
lexical.describe()

In [None]:
# Attach the sentiment label from the reviews frame so the counts can be compared by class.
lexical = lexical.assign(positive=reviews['positive'])
lexical.head(n=2)

In [None]:
# Average of every lexical column, computed separately for each `positive` value.
lexical.groupby(by='positive').agg('mean')

# Densitate Lexicala: nr. de cuvinte unice / totale

In [None]:
# Lexical density: unique words divided by total words, per review.
lexical['densitate'] = lexical['Cuvinte Unice'].div(lexical['Cuvinte Totale'])
lexical.head(n=2)

In [None]:
# Per-class averages again, now including the density column.
lexical.groupby(by='positive').agg('mean')