# ETL Reviews   
Importamos las librerías que vamos a utilizar: en este caso, pandas y nltk para realizar el análisis de sentimiento.

In [1]:
import pandas as pd
import nltk 
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [3]:
# Load the CSV produced by the EDA notebook and preview the first rows to check that it was read correctly.
reviews = pd.read_csv('user_reviews.csv')
reviews.head()

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250.0,No ratings yet,True,Simple yet with great replayability. In my opi...
1,js41637,http://steamcommunity.com/id/js41637,,"Posted July 15, 2011.",,22200.0,No ratings yet,True,It's unique and worth a playthrough.
2,evcentric,http://steamcommunity.com/id/evcentric,,"Posted April 21, 2011.",,43110.0,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,doctr,http://steamcommunity.com/id/doctr,,"Posted June 24, 2014.",,251610.0,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,maplemage,http://steamcommunity.com/id/maplemage,,"Posted September 8, 2013.",,227300.0,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...


In [4]:
# Inspect the column dtypes and non-null counts to see where values are missing.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25800 entries, 0 to 25799
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   user_id      25800 non-null  object 
 1   user_url     25799 non-null  object 
 2   funny        4115 non-null   object 
 3   posted       25799 non-null  object 
 4   last_edited  2997 non-null   object 
 5   item_id      25799 non-null  float64
 6   helpful      25799 non-null  object 
 7   recommend    25799 non-null  object 
 8   review       25787 non-null  object 
dtypes: float64(1), object(8)
memory usage: 1.8+ MB


In [5]:
# Before the sentiment analysis, drop the columns that will not be used.
# NOTE: this cell reassigns `reviews`, so re-running it without reloading the CSV
# raises KeyError because the columns are already gone.
reviews = reviews.drop(columns=['user_url', 'funny', 'last_edited', 'helpful'])

In [6]:
# Preview the DataFrame after dropping the unused columns.
reviews.head()

Unnamed: 0,user_id,posted,item_id,recommend,review
0,76561197970982479,"Posted November 5, 2011.",1250.0,True,Simple yet with great replayability. In my opi...
1,js41637,"Posted July 15, 2011.",22200.0,True,It's unique and worth a playthrough.
2,evcentric,"Posted April 21, 2011.",43110.0,True,Great atmosphere. The gunplay can be a bit chu...
3,doctr,"Posted June 24, 2014.",251610.0,True,I know what you think when you see this title ...
4,maplemage,"Posted September 8, 2013.",227300.0,True,For a simple (it's actually not all that simpl...


In [7]:
# Start of the sentiment analysis: download the VADER lexicon
# (this fetches the 'vader_lexicon' data package, not nltk itself).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/brunomangione/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [8]:
# Create the SentimentIntensityAnalyzer instance that assign_sentiment reads as a notebook-level global.
sia = SentimentIntensityAnalyzer()

In [9]:
# Map a review's text to a sentiment label: 0 = negative, 1 = neutral, 2 = positive.
def assign_sentiment(text, analyzer=None, threshold=0.05):
    """Classify a review as negative (0), neutral (1) or positive (2).

    The label is derived from the VADER ``compound`` score using the
    thresholds documented by VADER: ``compound >= threshold`` is positive,
    ``compound <= -threshold`` is negative, anything in between is neutral.

    Parameters
    ----------
    text : object
        The review text. Any non-string value (e.g. NaN for a missing
        review) is labelled neutral without calling the analyzer.
    analyzer : object, optional
        Object exposing ``polarity_scores(text)`` that returns a mapping
        with a ``'compound'`` key. Defaults to the notebook-level ``sia``.
    threshold : float, optional
        Absolute compound-score cutoff separating neutral from
        positive/negative. Defaults to 0.05.

    Returns
    -------
    int
        0 (negative), 1 (neutral) or 2 (positive).
    """
    if not isinstance(text, str):
        return 1  # Missing / non-text review -> neutral

    # Fall back to the notebook-level analyzer when none is injected.
    scorer = sia if analyzer is None else analyzer
    compound_score = scorer.polarity_scores(text)['compound']

    # Inclusive bounds: a score of exactly +/-threshold is not neutral.
    if compound_score >= threshold:
        return 2  # Positive
    if compound_score <= -threshold:
        return 0  # Negative
    return 1  # Neutral

In [10]:
# Apply the sentiment function to the review column and store the label in a new column.
reviews['sentiment_analysis'] = reviews['review'].apply(assign_sentiment)

In [11]:
# Preview the DataFrame with the new sentiment_analysis column.
reviews.head()  

Unnamed: 0,user_id,posted,item_id,recommend,review,sentiment_analysis
0,76561197970982479,"Posted November 5, 2011.",1250.0,True,Simple yet with great replayability. In my opi...,2
1,js41637,"Posted July 15, 2011.",22200.0,True,It's unique and worth a playthrough.,2
2,evcentric,"Posted April 21, 2011.",43110.0,True,Great atmosphere. The gunplay can be a bit chu...,2
3,doctr,"Posted June 24, 2014.",251610.0,True,I know what you think when you see this title ...,2
4,maplemage,"Posted September 8, 2013.",227300.0,True,For a simple (it's actually not all that simpl...,2


In [12]:
# The year is needed for later work, so pull the first four-digit run out of
# `posted` into a new `year` column (kept as text; NaN where nothing matches).
# NOTE(review): the info() output further down shows 20366 non-null years against
# 25799 non-null `posted` values, and the later dropna() discards the rows without
# a year - confirm that losing those reviews is intended.
posted_text = reviews['posted'].str
reviews['year'] = posted_text.extract(r'(\d{4})', expand=False)

In [13]:
# Preview the DataFrame with the new year column.
reviews.head()

Unnamed: 0,user_id,posted,item_id,recommend,review,sentiment_analysis,year
0,76561197970982479,"Posted November 5, 2011.",1250.0,True,Simple yet with great replayability. In my opi...,2,2011
1,js41637,"Posted July 15, 2011.",22200.0,True,It's unique and worth a playthrough.,2,2011
2,evcentric,"Posted April 21, 2011.",43110.0,True,Great atmosphere. The gunplay can be a bit chu...,2,2011
3,doctr,"Posted June 24, 2014.",251610.0,True,I know what you think when you see this title ...,2,2014
4,maplemage,"Posted September 8, 2013.",227300.0,True,For a simple (it's actually not all that simpl...,2,2013


In [14]:
# Drop the posted column now that the year has been extracted from it.
reviews = reviews.drop(columns='posted')

In [15]:
# Preview the DataFrame after removing the posted column.
reviews.head()

Unnamed: 0,user_id,item_id,recommend,review,sentiment_analysis,year
0,76561197970982479,1250.0,True,Simple yet with great replayability. In my opi...,2,2011
1,js41637,22200.0,True,It's unique and worth a playthrough.,2,2011
2,evcentric,43110.0,True,Great atmosphere. The gunplay can be a bit chu...,2,2011
3,doctr,251610.0,True,I know what you think when you see this title ...,2,2014
4,maplemage,227300.0,True,For a simple (it's actually not all that simpl...,2,2013


In [16]:
# Check dtypes and non-null counts again after the transformations.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25800 entries, 0 to 25799
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             25800 non-null  object 
 1   item_id             25799 non-null  float64
 2   recommend           25799 non-null  object 
 3   review              25787 non-null  object 
 4   sentiment_analysis  25800 non-null  int64  
 5   year                20366 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 1.2+ MB


In [20]:
# Drop every row that has a null value in any column.
# NOTE(review): this also removes rows whose review is null, even though
# assign_sentiment already labelled those rows as neutral (1) - confirm this is intended.
reviews = reviews.dropna()

In [21]:
# Confirm that no null values remain after dropna.
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20356 entries, 0 to 25799
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             20356 non-null  object 
 1   item_id             20356 non-null  float64
 2   recommend           20356 non-null  object 
 3   review              20356 non-null  object 
 4   sentiment_analysis  20356 non-null  int64  
 5   year                20356 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 1.1+ MB


In [22]:
# With the DataFrame cleaned, export the final CSV (without the index column) for later use.
reviews.to_csv('reviews_final.csv', index=False)