In [1]:
import pandas as pd
import re
import nltk
import pyarrow as pa
import pyarrow.parquet as pq
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Procesamiento de Lenguaje Natural - Reviews

## Carga de archivo

In [2]:
reviews = pd.read_parquet('reviews.parquet')

In [3]:
reviews

Unnamed: 0,user_id,item_id,recommend,review,item_name
0,76561197970982479,1250.0,True,Simple yet with great replayability. In my opi...,Killing Floor
1,76561197970982479,22200.0,True,It's unique and worth a playthrough.,Zeno Clash
2,76561197970982479,43110.0,True,Great atmosphere. The gunplay can be a bit chu...,Metro 2033
3,js41637,251610.0,True,I know what you think when you see this title ...,Barbie™ Dreamhouse Party™
4,js41637,227300.0,True,For a simple (it's actually not all that simpl...,Euro Truck Simulator 2
...,...,...,...,...,...
58426,76561198312638244,70.0,True,a must have classic from steam definitely wort...,Half-Life
58427,76561198312638244,362890.0,True,this game is a perfect remake of the original ...,Black Mesa
58428,LydiaMorley,273110.0,True,had so much fun plaing this and collecting res...,Counter-Strike Nexon: Zombies
58429,LydiaMorley,730.0,True,:D,Counter-Strike: Global Offensive


In [4]:
reviews_nlp = reviews.copy()

In [5]:
reviews_nlp['review']

0        Simple yet with great replayability. In my opi...
1                     It's unique and worth a playthrough.
2        Great atmosphere. The gunplay can be a bit chu...
3        I know what you think when you see this title ...
4        For a simple (it's actually not all that simpl...
                               ...                        
58426    a must have classic from steam definitely wort...
58427    this game is a perfect remake of the original ...
58428    had so much fun plaing this and collecting res...
58429                                                   :D
58430                                       so much fun :D
Name: review, Length: 58431, dtype: object

## Oraciones

### Normalizacion y Tokenizacion

In [6]:
reviews_nlp['sent_norm'] = reviews_nlp['review'].apply(lambda x: re.sub(r"[^0-9a-zA-Z!:.']", " ", str(x)))
reviews_nlp['sent_norm']

0        Simple yet with great replayability. In my opi...
1                     It's unique and worth a playthrough.
2        Great atmosphere. The gunplay can be a bit chu...
3        I know what you think when you see this title ...
4        For a simple  it's actually not all that simpl...
                               ...                        
58426    a must have classic from steam definitely wort...
58427    this game is a perfect remake of the original ...
58428    had so much fun plaing this and collecting res...
58429                                                   :D
58430                                       so much fun :D
Name: sent_norm, Length: 58431, dtype: object

In [7]:
reviews_nlp['sent_tok'] = reviews_nlp['sent_norm'].apply(lambda x: nltk.tokenize.sent_tokenize(str(x), language='english'))
reviews_nlp['sent_tok']

0        [Simple yet with great replayability., In my o...
1                   [It's unique and worth a playthrough.]
2        [Great atmosphere., The gunplay can be a bit c...
3        [I know what you think when you see this title...
4        [For a simple  it's actually not all that simp...
                               ...                        
58426    [a must have classic from steam definitely wor...
58427    [this game is a perfect remake of the original...
58428    [had so much fun plaing this and collecting re...
58429                                                 [:D]
58430                                     [so much fun :D]
Name: sent_tok, Length: 58431, dtype: object

### Modelo de Sentiment Analysis

In [8]:
def eval (col):
    sid = SentimentIntensityAnalyzer()    
    eval_tot = []
    for sentences in col:
        val = []
        for sentence in sentences:
            ss = sid.polarity_scores(sentence)
            val.append(ss)
        eval_tot.append(val)
    return eval_tot

In [9]:
eval_tot = eval(reviews_nlp['sent_tok'])
eval_tot

[[{'neg': 0.0, 'neu': 0.494, 'pos': 0.506, 'compound': 0.6249},
  {'neg': 0.185, 'neu': 0.69, 'pos': 0.125, 'compound': -0.34},
  {'neg': 0.0, 'neu': 0.769, 'pos': 0.231, 'compound': 0.5106},
  {'neg': 0.0, 'neu': 0.686, 'pos': 0.314, 'compound': 0.4939}],
 [{'neg': 0.0, 'neu': 0.678, 'pos': 0.322, 'compound': 0.2263}],
 [{'neg': 0.0, 'neu': 0.196, 'pos': 0.804, 'compound': 0.6249},
  {'neg': 0.0, 'neu': 0.743, 'pos': 0.257, 'compound': 0.8762}],
 [{'neg': 0.0, 'neu': 0.749, 'pos': 0.251, 'compound': 0.794},
  {'neg': 0.0, 'neu': 0.873, 'pos': 0.127, 'compound': 0.4404},
  {'neg': 0.121, 'neu': 0.705, 'pos': 0.175, 'compound': 0.25}],
 [{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
  {'neg': 0.0, 'neu': 0.531, 'pos': 0.469, 'compound': 0.7939},
  {'neg': 0.0, 'neu': 0.87, 'pos': 0.13, 'compound': 0.4215},
  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
  {'neg': 0.213, 'neu': 0.703, 'pos': 0.084, 'compound': -0.5106},
  {'neg': 0.0, 'neu': 0.615, 'pos': 0.385, 'compou

In [10]:
eva = pd.DataFrame({'eval':eval_tot})
eva

Unnamed: 0,eval
0,"[{'neg': 0.0, 'neu': 0.494, 'pos': 0.506, 'com..."
1,"[{'neg': 0.0, 'neu': 0.678, 'pos': 0.322, 'com..."
2,"[{'neg': 0.0, 'neu': 0.196, 'pos': 0.804, 'com..."
3,"[{'neg': 0.0, 'neu': 0.749, 'pos': 0.251, 'com..."
4,"[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compoun..."
...,...
58426,"[{'neg': 0.0, 'neu': 0.566, 'pos': 0.434, 'com..."
58427,"[{'neg': 0.0, 'neu': 0.571, 'pos': 0.429, 'com..."
58428,"[{'neg': 0.141, 'neu': 0.502, 'pos': 0.357, 'c..."
58429,"[{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compoun..."


In [11]:
reviews_nlp=reviews_nlp.join(eva)
reviews_nlp = reviews_nlp.explode('eval')

In [12]:
reviews_nlp.head()

Unnamed: 0,user_id,item_id,recommend,review,item_name,sent_norm,sent_tok,eval
0,76561197970982479,1250.0,True,Simple yet with great replayability. In my opi...,Killing Floor,Simple yet with great replayability. In my opi...,"[Simple yet with great replayability., In my o...","{'neg': 0.0, 'neu': 0.494, 'pos': 0.506, 'comp..."
0,76561197970982479,1250.0,True,Simple yet with great replayability. In my opi...,Killing Floor,Simple yet with great replayability. In my opi...,"[Simple yet with great replayability., In my o...","{'neg': 0.185, 'neu': 0.69, 'pos': 0.125, 'com..."
0,76561197970982479,1250.0,True,Simple yet with great replayability. In my opi...,Killing Floor,Simple yet with great replayability. In my opi...,"[Simple yet with great replayability., In my o...","{'neg': 0.0, 'neu': 0.769, 'pos': 0.231, 'comp..."
0,76561197970982479,1250.0,True,Simple yet with great replayability. In my opi...,Killing Floor,Simple yet with great replayability. In my opi...,"[Simple yet with great replayability., In my o...","{'neg': 0.0, 'neu': 0.686, 'pos': 0.314, 'comp..."
1,76561197970982479,22200.0,True,It's unique and worth a playthrough.,Zeno Clash,It's unique and worth a playthrough.,[It's unique and worth a playthrough.],"{'neg': 0.0, 'neu': 0.678, 'pos': 0.322, 'comp..."


In [13]:
eval_expanded = pd.json_normalize(reviews_nlp['eval']).set_index(reviews_nlp.index)
eval_expanded.head()

Unnamed: 0,neg,neu,pos,compound
0,0.0,0.494,0.506,0.6249
0,0.185,0.69,0.125,-0.34
0,0.0,0.769,0.231,0.5106
0,0.0,0.686,0.314,0.4939
1,0.0,0.678,0.322,0.2263


In [14]:
reviews_nlp=reviews_nlp.join(eval_expanded)
reviews_nlp.drop(columns=['compound','eval','sent_norm', 'sent_tok'],inplace=True)
reviews_nlp.head()

Unnamed: 0,user_id,item_id,recommend,review,item_name,neg,neu,pos
0,76561197970982479,1250.0,True,Simple yet with great replayability. In my opi...,Killing Floor,0.0,0.494,0.506
0,76561197970982479,1250.0,True,Simple yet with great replayability. In my opi...,Killing Floor,0.185,0.69,0.125
0,76561197970982479,1250.0,True,Simple yet with great replayability. In my opi...,Killing Floor,0.0,0.769,0.231
0,76561197970982479,1250.0,True,Simple yet with great replayability. In my opi...,Killing Floor,0.0,0.686,0.314
0,76561197970982479,1250.0,True,Simple yet with great replayability. In my opi...,Killing Floor,0.0,0.494,0.506


In [15]:
reviews_nlp['original_index'] = reviews_nlp.index
reviews_nlp = reviews_nlp.groupby(['user_id', 'item_id', 'recommend', 'review','item_name']).agg({'neg': 'sum','neu': 'sum','pos': 'sum','original_index':'min'}).reset_index()
reviews_nlp.head()

Unnamed: 0,user_id,item_id,recommend,review,item_name,neg,neu,pos,original_index
0,--000--,1250.0,True,หนุกคับ แนะนำ 10/10,Killing Floor,0.0,1.0,0.0,20667
1,--ace--,440.0,True,the best game i ever plllayed,Team Fortress 2,0.0,0.488,0.512,57334
2,--ace--,113200.0,True,One Of The Funnyest Games That Is Animated :) ...,The Binding of Isaac,0.0,0.744,0.256,57335
3,--ionex--,730.0,True,"it done brokeded on me, the game no longer wor...",Counter-Strike: Global Offensive,0.182,3.4,0.418,31482
4,--ionex--,105600.0,True,"It's an amazing game, and im glad that mac use...",Terraria,0.0,0.794,0.206,31483


In [16]:
reviews_nlp=reviews_nlp.set_index('original_index')
reviews_nlp.index.name = None
reviews_nlp=reviews_nlp.sort_index()
reviews_nlp.head()

Unnamed: 0,user_id,item_id,recommend,review,item_name,neg,neu,pos
0,76561197970982479,1250.0,True,Simple yet with great replayability. In my opi...,Killing Floor,0.74,10.556,4.704
1,76561197970982479,22200.0,True,It's unique and worth a playthrough.,Zeno Clash,0.0,0.678,0.322
2,76561197970982479,43110.0,True,Great atmosphere. The gunplay can be a bit chu...,Metro 2033,0.0,1.878,2.122
3,js41637,251610.0,True,I know what you think when you see this title ...,Barbie™ Dreamhouse Party™,0.363,6.981,1.659
4,js41637,227300.0,True,For a simple (it's actually not all that simpl...,Euro Truck Simulator 2,1.278,28.314,6.408


In [17]:
def sent_a(neg,neu,pos):
    if max(neg,neu,pos)==neg:
        return 0
    elif max(neg,neu,pos)==neu:
        return 1
    else:
        return 2

In [18]:
reviews_nlp['sentiment_analysis'] = reviews_nlp[['neg', 'neu', 'pos']].apply(lambda row: sent_a(row['neg'], row['neu'], row['pos']), axis=1)
reviews_nlp.head()

Unnamed: 0,user_id,item_id,recommend,review,item_name,neg,neu,pos,sentiment_analysis
0,76561197970982479,1250.0,True,Simple yet with great replayability. In my opi...,Killing Floor,0.74,10.556,4.704,1
1,76561197970982479,22200.0,True,It's unique and worth a playthrough.,Zeno Clash,0.0,0.678,0.322,1
2,76561197970982479,43110.0,True,Great atmosphere. The gunplay can be a bit chu...,Metro 2033,0.0,1.878,2.122,2
3,js41637,251610.0,True,I know what you think when you see this title ...,Barbie™ Dreamhouse Party™,0.363,6.981,1.659,1
4,js41637,227300.0,True,For a simple (it's actually not all that simpl...,Euro Truck Simulator 2,1.278,28.314,6.408,1


## Palabras

### Normalizaccion y Tokenizacion

In [19]:
reviews_nlp['word_norm'] = reviews_nlp['review'].apply(lambda x: re.sub(r"[^a-zA-Z']", " ", str(x)).lower())
reviews_nlp['word_norm']

0        simple yet with great replayability  in my opi...
1                     it's unique and worth a playthrough 
2        great atmosphere  the gunplay can be a bit chu...
3        i know what you think when you see this title ...
4        for a simple  it's actually not all that simpl...
                               ...                        
58426    a must have classic from steam definitely wort...
58427    this game is a perfect remake of the original ...
58428    had so much fun plaing this and collecting res...
58429                                                    d
58430                                       so much fun  d
Name: word_norm, Length: 58197, dtype: object

In [20]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords = [palabra for palabra in stopwords if 'not' not in palabra]
stopwords.append("'s")
stopwords.append("n't")
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [21]:
reviews_nlp['word_tok'] = reviews_nlp['word_norm'].apply(lambda x: nltk.tokenize.word_tokenize(str(x), language='english'))
reviews_nlp['word_count'] = reviews_nlp['word_tok'].apply(len)
reviews_nlp['word_tok']=reviews_nlp['word_tok'].apply(lambda x: [word for word in x if word not in stopwords])
reviews_nlp['word_tok']

0        [simple, yet, great, replayability, opinion, z...
1                             [unique, worth, playthrough]
2        [great, atmosphere, gunplay, bit, chunky, time...
3        [know, think, see, title, barbie, dreamhouse, ...
4        [simple, actually, not, simple, truck, driving...
                               ...                        
58426    [must, classic, steam, definitely, worth, buying]
58427    [game, perfect, remake, original, half, life, ...
58428    [much, fun, plaing, collecting, resources, xd,...
58429                                                   []
58430                                          [much, fun]
Name: word_tok, Length: 58197, dtype: object

## Version Final 

In [22]:
reviews_nlp.drop(columns=['neg','neu','pos','word_norm','review'],inplace=True)

In [23]:
reviews_nlp.rename(columns={'word_tok':'words'},inplace=True)

In [24]:
reviews_nlp

Unnamed: 0,user_id,item_id,recommend,item_name,sentiment_analysis,words,word_count
0,76561197970982479,1250.0,True,Killing Floor,1,"[simple, yet, great, replayability, opinion, z...",44
1,76561197970982479,22200.0,True,Zeno Clash,1,"[unique, worth, playthrough]",7
2,76561197970982479,43110.0,True,Metro 2033,2,"[great, atmosphere, gunplay, bit, chunky, time...",40
3,js41637,251610.0,True,Barbie™ Dreamhouse Party™,1,"[know, think, see, title, barbie, dreamhouse, ...",112
4,js41637,227300.0,True,Euro Truck Simulator 2,1,"[simple, actually, not, simple, truck, driving...",117
...,...,...,...,...,...,...,...
58426,76561198312638244,70.0,True,Half-Life,1,"[must, classic, steam, definitely, worth, buying]",9
58427,76561198312638244,362890.0,True,Black Mesa,1,"[game, perfect, remake, original, half, life, ...",92
58428,LydiaMorley,273110.0,True,Counter-Strike Nexon: Zombies,1,"[much, fun, plaing, collecting, resources, xd,...",20
58429,LydiaMorley,730.0,True,Counter-Strike: Global Offensive,2,[],1


## Exportacion de Archivo

In [25]:
pq.write_table(pa.Table.from_pandas(reviews_nlp),'reviews_nlp.parquet')