<a href="https://colab.research.google.com/github/carohdez/PhD-RS_EFM/blob/main/ABSA_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Script to get aspect and sentiment per sentence.
# Builds the user-frequency and item-quality matrices used to run EFM (restaurants and hotels).
# Aspects are detected with a BERT classifier. The text classification code is based on
# examples from Chris McCormick (ChrisMcCormickAI).

# Select the domain; it decides which aspect label set (label -> class index) is used below.
domain = 'hotels'

if domain == 'restaurant':
  aspect_names = ['ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service']
else:
  # Earlier hotel label sets had 12 and 11 classes (they included 'cleanliness', and the
  # oldest one also 'internet' and 'reservation'); the current one drops 'cleanliness'.
  aspect_names = ['facilities', 'staff', 'room', 'bathroom', 'location', 'price',
                  'ambience', 'food', 'comfort', 'checking']

# The class index of each label is its position in the list above.
aspects = {name: idx for idx, name in enumerate(aspect_names)}



In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [None]:
# Colab setup: install the Hugging Face `transformers` package into the runtime.
# NOTE(review): no version is pinned, so the installed release depends on when this cell is run.
!pip install transformers

In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
# NOTE(review): later cells visible in this notebook use the tokenizers stored next to the
# fine-tuned models (tokenizer_ae / tokenizer_sa); this base `tokenizer` does not appear to be
# referenced again - confirm whether it is still needed.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [None]:
# load models
# NOTE(review): AdamW and BertConfig are not used in the cells visible here, and AdamW has been
# deprecated in transformers - confirm this import still works with the installed release.
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Aspect extraction: the checkpoint directory depends on the domain; the tokenizer is read
# from the same directory as the model.
output_dir = (
  '/content/drive/My Drive/models/Restaurant/AspectExtraction'
  if domain=='restaurant'
  else '/content/drive/My Drive/models/ABSA_ArguAna/AspectExtraction'
)
model_ae = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer_ae = BertTokenizer.from_pretrained(output_dir)
model_ae.to(device)

# Sentiment analysis: the same checkpoint directory is used for both domains.
output_dir = '/content/drive/My Drive/models/ABSA_ArguAna/SentimentAnalysis'
model_sa = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer_sa = BertTokenizer.from_pretrained(output_dir)

# Being the last expression of the cell, this also displays the model architecture.
model_sa.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# Execute AC_detection.py from Drive as a standalone script, then print its source.
# NOTE(review): the same file is imported as a module further down and given its tokenizer and
# model from this notebook; what running it standalone is expected to do is not visible here.
!python /content/drive/My\ Drive/ABSA/code/AC_detection.py
!cat /content/drive/My\ Drive/ABSA/code/AC_detection.py

In [None]:
# Execute SA_detection.py from Drive as a standalone script, then print its source.
# NOTE(review): the same file is imported as a module further down and given its tokenizer and
# model from this notebook; what running it standalone is expected to do is not visible here.
!python /content/drive/My\ Drive/ABSA/code/SA_detection.py
!cat /content/drive/My\ Drive/ABSA/code/SA_detection.py

In [None]:
import sys

# Make the helper modules stored on Drive importable. The membership check keeps the cell
# idempotent: re-running it no longer appends the same directory to sys.path again.
ABSA_CODE_DIR = '/content/drive/My Drive/ABSA/code/'
if ABSA_CODE_DIR not in sys.path:
  sys.path.append(ABSA_CODE_DIR)

In [None]:
import AC_detection
import SA_detection


def _inject_runtime(module, tokenizer, model):
  """Set the tokenizer, model, torch and device attributes on a detection module.

  Presumably the module's functions look these names up as module globals; the module
  source is not visible here.
  """
  module.tokenizer=tokenizer
  module.model=model
  module.torch=torch
  module.device=device


# Aspect-category detection additionally receives the label -> index mapping.
_inject_runtime(AC_detection, tokenizer_ae, model_ae)
AC_detection.aspects=aspects

_inject_runtime(SA_detection, tokenizer_sa, model_sa)


Using TensorFlow backend.


In [None]:
# Smoke test: run aspect detection and sentiment detection on one sentence.
# Other sentences that were tried:
#   "she talked to me in a very rude manner"
#   "toilet was ok, the girl in reception was nice"
#   "it is very close to main attractions of the city, lots shops around"
#   "Nice Room, Good location..."
#   "'Nice shower, lots of tv channels, even movies are free of charge, wifi, nice shower gels"
#   "Rooms are typical dutch small"
# NOTE(review): the saved output of this cell ends in a ValueError; its cause is not visible here.
sentence = "We also expected to see a more state of the art museum at Check point Charlie."

categories = AC_detection.get_category(sentence)
print(categories)

sentiment = SA_detection.get_sentiment(sentence)
print(sentiment)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


ValueError: ignored

In [None]:
import nltk
# Fetch the 'punkt' data package; per the saved output nothing is re-downloaded when it is
# already up to date. Presumably required by sent_tokenize below.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Demo on one full review: split it into sentences, then print each sentence followed by its
# detected aspect categories and its sentiment.
text = "Clean, cool, Dutch design at an affordable design, with a great location. Pretty good balance of design and value located within easy walking distance to Rembrandtplein. Rooms are typical Dutch small, but very well designed with a cozy lobby area for gathering with friends in front of the mod fire pit and comfy sofa. If you like cool, clean design, you'll like it here.  "

review_sentences = sent_tokenize(text)
print(review_sentences)

for review_sentence in review_sentences:
  print(review_sentence)
  categories = AC_detection.get_category(review_sentence)
  print(categories)
  sentiment = SA_detection.get_sentiment(review_sentence)
  print(sentiment)

In [None]:
# load data
import pandas as pd

# Per domain: the review file, the ratings file, and the ratings column used as the row index.
if domain=='restaurant':
  # Alternative review file: '/content/drive/My Drive/ABSA/data/Yelp/dataset_20_in_Phoenix.csv'
  reviews_path='/content/drive/My Drive/ABSA/data/Yelp/dataset_restaurants_10_in_Phoenix.csv'
  ratings_path='/content/drive/My Drive/ABSA/data/Yelp/ratings_Yelp_index.csv'
  ratings_index_col='user_id'
else:
  reviews_path='/content/drive/My Drive/ABSA/data/ArguAna_Unannotated_authors5.csv'
  ratings_path='/content/drive/My Drive/ABSA/data/ratings_authors_5_index.csv'
  ratings_index_col='author'

# Reviews are tab-separated.
df=pd.read_csv(reviews_path, sep='\t', engine='python')

# The separator is kept as the two-character pattern backslash-comma, now written as a raw
# string: '\,' in a normal string literal is an invalid escape sequence that newer Python
# versions warn about. The value passed to pandas is unchanged.
# NOTE(review): a multi-character separator is treated by pandas as a regular expression, which
# does not honour quoted fields; switch to sep=',' only after confirming the file parses the same.
ratings=pd.read_csv(ratings_path, sep=r'\,', engine='python').set_index(ratings_index_col)

In [None]:
# Display the loaded reviews frame (the saved output elides the middle rows).
# NOTE(review): the rendered output includes author names and review text - check before sharing.
df

Unnamed: 0.1,Unnamed: 0,hotelID,score,review_text,author
0,72,1126079,5.0,Great aiport hotel! Decided to stay at this ai...,Arturas
1,106,1126079,5.0,"Super mod, this concept is a winner We loved t...",mattkorey
2,151,189387,4.0,"Nice Room, Good location...Tight front desk Th...",Escaramujo
3,266,189387,4.0,Quirky hotel with wonderful staff We loved bei...,theresama40
4,296,189387,5.0,If only I loved Amsterdam as much as the Pulit...,Tampa_Law_Guy
...,...,...,...,...,...
5342,196506,571526,5.0,Great little hotel near Piazza San Marco I sta...,minkKC
5343,196519,571526,4.0,Great Find Ca Dei Dogi makes you feel like you...,PlateMan
5344,196544,577645,5.0,Fantastic After finding this place on TripAdvi...,bax52
5345,196611,577645,5.0,Echoing the other reviews My family and I (2 a...,emiao


In [None]:
# Get the user feature attention matrix X and item quality matrix Y

import numpy as np

# Column labels of the aspect matrices, in column order.
if domain=='restaurant':
  aspect_columns=['ambience', 'anecdotes/miscellaneous', 'food', 'price', 'service']
else:
  # NOTE(review): these 12 hotel labels differ from the 10-label hotel `aspects` mapping defined
  # in the first cell ('ambiance' vs 'ambience', no 'checking', extra 'cleanliness', 'internet',
  # 'reservation'). Confirm which label set the aspect classifier emits; a predicted label that
  # is not a column here will fail when the accumulation loop looks it up.
  aspect_columns=['bathroom','cleanliness','comfort','food','location','facilities','price','room','staff','internet','reservation','ambiance']
n_aspects=len(aspect_columns)


def _zero_aspect_matrix(ids, columns):
  """Return a frame with one row per id and one zero-filled column per aspect label.

  The construction is the same as before (zeros stacked with the ids, ids then moved into
  the index), so the index is named len(columns) as it was. When the ids are strings numpy
  promotes the whole stacked array to strings, so the zeros are stored as text - presumably
  why the accumulation loop converts cells with float() before adding to them.
  """
  n=len(columns)
  stacked=np.column_stack([np.zeros(shape=(len(ids), n)), np.array(list(ids)).T])
  frame=pd.DataFrame(stacked, columns=list(range(n+1))).set_index(n, drop=True)
  return frame.rename(columns=dict(enumerate(columns)))


# One row per row of `ratings`: per-user aspect mention counts.
authors_freq=_zero_aspect_matrix(ratings.index, aspect_columns)

# One row per column of `ratings`: per-item positive and negative aspect counts.
business_pos=_zero_aspect_matrix(ratings.columns, aspect_columns)
business_neg=business_pos.copy()

# Names of the columns of the reviews frame that the accumulation loop reads.
if domain=='restaurant':
  author_id_col='user_id'
  business_id_col='business_id'
  review_id_col='text'
else:
  # Previously only `author_col` was set in this branch, while the accumulation loop reads
  # `author_id_col`, which raised a NameError for the hotels domain.
  author_id_col='author'
  author_col=author_id_col  # old name kept in case other code still refers to it
  business_id_col='hotelID'
  review_id_col='review_text'

In [None]:
# Walk every user's reviews sentence by sentence and, for each non-neutral sentence, add its
# detected aspects to the user frequency matrix and to the positive/negative item matrices.

# NOTE(review): users before this 1-based position are skipped. It looks like a leftover from
# resuming an interrupted run; set it to 1 to process every user.
START_AT_USER=3300
# Print progress and write the three matrices to disk every this many users.
CHECKPOINT_EVERY=100


def _save_aspect_matrices():
  """Write the three aspect matrices to their CSV files on Drive."""
  # NOTE(review): the file names are Yelp-specific although `domain` may be 'hotels'.
  authors_freq.to_csv('/content/drive/My Drive/ABSA/data/authors_Yelp.csv')
  business_pos.to_csv('/content/drive/My Drive/ABSA/data/yelp_pos.csv')
  business_neg.to_csv('/content/drive/My Drive/ABSA/data/yelp_neg.csv')


for count, user in enumerate(list(authors_freq.index), start=1):
  if count < START_AT_USER:
    continue
  if count % CHECKPOINT_EVERY == 0:
    print(count)
    _save_aspect_matrices()

  reviews_user=df[df[author_id_col]==user]
  for index, row in reviews_user.iterrows():
    review=row[review_id_col]
    businessID=str(row[business_id_col])
    for review_sentence in sent_tokenize(review):
      sentiment=SA_detection.get_sentiment(review_sentence)
      if sentiment == 'neutral':
        continue
      # Each detected aspect is updated from its own current value. Previously every detected
      # aspect was overwritten with the FIRST aspect's count +/- 1, which corrupted the counts
      # whenever a sentence had more than one aspect.
      for aspect in AC_detection.get_category(review_sentence):
        authors_freq.loc[user, aspect] = float(authors_freq.loc[user, aspect])+1
        if sentiment=='positive':
          business_pos.loc[businessID, aspect] = float(business_pos.loc[businessID, aspect])+1
        else:
          # Negative mentions are accumulated as negative numbers.
          business_neg.loc[businessID, aspect] = float(business_neg.loc[businessID, aspect])-1

_save_aspect_matrices()

In [None]:
import logging
# Configure the root logger with level ERROR.
# NOTE(review): presumably added to silence warning-level messages such as the tokenizer
# truncation warning seen in an earlier cell's output - confirm. basicConfig does nothing if
# the root logger already has handlers configured.
logging.basicConfig(level=logging.ERROR)

In [None]:
# Load the feature (sub-aspect) -> category (aspect) lookup table; the file is ';'-separated
# and its first line is the header.
fc = pd.read_csv('/content/drive/My Drive/ABSA/data/feature-category.csv', sep=';', header=0)

# Rows whose category is 'room'.
features = fc.loc[fc['category'] == 'room']


In [None]:
# Create file sentence level, aspect and subaspects ----------------------
# One output row per review sentence: its polarity, its comma-joined detected aspects and the
# comma-joined words of the sentence that appear in the feature table.
df = df.rename(columns={'Unnamed: 0': 'reviewID'})

# Set lookup instead of scanning the whole feature column for every word.
known_features = set(fc.feature.values)

# Rows are collected in a list and turned into a frame once at the end. The old per-sentence
# DataFrame.append was quadratic and no longer exists in pandas 2.x. As a side effect the row
# index written to the CSV is now 0..N-1 where it used to be 0 on every row.
sentence_rows = []
for n, (index, row) in enumerate(df.iterrows(), start=1):
  if n%100==0: print(n)
  for review_sentence in sent_tokenize(row.review_text):
    # Polarity is recorded for every sentence; neutral sentences are not filtered out here.
    polarity = SA_detection.get_sentiment(review_sentence)
    detected = AC_detection.get_category(review_sentence)
    # Local names are used on purpose: the old loop rebound the notebook-level `aspects`
    # mapping and the `features` frame to plain strings.
    matched = [word for word in review_sentence.lower().split() if word in known_features]
    sentence_rows.append({
      'hotelID': row.hotelID,
      'reviewID': row.reviewID,
      'author': row.author,
      'score': row.score,
      'sentence': review_sentence,
      'aspects': ','.join(detected),
      'features': ','.join(matched),
      'polarity': polarity,
    })

# Same column order as before ('polarity' used to be added last by the first append).
df_sent = pd.DataFrame(sentence_rows, columns=['hotelID','reviewID','author','score','sentence','aspects','features','polarity'])

df_sent.to_csv('/content/drive/My Drive/ABSA/data/ArguAna_sentence_level.csv', sep=';')


100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300


In [None]:
#df.columns
# Display the sentence-level frame.
df_sent


Unnamed: 0,hotelID,reviewID,sentence,aspects,features,polarity
0,1126079,72,Great aiport hotel!,"facilities,location",,
0,1126079,72,Decided to stay at this airport hotel because ...,"location,facilities",stay,
0,1126079,72,I could take taxi from Amsterdam but I thought...,price,"taxi,pay,stay",
0,1126079,72,I really enjoyed the stay at this hotel.,facilities,stay,
0,1126079,72,The room is small but enough space to put you ...,room,"room,small,space",
0,1126079,72,The bed is very comfy and it is hard to believ...,room,"bed,location",
0,1126079,72,"Nice shower, lots of tv channels, even movies ...",internet,"tv,shower,stay",
0,1126079,72,The price of the room is very tempting and if ...,price,"price,room,stay,in",
0,1126079,72,With the train is very easy to reach Amsterdam,"location,facilities",,
0,1126079,72,Great aiport hotel!,"facilities,location",,positive
