In [None]:
!pip install pymorphy2
!pip install -U 'scikit-learn<0.24'
!pip install stanza
!pip install sklearn-crfsuite

In [None]:
import pandas as pd
import re
import pickle
import seaborn as sns
from nltk.tokenize import RegexpTokenizer
from pymorphy2 import MorphAnalyzer
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import scipy
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn import preprocessing
from collections import defaultdict
import stanza
# Download the Russian stanza models (network access required).
stanza.download('ru')
# pymorphy2 analyzer used for part-of-speech lookup of individual tokens.
morph = MorphAnalyzer()
# Raw string: '\w' inside a plain literal is an invalid escape sequence and
# triggers a DeprecationWarning/SyntaxWarning; the pattern itself is unchanged.
# NOTE(review): `token` is not used by any cell visible in this notebook and is
# shadowed by loop variables inside the functions below — confirm it is needed.
token = RegexpTokenizer(r'\w+')

# Задача 1

In [27]:
# Russian stanza pipeline restricted to the tokenizer processor; it is used
# below to obtain tokens together with their character offsets.
nlp = stanza.Pipeline('ru', processors='tokenize')

2021-12-28 11:08:21 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |

2021-12-28 11:08:21 INFO: Use device: cpu
2021-12-28 11:08:21 INFO: Loading: tokenize
2021-12-28 11:08:22 INFO: Done loading processors!


In [None]:
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/dev_aspects.txt
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/dev_reviews.txt
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/dev_cats.txt

In [16]:
# Load the Task 1 sequence-labelling model. The context manager closes the
# file handle, which the bare open() call leaked.
# NOTE: pickle.load can execute arbitrary code — only load trusted model files.
with open('task1.sav', 'rb') as f:
    crf = pickle.load(f)

In [14]:
def get_bio(reviews):
  """Turn raw reviews into token sequences labelled with BIO aspect tags.

  Args:
    reviews: mapping of text_id -> review text.

  Returns:
    A list with one entry per review; each entry is a list of token dicts
    with keys 'text', 'pos', 'tag', 'text_id', 'start' and 'end'.

  Relies on the module-level `nlp` (stanza tokenizer), `morph` (pymorphy2)
  and `aspects` objects.
  NOTE(review): `aspects` is not defined in any visible cell; it is assumed
  to map text_id -> iterable of mention dicts with 'start', 'end' and
  'category' keys — confirm before running on a fresh kernel.
  """
  bio_texts = []
  for text_id, text in tqdm(reviews.items()):
    bio_text = []
    for token in nlp(text).iter_tokens():
      postag = morph.parse(token.text)[0].tag.POS
      if postag is None:
        # pymorphy2 reported no part of speech for this token.
        postag = 'PUNKT'

      tag = 'O'
      for mention in aspects[text_id]:
        mention_start = int(mention['start'])
        mention_end = int(mention['end'])
        if token.end_char > mention_end:
          continue
        if token.start_char == mention_start:
          tag = 'B-' + mention['category']
        elif token.start_char > mention_start:
          tag = 'I-' + mention['category']
        else:
          continue
        # Stop at the first matching mention: each token must be emitted
        # exactly once, otherwise overlapping mentions duplicate the token
        # and the sequence no longer lines up with the text.
        break

      bio_text.append({'text': token.text, 'pos': postag, 'tag': tag,
                       'text_id': text_id, 'start': token.start_char,
                       'end': token.end_char})
    bio_texts.append(bio_text)

  return bio_texts

In [29]:
def word2features(sent, i):
    """Build the feature dict for the i-th token of a tokenized text.

    Args:
        sent: list of token dicts with keys 'text', 'pos', 'text_id',
            'start' and 'end'.
        i: position of the token inside `sent`.

    Returns:
        A dict of features for the token itself, plus features of the previous
        and next tokens, or the 'BOS' / 'EOS' flags when there is no such
        neighbour. 'text_id', 'start' and 'end' are carried along in the dict;
        get_marking reads them back from it.
    """
    current = sent[i]
    text = current['text']

    features = {
        'text_id': current['text_id'],
        'word': text,
        'word.lower()': text.lower(),
        'word[-3:]': text[-3:],
        'word[-2:]': text[-2:],
        'word.isupper()': text.isupper(),
        'word.istitle()': text.istitle(),
        'word.isdigit()': text.isdigit(),
        'postag': current['pos'],
        'start': current['start'],
        'end': current['end'],
    }

    # Left neighbour, or the beginning-of-sequence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        left = sent[i - 1]
        left_text = left['text']
        features['-1:word.lower()'] = left_text.lower()
        features['-1:word.istitle()'] = left_text.istitle()
        features['-1:word.isupper()'] = left_text.isupper()
        features['-1:postag'] = left['pos']

    # Right neighbour, or the end-of-sequence marker.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        right = sent[i + 1]
        right_text = right['text']
        right_pos = right['pos']
        features['+1:word.lower()'] = right_text.lower()
        features['+1:word.istitle()'] = right_text.istitle()
        features['+1:word.isupper()'] = right_text.isupper()
        features['+1:postag'] = right_pos
        features['+1:postag[:2]'] = right_pos[:2]

    return features

def sent2features(sent):
    """Return the list of feature dicts for every token position in `sent`."""
    features = []
    for position in range(len(sent)):
        features.append(word2features(sent, position))
    return features

def sent2labels(sent):
    """Return the 'tag' value of every token dict in `sent`, in order."""
    labels = []
    for token_info in sent:
        labels.append(token_info['tag'])
    return labels

def sent2tokens(sent):
    """Return the 'text' value of every token dict in `sent`, in order."""
    surface_forms = []
    for token_info in sent:
        surface_forms.append(token_info['text'])
    return surface_forms

In [30]:
def get_marking(y_pred, X_test):
  """Collapse predicted B-/I- tag sequences into one row per aspect mention.

  Args:
    y_pred: list of tag sequences, one per text (tags such as 'O',
      'B-Food', 'I-Food').
    X_test: list of feature-dict sequences aligned with `y_pred`; each dict
      must provide 'word', 'text_id', 'start' and 'end'.

  Returns:
    A DataFrame with columns text_id, tag, text, start, end. A mention starts
    at every 'B-…' tag and absorbs the 'I…' tags that immediately follow it;
    its tag is the category of the 'B-' tag, its text is the space-joined
    words, and start/end come from the first and last token. 'I' tags not
    preceded by a mention are ignored.
  """
  d = {'text_id': [], 'tag': [], 'text': [], 'start': [], 'end': []}
  for num, tags in enumerate(y_pred):
    feats = X_test[num]
    for j, tag in enumerate(tags):
      if not tag.startswith('B'):
        continue
      # Find the end of the mention. The bounds check matters: a mention may
      # reach the last token of the sequence, where looking at tags[n]
      # unguarded would raise IndexError.
      n = j + 1
      while n < len(tags) and tags[n].startswith('I'):
        n += 1
      d['text_id'].append(feats[j]['text_id'])
      d['tag'].append(tag.split('-')[1])
      d['text'].append(' '.join(feats[k]['word'] for k in range(j, n)))
      d['start'].append(feats[j]['start'])
      d['end'].append(feats[n - 1]['end'])

  return pd.DataFrame(data=d)

In [31]:
# Load the dev reviews as a headerless tab-separated table with columns
# text_id and text, then preview the first rows.
dev_texts = pd.read_csv('dev_reviews.txt', delimiter='\t', names=['text_id', 'text'])
dev_texts.head()

Unnamed: 0,text_id,text
0,13823,"Зашли в""аппетит"" случайно.Не смотря на то,что ..."
1,1427,Здравствуйте!Посетили ваше заведение вчера пер...
2,16714,"Были в пятницу (19.03.10), заказывали столик д..."
3,797,"Были в ресторане 2 раза. Один раз днем, все по..."
4,34710,Удивляюсь отзывам про хорошее обслуживание. Бы...


In [32]:
# Build text_id -> review text (both kept as strings) from the TSV file.
dev_reviews = {}
# Explicit UTF-8: the reviews are Cyrillic and the platform default encoding
# is not guaranteed to be UTF-8.
with open('dev_reviews.txt', encoding='utf-8') as f:
  for line in f:
    line = line.rstrip('\r\n')
    if not line:
      # Skip blank lines (e.g. a trailing newline at the end of the file).
      continue
    # Split on the first tab only, so a tab inside the review text does not
    # truncate it.
    review_id, review_text = line.split('\t', 1)
    dev_reviews[review_id] = review_text

In [35]:
def get_data(dev_reviews):
  """Tokenize and POS-tag every review.

  Args:
    dev_reviews: mapping of text_id -> review text.

  Returns:
    A list with one entry per review; each entry is a list of token dicts
    with keys 'text', 'pos', 'text_id', 'start' and 'end'. Tokens for which
    pymorphy2 reports no part of speech get the tag 'PUNKT'.

  Uses the module-level `nlp` (stanza tokenizer) and `morph` (pymorphy2).
  """
  all_texts = []
  for text_id, text in tqdm(dev_reviews.items()):
    doc = nlp(text)
    records = []
    for tok in doc.iter_tokens():
      pos = morph.parse(tok.text)[0].tag.POS
      records.append({
        'text': tok.text,
        'pos': 'PUNKT' if pos is None else pos,
        'text_id': text_id,
        'start': tok.start_char,
        'end': tok.end_char,
      })
    all_texts.append(records)
  return all_texts

In [36]:
# Tokenize and POS-tag every dev review (one token list per review).
all_texts = get_data(dev_reviews)

100%|██████████| 71/71 [00:20<00:00,  3.48it/s]


In [37]:
# Per-token feature dicts for each review, in the same order as all_texts.
test_data = [sent2features(s) for s in all_texts]

In [38]:
# Predict one tag sequence per review with the model unpickled from task1.sav.
dev_pred = crf.predict(test_data)

In [39]:
# Collapse the predicted tags into one row per aspect mention.
dev_res = get_marking(dev_pred, test_data)
# text_id values are strings here (they were read as dict keys from the TSV
# lines), so convert the column to a numeric dtype.
dev_res['text_id'] = pd.to_numeric(dev_res['text_id'])

In [40]:
# Display the extracted mentions (the rendered table below is truncated).
dev_res

Unnamed: 0,text_id,tag,text,start,end
0,13823,Whole,""" аппетит """,7,16
1,13823,Service,встретил,138,146
2,13823,Service,менеджер,147,155
3,13823,Service,девушка,179,186
4,13823,Service,проводила к столу,188,205
...,...,...,...,...,...
1147,11770,Food,стейк,831,836
1148,11770,Food,блюдо тартар с сырой рыбой и сырым яйцом,896,936
1149,11770,Service,Официанты,938,947
1150,11770,Service,обстановкая,976,987


In [41]:
# Write the Task 1 mentions as tab-separated lines: text_id, tag, text, start, end.
# Explicit UTF-8: the mention text is Cyrillic and the platform default
# encoding is not guaranteed to be UTF-8.
with open('dev_pred_aspects.txt', 'w', encoding='utf-8') as f:
  for text_id, tag, text, start, end in zip(dev_res['text_id'], dev_res['tag'], dev_res['text'], dev_res['start'], dev_res['end']):
      print(text_id, tag, text, start, end, sep="\t", file=f)

# Задача 2

In [53]:
# Load the Task 2 sentiment classifier. The context manager closes the file
# handle, which the bare open() call leaked.
# NOTE: pickle.load can execute arbitrary code — only load trusted model files.
with open('task2.sav', 'rb') as f:
    logreg = pickle.load(f)

In [55]:
# Load the fitted TF-IDF vectorizer used to featurize the context windows.
# The context manager closes the file handle, which the bare open() call leaked.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open('tfidf.pickle', 'rb') as f:
    tfidf = pickle.load(f)

In [56]:
# Load aspect mentions (headerless TSV) with the six columns named below.
# NOTE(review): despite the variable name, this reads dev_aspects.txt and not
# the dev_pred_aspects.txt file written by Task 1 — confirm which is intended.
dev_aspects_predicted = pd.read_csv('dev_aspects.txt',
                                    delimiter='\t',
                                    names=['text_id', 'category', 'mention', 'start', 'end', 'sentiment'])

In [57]:
# Reviews table (text_id, text) used to look up the text of each aspect's review.
dev_texts = pd.read_csv('dev_reviews.txt', delimiter='\t', names=['text_id', 'text'])

In [58]:
def depunct(text):
    """Remove every character that is not a Russian letter, a space or a hyphen.

    Args:
        text: input string.

    Returns:
        The string with all other characters (punctuation, digits, Latin
        letters, newlines, ...) deleted.
    """
    # Raw string: "\-" inside a plain literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning; the pattern is unchanged.
    return re.sub(r"[^а-яёА-ЯЁ \-]", "", text)

In [59]:
def _take_until_spaces(chars, window):
    """Copy characters from the front of `chars` until `window` spaces are counted.

    A space is counted when the scan advances onto it, so the character at
    position 0 is never counted and the space that ends the scan is not copied.
    """
    taken = ""
    if not chars:
        return taken
    last_pos = len(chars) - 1
    spaces_seen = 0
    pos = 0
    while spaces_seen < window:
        taken += chars[pos]
        if pos == last_pos:
            break
        pos += 1
        if chars[pos] == " ":
            spaces_seen += 1
    return taken


def text_window(text, start, end, window=7, padding=True):
    """Return the word windows to the left and right of a text span.

    Args:
        text: full review text.
        start: character offset where the span begins.
        end: character offset of the span end; the right context is taken
            from text[end+1:].
        window: number of spaces to scan on each side, which bounds the
            number of words collected.
        padding: when true, pad each side with "PAD" up to `window` tokens
            (left side padded at the front, right side at the back).

    Returns:
        (start_tokens, end_tokens): words before and after the span, both in
        reading order, taken from the text after depunct() cleaning.

    NOTE(review): text[end+1:] skips the character at index `end`; if `end` is
    an exclusive offset this drops one character — confirm against the
    preprocessing used when the downstream models were trained.
    """
    after_text = depunct(text[end+1:])
    before_text = depunct(text[:start])

    # The right context is scanned forwards. The left context is scanned
    # backwards from the span by reversing it, then restored to reading order.
    end_tokens = _take_until_spaces(after_text, window).split()
    start_tokens = _take_until_spaces(before_text[::-1], window)[::-1].split()

    if padding:
        end_tokens = end_tokens + ["PAD"] * (window - len(end_tokens))
        start_tokens = ["PAD"] * (window - len(start_tokens)) + start_tokens
    return start_tokens, end_tokens

In [60]:
# Build the review-text lookup once instead of filtering the whole reviews
# frame for every aspect row. drop_duplicates keeps the first text for an id,
# which is the row the per-aspect filter used to pick.
text_by_id = dev_texts.drop_duplicates('text_id').set_index('text_id')['text'].to_dict()

# For every aspect mention, collect the 7-word context windows around it.
pred_aspect_windows = []
for asp in tqdm(dev_aspects_predicted.values):
    text_id = asp[0]
    start = asp[3]
    end = asp[4]
    pred_aspect_windows.append(text_window(text_by_id[text_id], start, end, 7))
# Each sample is the left window followed by the right window, space-joined,
# vectorized with the already fitted TF-IDF model.
tfidf_test = tfidf.transform([" ".join(windows[0]+windows[1]) for windows in pred_aspect_windows])

100%|██████████| 1190/1190 [00:01<00:00, 1169.58it/s]


In [61]:
# Predict a sentiment label for every aspect window; the TF-IDF matrix is
# converted to a dense array before being passed to the classifier.
dev_aspects_predicted["sentiment"] = logreg.predict(tfidf_test.toarray())

In [62]:
# Save the aspects with predicted sentiment as a headerless TSV.
# NOTE(review): this is the same file name the Task 1 cell writes to, so that
# output is overwritten here, while Task 3 reads 'dev_pred_aspects_ours.txt'
# — confirm the intended file names.
dev_aspects_predicted.to_csv("dev_pred_aspects.txt", sep="\t", index=False, header=False)

# Задача 3

In [65]:
# Load the fitted label encoder used to decode Task 3 predictions. The context
# manager closes the file handle, which the bare open() call leaked.
# NOTE: pickle.load can execute arbitrary code — only load trusted files.
with open('encoder.pickle', 'rb') as f:
    le = pickle.load(f)

In [74]:
# Per-category sentiment count columns (added by count_sentiments) that are
# passed to the Task 3 classifier as features.
features_columns = ['positive', 'negative', 'both', 'neutral']

In [68]:
# Load the Task 3 classifier. The context manager closes the file handle,
# which the bare open() call leaked.
# NOTE: pickle.load can execute arbitrary code — only load trusted model files.
with open('task3.sav', 'rb') as f:
    KNN = pickle.load(f)

In [70]:
def count_sentiments(df_asp, df_sent):
    """Attach per-category aspect sentiment counts to the category rows.

    Args:
        df_asp: aspect mentions with columns 'text_id', 'category' and
            'sentiment'.
        df_sent: category rows with columns 'text_id' and 'category'.

    Returns:
        The rows of `df_sent`, grouped text by text in the order text ids first
        appear in `df_asp`, each group re-indexed from 0. The previous index is
        kept in an 'index' column, and four integer columns 'positive',
        'negative', 'both' and 'neutral' hold how many aspects of that text
        and category carry each sentiment. Aspects with any other sentiment
        value are not counted.

    Only text ids present in `df_asp` are emitted; reviews without any aspect
    row produce no output rows.

    Raises:
        ValueError: from pd.concat when `df_asp` contains no rows.
    """
    sentiment_names = ('positive', 'negative', 'both', 'neutral')
    frames = []
    for text_id in df_asp['text_id'].unique():
        asp_slice = df_asp.loc[df_asp['text_id'] == text_id]

        # (category, sentiment) -> number of aspect mentions in this text.
        counts = {}
        for key in zip(asp_slice['category'], asp_slice['sentiment']):
            counts[key] = counts.get(key, 0) + 1

        sent_slice = df_sent.loc[df_sent['text_id'] == text_id].reset_index()
        # Fill every category row that exists for this text rather than
        # assuming exactly five, so texts with fewer rows no longer raise and
        # extra rows are not silently left at zero.
        for name in sentiment_names:
            sent_slice[name] = [counts.get((category, name), 0)
                                for category in sent_slice['category']]
        frames.append(sent_slice)
    return pd.concat(frames)

In [71]:
# Aspects with sentiment labels (headerless TSV, six columns).
# NOTE(review): no visible cell writes 'dev_pred_aspects_ours.txt' (Task 2
# saves to 'dev_pred_aspects.txt'), so on a fresh run this file must already
# exist — confirm the intended input file.
mark_asp = pd.read_csv(
    'dev_pred_aspects_ours.txt', 
    delimiter='\t', 
    names=['text_id', 'category', 'mention', 'start', 'end', 'sentiment']
)
# Category rows per review (text_id, category, sentiment).
mark_cats = pd.read_csv('dev_cats.txt', delimiter='\t', names=['text_id', 'category', 'sentiment'])
# Category rows extended with per-category sentiment counts. The name `makring`
# (sic) is referenced by later cells.
makring = count_sentiments(mark_asp, mark_cats)

In [75]:
# Predict category sentiment from the four count features; the predictions are
# encoded labels that get_markings decodes with the label encoder `le`.
y_pred = KNN.predict(makring[features_columns])

In [76]:
def get_markings(pred_df, y_pred):
    """Build the final category-sentiment table from encoded predictions.

    Args:
        pred_df: frame produced by count_sentiments; must contain the helper
            columns 'positive', 'negative', 'both', 'neutral' and 'index'.
        y_pred: encoded labels, one per row of `pred_df`.

    Returns:
        A new DataFrame without the helper columns, whose 'sentiment' column
        holds the labels decoded with the module-level encoder `le`.

    The input frame is left untouched, so the calling cell can be re-run; the
    earlier in-place version dropped the helper columns from the caller's frame
    and failed on a second run.
    """
    labels = le.inverse_transform(y_pred)
    helper_columns = ['positive', 'negative', 'both', 'neutral', 'index']
    return pred_df.drop(columns=helper_columns).assign(sentiment=labels)

In [77]:
# Decode the predicted labels and build the final per-category table.
output = get_markings(makring, y_pred)

In [78]:
# Display the final table (the rendered output below is truncated).
output

Unnamed: 0,text_id,category,sentiment
0,3976,Food,positive
1,3976,Interior,positive
2,3976,Price,absence
3,3976,Whole,positive
4,3976,Service,positive
...,...,...,...
0,33043,Food,positive
1,33043,Interior,positive
2,33043,Price,absence
3,33043,Whole,positive


In [79]:
# Save the Task 3 predictions as a headerless, index-free TSV.
output.to_csv('dev_cats_output.txt', sep='\t', header=False, index=False)