In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('train.csv', encoding='ISO-8859-1')
test = pd.read_csv('test.csv', encoding='ISO-8859-1')
data.head()

Unnamed: 0,Id,Hotel_name,Review_Title,Review_Text,Rating
0,0,Park Hyatt,Refuge in Chennai,Excellent room and exercise facility. All arou...,80.0
1,1,Hilton Chennai,Hilton Chennai,Very comfortable and felt safe. \r\nStaff were...,100.0
2,2,The Royal Regency,No worth the rating shown in websites. Pricing...,Not worth the rating shown. Service is not goo...,71.0
3,3,Rivera,Good stay,"First of all nice & courteous staff, only one ...",86.0
4,4,Park Hyatt,Needs improvement,Overall ambience of the hotel is very good. In...,86.0


In [3]:
len(data)

2351

In [4]:
data.isna().sum()

Id                0
Hotel_name        0
Review_Title    215
Review_Text       0
Rating            0
dtype: int64

In [5]:
test.isna().sum()

Id                0
Hotel_name        0
Review_Title    209
Review_Text       0
dtype: int64

Заполним пропуски и объединим обучающую и тестовую выборки

In [6]:
data.dropna(inplace = True)
test.dropna(inplace = True)

In [7]:
all_data = pd.concat((data, test))

In [8]:
#Распределение отзывов по отелям
all_data.Hotel_name.value_counts()

The Park Chennai                                      116
Accord Metropolitan                                   108
Savera                                                104
Radisson Blu GRT Chennai                               96
Lemon Tree Chennai                                     92
                                                     ... 
Nakshatra Serviced Apartment - Teynampet                1
Sai Enclave Residency                                   1
Pebbles                                                 1
Grand Treat                                             1
Holiday Inn Express Chennai Old Mahabalipuram Road      1
Name: Hotel_name, Length: 253, dtype: int64

In [9]:
#Смотрю на формат отзыва
all_data.Review_Text[0]

0    Excellent room and exercise facility. All arou...
0    On the night of my arrival from NY I had a min...
Name: Review_Text, dtype: object

### Начинаю создавать словарь
1) Чищу отзывы от стоп-слов и ненужных символов

2) применяю Lemmatizer

3) создаю конечный словарь и каждый отзыв превращаю в вектор размера (1, длинна словаря), с помощью one-hot кодирования

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\proho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
from nltk.corpus import stopwords
sw_eng = set(stopwords.words('english'))

In [12]:
all_data.Review_Text = list(map(lambda x: ' '.join([word for word in x.split() if not word in sw_eng]), all_data.Review_Text))

In [13]:
import re

expr =  r'[^\w ]'#r'[(,.!)]'
all_data.Review_Text = list(map(lambda x: (re.sub(expr, '', x)).split(), all_data.Review_Text))

In [14]:
# вот что получилось
all_data.Review_Text

0       [Excellent, room, exercise, facility, All, aro...
1       [Very, comfortable, felt, safe, Staff, helpful...
2       [Not, worth, rating, shown, Service, good, Roo...
3       [First, nice, courteous, staff, one, con, stay...
4       [Overall, ambience, hotel, good, In, room, fac...
                              ...                        
2345    [I, definitely, stay, hotel, next, visit, Chen...
2347    [Compared, lemon, tree, stay, bit, disappointi...
2348    [unpleasant, stay, easy, task, reach, Morning,...
2350    [I, sure, someone, wants, spend, kind, money, ...
2351    [I, visited, Chennai, summer, 2014, This, hote...
Name: Review_Text, Length: 4279, dtype: object

In [15]:
# Заполняю словарь
from tqdm import tqdm
dictionary = []

for d in tqdm(all_data.Review_Text):
    dictionary = np.concatenate((dictionary, np.unique(d)))

100%|██████████████████████████████████████████████████████████████████████████████| 4279/4279 [02:06<00:00, 33.85it/s]


In [16]:
len(dictionary)

93569

In [17]:
# Удаляю повторения
dictionary_un = np.unique(dictionary)

In [18]:
len(dictionary_un)

7747

In [19]:
dictionary_un[3000:3030]

array(['charge', 'chargeable', 'charged', 'charger', 'chargers',
       'charges', 'charging', 'charm', 'charming', 'chase', 'chatram',
       'chauffeur', 'cheap', 'cheaper', 'cheapest', 'cheated', 'cheating',
       'check', 'checked', 'checkedin', 'checkers', 'checkin',
       'checkincheckout', 'checking', 'checkinout', 'checkins',
       'checkout', 'chef', 'chefs', 'chemicals'], dtype='<U259')

In [20]:
from nltk import wordnet, pos_tag
def get_wordnet_pos(treebank_tag):
    my_switch = {
        'J': wordnet.wordnet.ADJ,
        'V': wordnet.wordnet.VERB,
        'N': wordnet.wordnet.NOUN,
        'R': wordnet.wordnet.ADV,
    }
    for key, item in my_switch.items():
        if treebank_tag.startswith(key):
            return item
    return wordnet.wordnet.NOUN


In [21]:
from nltk import WordNetLemmatizer
def my_lemmatizer(sent):
    lemmatizer = WordNetLemmatizer()
    tokenized_sent = sent.split()
    pos_tagged = [(word, get_wordnet_pos(tag))
                 for word, tag in pos_tag(tokenized_sent)]
    return ' '.join([lemmatizer.lemmatize(word, tag)
                    for word, tag in pos_tagged])

In [22]:
dictionary_lem = np.unique(my_lemmatizer(' '.join(dictionary_un)).split())

Делаю one-hot кодирование

In [23]:
all_data[dictionary_lem] = 0.0

In [24]:
all_data.drop('Id',axis = 1, inplace = True)

In [25]:
all_data.reset_index(inplace = True) 

In [26]:
from tqdm import tqdm
for j in tqdm(range(len(all_data))):
    a = all_data.loc[j, 'Review_Text']
    a = [word for word in a if not word in sw_eng]
    a = list(map(lambda x: re.sub(expr, '', x), a))
    sentence = []
    for i in a:
        if i != '':
            sentence.append(my_lemmatizer(i))
    all_data.loc[j, sentence] = 1

100%|██████████████████████████████████████████████████████████████████████████████| 4279/4279 [03:26<00:00, 20.72it/s]


In [27]:
all_data.fillna(0,inplace = True)

In [28]:
from sklearn.preprocessing import LabelEncoder

#кодирую название отелей
le = LabelEncoder()
all_data.Hotel_name = le.fit_transform(all_data.Hotel_name)

In [29]:
train = all_data[all_data.Rating != 0]

In [30]:
X = train.drop(['Review_Title', 'Review_Text', 'Rating', 'index'], axis = 1)
y = train['Rating']

In [31]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42)
model = CatBoostRegressor(verbose = False)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print('MAE:', mean_absolute_error(y_test, pred))

MAE: 10.706594527101881


MAE составил 10.70

In [32]:
test = all_data[all_data.Rating == 0]
X_t = test.drop(['Review_Title', 'Review_Text', 'Rating', 'index'], axis = 1)

Отзыв и предсказанный рейтинг 

In [41]:
data.loc[2136, 'Review_Text']

'Except bathroom size which is standard for IBIS Hotel around the world. All facility is good, had a breakfast in this hotel, it was okay. This hotel is cost effective and economically feasible under 3K INR. Very good option, mostly used by people or relative working in SIPCOT IT Park as there is no sightseeing nearby. It is good for corporate people.'

In [42]:
model.predict(X_t)[0]

84.45647000174749