# Загрузка Pandas и очистка данных

In [374]:
import pandas as pd
import numpy as np
import re
import math

df = pd.read_csv('main_task.csv')

## Restaurant_id

In [375]:
# Selecting number from ID
df['Restaurant_id'] = df['Restaurant_id'].apply(lambda x: re.findall(r'\d+', str(x))[0])

In [376]:
# Changing the type of data in the column
df['Restaurant_id'] = df['Restaurant_id'].astype(int)

## City

In [377]:
# Dummy- Creation
city = pd.get_dummies(df['City'], drop_first=False)

In [378]:
# Concatination in a DataFrame
df = pd.concat([df, city], axis=1)

## Cuisine Style

In [379]:
# Empty values replace with "Uknown":
df['Cuisine Style'] = df['Cuisine Style'].fillna("Uknown")

In [380]:
# Using the Formatting Function for Column Values:
def list_cuisine(cell):
    """The function formates values in the column's cells"""
    cell = str(cell)
    cell = cell.replace("[", "")
    cell = cell.replace("]", "")
    cell = cell.replace("'", "")
    cell = cell.split(', ')
    return cell


df['Cuisine Style'] = df['Cuisine Style'].apply(list_cuisine)

In [381]:
# TASK -- Какое среднее количество кухонь предлагается в одном ресторане?
round(df['Cuisine Style'].apply(lambda x: len(x)).mean(), 1)

2.6

In [382]:
# TASK -- Сколько типов кухонь представлено в наборе данных (без "Uknown")?
df = df.explode('Cuisine Style')
len(df['Cuisine Style'].value_counts())-1

125

In [383]:
# TASK -- Какая кухня представлена в наибольшем количестве ресторанов?
df = df.explode('Cuisine Style')
df['Cuisine Style'].value_counts().index[0]

'Vegetarian Friendly'

In [384]:
# Dummy- Creation:
cuisine = pd.get_dummies(df['Cuisine Style'], drop_first=False)

In [385]:
# Concatination in a DataFrame
df = pd.concat([df, cuisine], axis=1)

In [386]:
# Feature Engeneering - number of cuisines in a restaurant
df['Cuisine Style'] = df['Cuisine Style'].apply(lambda x: len(x))

## Ranking and Rating

In [387]:
# Feature Engeneering - "Ranking" on "Rating" division
df['Rating/Ranking'] = df['Rating'] / df['Ranking']

## Price Range

In [388]:
# Замена значений Price Range на числовые:
dict_price_range = {'$': 1, '$$ - $$$': 2, '$$$$': 3}
df['Price Range'] = df['Price Range'].replace(to_replace=dict_price_range)

In [389]:
# Замена пустых значений Price Range на наиболее популярные значения:     
df['Price Range'] = df['Price Range'].\
fillna(df['Price Range'].value_counts().index[0])

In [390]:
# Dummy- Creation
price = pd.get_dummies(df['Price Range'], drop_first=False)

In [391]:
# Concatination in a DataFrame
df = pd.concat([df, price], axis=1)

## Number of Reviews

In [392]:
# Filling in the gaps with average number of reviews
df['Number of Reviews'] = df['Number of Reviews'].fillna(round(df['Number of Reviews'].mean()))

In [393]:
# Changing the type of data in the column
df['Number of Reviews'] = df['Number of Reviews'].astype(int)

## Reviews

In [394]:
# Remove empty cells
df = df[df['Reviews'] != '[[], []]']

In [395]:
# Feature Engeneering - length of the review
df['Review_len'] = df['Reviews'].apply(lambda x: len(x))

In [396]:
# Find only data's numbers
df['Reviews'] = df['Reviews'].apply(lambda x: re.findall(r'\d+/\d+/\d+', str(x)))

In [397]:
# TASK -- The latest review
# df = df.explode('Reviews')
# df['Reviews'] = pd.to_datetime(df['Reviews'])
# df['Reviews'].max()

In [398]:
# New columns for 1st and 2nd review
df['1-Review'] = df['Reviews'].apply(lambda x: x[0])
df['2-Review'] = df['Reviews'].apply(lambda x: x[-1])

In [399]:
# Conversion to date
df['1-Review'] = pd.to_datetime(df['1-Review'])
df['2-Review'] = pd.to_datetime(df['2-Review'])

In [400]:
# Number of dates between reviews
df['Delta-Review'] = (abs(df['2-Review'] - df['1-Review'])).dt.days

In [401]:
# TASK -- Max number of dates between reviews
df['Delta-Review'].sort_values(ascending=False).iloc[0]

3207

## URL_TA

In [402]:
# String editing (looking for the restaurant's name)
df['URL_TA'] = df['URL_TA'].apply(lambda x: re.findall(r'-\D+-', str(x))[0])
df['URL_TA'] = df['URL_TA'].apply(lambda x: x.replace('-Reviews-', '')[:-1])

In [403]:
# Feature Engeneering - length of restaurant's name
df['URL_TA'] = df['URL_TA'].apply(lambda x: len(x))

## Rest Unused Columns

In [404]:
# Deletion
df = df.drop(['City', 'Price Range', 'Reviews',\
              'ID_TA', '1-Review', '2-Review'], axis=1)

## Normalization

In [405]:
df['Cuisine Style'] = np.log(df['Cuisine Style'])
df['Ranking'] = np.log(df['Ranking'])
df['Rating'] = np.log(df['Rating'])
df['Number of Reviews'] = np.log(df['Number of Reviews'])
df['Rating/Ranking'] = np.log(df['Rating/Ranking'])
df['Review_len'] = np.log(df['Review_len'])

# Разбиваем датафрейм на части, необходимые для обучения и тестирования модели

In [406]:
# Х - данные с информацией о ресторанах, у - целевая переменная (рейтинги ресторанов)
X = df.drop(['Restaurant_id', 'Rating'], axis = 1)
y = df['Rating']

In [407]:
# Загружаем специальный инструмент для разбивки:
from sklearn.model_selection import train_test_split

In [408]:
# Наборы данных с меткой "train" будут использоваться для обучения модели, "test" - для тестирования.
# Для тестирования мы будем использовать 25% от исходного датасета.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Создаём, обучаем и тестируем модель

In [409]:
# Импортируем необходимые библиотеки:
from sklearn.ensemble import RandomForestRegressor # инструмент для создания и обучения модели
from sklearn import metrics # инструменты для оценки точности модели

In [410]:
# Создаём модель
regr = RandomForestRegressor(n_estimators=100)

# Обучаем модель на тестовом наборе данных
regr.fit(X_train, y_train)

# Используем обученную модель для предсказания рейтинга ресторанов в тестовой выборке.
# Предсказанные значения записываем в переменную y_pred
y_pred = regr.predict(X_test)

In [411]:
# Сравниваем предсказанные значения (y_pred) с реальными (y_test), и смотрим насколько они в среднем отличаются
# Метрика называется Mean Absolute Error (MAE) и показывает среднее отклонение предсказанных значений от фактических.
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))

MAE: 0.0023586698003439134
