In [1]:
# Import the required libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the listings dataset from AB_NYC_2019.csv and preview the first rows
df = pd.read_csv('AB_NYC_2019.csv')
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [3]:
# Keep only the listing title ('name') and the target ('price').
# .copy() makes `data` an independent frame, so later edits never operate on
# a slice of `df` (the situation pandas reports as SettingWithCopyWarning).
data = df[['name', 'price']].copy()

In [4]:
# Inspect column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    48879 non-null  object
 1   price   48895 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 764.1+ KB


In [5]:
# Summary statistics of the numeric column (price), transposed so each statistic is a column
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,48895.0,152.720687,240.15417,0.0,69.0,106.0,175.0,10000.0


In [6]:
# Keep only rows that have a listing name and a non-zero price.
# One boolean mask replaces the two in-place drops: the cell can be re-run
# safely and never mutates a frame that may be a slice of `df`.
# The surviving rows and their index labels are the same as before.
data = data[data['name'].notna() & (data['price'] != 0)].copy()

In [7]:
# Re-check row counts and dtypes after the rows were removed
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48868 entries, 0 to 48894
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    48868 non-null  object
 1   price   48868 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


In [8]:
# Re-check the price statistics after the rows were removed
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,48868.0,152.756732,240.202905,10.0,69.0,106.0,175.0,10000.0


In [9]:
# Separate the text feature (listing name) from the target (price)
X = data['name']
y = data['price']

In [10]:
# Build the pipeline: token counts -> TF-IDF weighting -> logistic regression.
# NOTE(review): LogisticRegression is a classifier, so every distinct price
# value is treated as a separate class label — confirm this is intended
# rather than a regression model for the numeric target.
pipeline = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('lr', LogisticRegression()),
    ]
)

In [11]:
# Estimate the mean absolute error with 5-fold cross-validation.
# scoring='neg_mean_absolute_error' produces negated MAE values, hence abs().
# The duration is measured with time.perf_counter(), a clock meant for
# measuring elapsed time, instead of the wall clock returned by time.time().
start_time = time.perf_counter()
cvs = cross_val_score(pipeline, X, y, cv=5, n_jobs=-1, scoring='neg_mean_absolute_error')
print(abs(cvs.mean()))
print("-— %s seconds —-" % (time.perf_counter() - start_time))

73.0583576865399
-— 1270.8165681362152 seconds —-


In [12]:
# Hold out 30% of the rows as a test set.
# random_state pins the shuffle so that the split, and every result derived
# from it, is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
# Print the names of the 5 lowest-priced listings in the test split
cheapest_idx = y_test.sort_values().head(5).index
cheap = X_test.loc[cheapest_idx]
for listing_name in cheap:
    print(listing_name)

Girls only, cozy room one block from Times Square
Beautiful room in Bushwick
Spacious 2-bedroom Apt in Heart of Greenpoint
Very Spacious bedroom, steps from CENTRAL PARK.
Spacious and Modern 2 Bedroom Apartment


In [14]:
# Print the names of the 5 highest-priced listings in the test split
priciest_idx = y_test.sort_values().tail(5).index
expensive = X_test.loc[priciest_idx]
for listing_name in expensive:
    print(listing_name)

SuperBowl Penthouse Loft 3,000 sqft
Luxury townhouse Greenwich Village
Apartment New York 
Hell’s Kitchens
3000 sq ft daylight photo studio
Furnished room in Astoria apartment


In [15]:
# Fit the pipeline on the training split
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('lr', LogisticRegression())])

In [16]:
# Mean absolute error of the fitted pipeline's predictions on the test split
mean_absolute_error(y_test, pipeline.predict(X_test))

73.67580656162609

In [17]:
# Put listing names, true prices and predicted prices of the test split into one frame
result = pd.concat([X_test, y_test], axis=1).assign(
    predict=pipeline.predict(X_test)
)
result.head()

Unnamed: 0,name,price,predict
35914,Art & Expression Inn,43,125
30501,Private Room in the Heart of Financial District,110,100
14463,Cute bedroom in the East Village,75,100
12249,Penthouse Duplex & Roof Top Terrace,345,250
4996,Bedroom in Williamsburg w/ Terrace,90,80


In [19]:
# Store the absolute difference between true and predicted price in 'modulo error'
result['modulo error'] = (result['price'] - result['predict']).abs()

# Show 5 rows with the smallest absolute error
result.sort_values('modulo error').head(5)

Unnamed: 0,name,price,predict,modulo error
24448,"Sunny room in historic Bedstuy, Brooklyn",60,60,0
7796,*** Gorgeous 2BD apt with Terrace ***,200,200,0
39513,Verona TwentyOne - Kendal Garden,150,150,0
47918,Private Comfortable Room near Subway,50,50,0
8601,"Nice, cozy double size bedroom",50,50,0


In [20]:
# Show the 5 rows with the largest absolute error
result.sort_values('modulo error').tail(5)

Unnamed: 0,name,price,predict,modulo error
25825,Fulton 2,5000,60,4940
15560,Luxury townhouse Greenwich Village,6000,300,5700
37194,Apartment New York \nHell’s Kitchens,6500,200,6300
44034,3000 sq ft daylight photo studio,6800,200,6600
9151,Furnished room in Astoria apartment,10000,100,9900


In [22]:
# A few made-up listing names to probe the fitted pipeline with
names = np.array(['Super luxury hotel', 'Pearl', 'Cheap shelter'])

# Table of the made-up names next to the prices the pipeline predicts for them
pd.DataFrame({'name': names, 'price': pipeline.predict(names)})

Unnamed: 0,name,price
0,Super luxury hotel,199
1,Pearl,70
2,Cheap shelter,50
