In [1]:
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
ny_ab = pd.read_csv('AB_NYC_2019.csv')
ny_ab.head()
ny_ab.drop(['host_name','name','latitude','longitude','last_review','id','host_id'], axis=1, inplace=True)
ny_ab['reviews_per_month'] = ny_ab['reviews_per_month'].fillna(0)

In [3]:

ny_ab.head()

Unnamed: 0,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Kensington,Private room,149,1,9,0.21,6,365
1,Manhattan,Midtown,Entire home/apt,225,1,45,0.38,2,355
2,Manhattan,Harlem,Private room,150,3,0,0.0,1,365
3,Brooklyn,Clinton Hill,Entire home/apt,89,1,270,4.64,1,194
4,Manhattan,East Harlem,Entire home/apt,80,10,9,0.1,1,0


In [4]:
ny_ab.describe()
ny_ab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 9 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   neighbourhood_group             48895 non-null  object 
 1   neighbourhood                   48895 non-null  object 
 2   room_type                       48895 non-null  object 
 3   price                           48895 non-null  int64  
 4   minimum_nights                  48895 non-null  int64  
 5   number_of_reviews               48895 non-null  int64  
 6   reviews_per_month               48895 non-null  float64
 7   calculated_host_listings_count  48895 non-null  int64  
 8   availability_365                48895 non-null  int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 3.4+ MB


In [5]:
categorical_features = ny_ab.select_dtypes(include=['object'])
categorical_features_one_hot = pd.get_dummies(categorical_features)
categorical_features_one_hot.head()

Unnamed: 0,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island,neighbourhood_Allerton,neighbourhood_Arden Heights,neighbourhood_Arrochar,neighbourhood_Arverne,neighbourhood_Astoria,...,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
print(ny_ab.columns.str.contains('price')==True)
#取出未包含price的columns
print(ny_ab.columns[ny_ab.columns.str.contains('price')==False])

[False False False  True False False False False False]
Index(['neighbourhood_group', 'neighbourhood', 'room_type', 'minimum_nights',
       'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')


In [8]:
#通過將每個要素縮放到給定範圍來變換要素。
min_max_scaler = preprocessing.MinMaxScaler()
#print(ny_ab.columns.str.contains('price')==True)
#print(ny_ab.columns[ny_ab.columns.str.contains('price')==False])
#print(ny_ab[ny_ab.columns[ny_ab.columns.str.contains('price')==False]])
ny_ab.drop(['neighbourhood_group','neighbourhood','room_type'], axis=1, inplace=True)
#print(ny_ab)
finalpd=pd.concat([ny_ab,categorical_features_one_hot],axis=1)
#結合onehot encode的columns
print(finalpd)
x_scaled = min_max_scaler.fit_transform(ny_ab[ny_ab.columns[ny_ab.columns.str.contains('price')==False]])
print(x_scaled)
#assign回表格
ny_ab[ny_ab.columns[ny_ab.columns.str.contains('price')==False]] = x_scaled

       price  minimum_nights  number_of_reviews  reviews_per_month  \
0        149               1                  9               0.21   
1        225               1                 45               0.38   
2        150               3                  0               0.00   
3         89               1                270               4.64   
4         80              10                  9               0.10   
...      ...             ...                ...                ...   
48890     70               2                  0               0.00   
48891     40               4                  0               0.00   
48892    115              10                  0               0.00   
48893     55               1                  0               0.00   
48894     90               7                  0               0.00   

       calculated_host_listings_count  availability_365  \
0                                   6               365   
1                                   2    

In [74]:
print(finalpd)

        price  minimum_nights  number_of_reviews  reviews_per_month  \
0      0.0149               1                  9               0.21   
1      0.0225               1                 45               0.38   
2      0.0150               3                  0               0.00   
3      0.0089               1                270               4.64   
4      0.0080              10                  9               0.10   
...       ...             ...                ...                ...   
48890  0.0070               2                  0               0.00   
48891  0.0040               4                  0               0.00   
48892  0.0115              10                  0               0.00   
48893  0.0055               1                  0               0.00   
48894  0.0090               7                  0               0.00   

       calculated_host_listings_count  availability_365  \
0                                   6               365   
1                            

In [75]:
X_train, X_test, y_train, y_test = train_test_split(ny_ab[ny_ab.columns[ny_ab.columns.str.contains('price')==False]] , ny_ab['price'] , test_size=0.1, random_state=66)

In [76]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256,activation=tf.nn.relu),
    tf.keras.layers.Dense(128,activation=tf.nn.relu),
    tf.keras.layers.Dense(64,activation=tf.nn.relu),
    tf.keras.layers.Dense(32,activation=tf.nn.relu),
    tf.keras.layers.Dense(16,activation=tf.nn.relu),
    tf.keras.layers.Dense(1,activation=tf.nn.relu)
])
model.compile(optimizer='adam',
                loss='mean_squared_error',
                metrics=['mean_squared_error'])

history = model.fit(X_train.values ,y_train.values, epochs=10, validation_split = 0.1)

Train on 39604 samples, validate on 4401 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
