https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data?select=AB_NYC_2019.csv

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image

In [44]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [2]:
Image('../input/new-york-city-airbnb-open-data/New_York_City_.png', width=500, height=400)

In [3]:
data = pd.read_csv('../input/new-york-city-airbnb-open-data/AB_NYC_2019.csv')
data

In [4]:
data.info()

In [5]:
plt.figure(figsize = (12,10))
sns.heatmap(data.isnull())

In [6]:
data.shape

In [7]:
# Report only the columns of the raw frame that contain missing values.
null_counts = data.isnull().sum()
for col, n_missing in null_counts[null_counts != 0].items():
    print(f'{col}: {n_missing}')

In [8]:
def preprocessing_1(df):
    """Return a new frame without the identifier columns.

    Drops 'id', 'name', 'host_id' and 'host_name'. The frame passed in is
    left untouched; pandas raises KeyError if any of the columns is absent.
    """
    identifier_cols = ['id', 'name', 'host_id', 'host_name']
    return df.drop(columns=identifier_cols)

In [9]:
df1 = preprocessing_1(data)
df1

In [10]:
# Missing-value counts that remain once the identifier columns are gone.
remaining_nulls = df1.isnull().sum()
for col, n_missing in remaining_nulls[remaining_nulls != 0].items():
    print(f'{col}: {n_missing}')

### Check whether `last_review` and `reviews_per_month` have NaNs in the same rows --> if so, we can simply drop those rows

In [11]:
last_review_nans = df1[df1['last_review'].isnull()]
last_review_nans

In [12]:
reviews_per_month_nans = df1[df1['reviews_per_month'].isnull()]
reviews_per_month_nans

In [13]:
nans_df = pd.concat([last_review_nans,reviews_per_month_nans], axis=0)
nans_df.shape

In [14]:
nans_df[nans_df.duplicated()]

* The values are missing in the same rows --> dropping the NaN rows removes the missing values of both columns at once

In [15]:
df1 = df1.dropna()

In [16]:
data.shape, df1.shape

In [17]:
df1 = df1.reset_index(drop= True)
df1.head(3)

In [18]:
plt.figure(figsize = (12,10))
sns.heatmap(df1.isnull())

In [19]:
# Sanity check after dropna(): prints nothing when no column has missing values.
leftover_nulls = df1.isnull().sum()
for col, n_missing in leftover_nulls[leftover_nulls != 0].items():
    print(f'{col}: {n_missing}')

* NaNs are handled — no missing values remain

# Data Visualization

In [20]:
df1['neighbourhood_group'].value_counts()

In [21]:
# Names of the 15 most frequent neighbourhoods (value_counts() sorts descending).
# The slice was [:10], which contradicted the variable name.
neigb_top_15_labels = df1['neighbourhood'].value_counts()[:15].keys().tolist()

In [22]:
vals = df1['neighbourhood'].value_counts().keys().tolist()
neigb = df1['neighbourhood'].value_counts().values
vals[:5], neigb[:5]

In [23]:
neigb_top_15_labels = df1['neighbourhood'].value_counts()[:15].keys().tolist()
neigb_top_15_labels

In [24]:
# Share of listings per borough, and among the 15 most common neighbourhoods.
# Labels come from the same value_counts() result as the wedge sizes, so a
# label can never be attached to the wrong slice.
group_counts = df1['neighbourhood_group'].value_counts()
# value_counts() is already sorted descending; head(15) keeps the 15 largest.
# (sorted(...)[:15] sorted ascending and therefore plotted the 15 smallest.)
top_15_counts = df1['neighbourhood'].value_counts().head(15)

plt.figure(figsize = (18,12))
plt.subplot(1,3,1)
plt.title('Neighbourgood_group')
plt.pie(group_counts, labels = group_counts.index, autopct='%.2f')

plt.subplot(1,3,3)
plt.title('Neighbourgood_top_15')

# Percentages here are relative to the top-15 total, not to all listings.
plt.pie(top_15_counts, labels = top_15_counts.index, autopct='%.2f');

In [25]:
df1.head(2)

In [26]:
pd.pivot_table(df1, index = ['neighbourhood_group','neighbourhood'], values = 'price')

In [27]:
plt.figure(figsize=(12,10))
sns.lineplot(data = df1, x = 'neighbourhood',y='price', hue='neighbourhood_group' ,)

# change object > numeric

In [28]:
obj_cols = df1.select_dtypes('object').columns

# Plain loops rather than list comprehensions: the comprehensions were used
# only for their print side effect and left a list of None as the cell output.
for col in obj_cols:
    print(f'{col}: {df1[col].isnull().sum()}')

print('\n\n object column unique len \n')

for col in obj_cols:
    print(f'{col}: {len(df1[col].unique())}')

In [None]:
# NOTE(review): dead code. This is an earlier draft of the label encoder that
# handled only 'neighbourhood_group'; the parameterised
# encode_funct(df, col, classes_) defined later in the notebook replaces it,
# so this cell can be deleted.
# def encode_funct(df):
#     df = df.copy()
    
#     neigbourhood_value_encode = df['neighbourhood_group'].value_counts().keys().tolist()
#     lenth = len(neigbourhood_value_encode)
#     range_lenth = np.arange(lenth)
    
#     for i, data in enumerate(df['neighbourhood_group']):
#         for j, n in enumerate(neigbourhood_value_encode):
#             if data == n:
#                 df.loc[i, 'neighbourhood_group'] = range_lenth[j]
                
    
#     return df

In [29]:
list(set(df1['neighbourhood_group']))

## Making  Label Encode function

In [30]:
def encode_funct(df, col, classes_ = False):
    """Label-encode one categorical column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column. It is not modified.
    col : str
        Name of the column to encode.
    classes_ : bool, default False
        When true, also return the label -> integer code mapping.

    Returns
    -------
    pandas.Series, or (pandas.Series, dict) when ``classes_`` is true
        The encoded column, carrying the same index and name as ``df[col]``.
        Missing values are left as NaN.

    Notes
    -----
    Codes are assigned in sorted label order, e.g. for 'neighbourhood_group'
    ['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island'] -> 0..4.
    """
    # Read the categories from the frame that was passed in (not from a
    # notebook-level global), and sort them so that the label -> code
    # assignment is identical on every run; iterating a set of strings is not.
    labels = sorted(df[col].dropna().unique())

    # Remember which label was converted into which number.
    classes = {label: code for code, label in enumerate(labels)}

    # Vectorised lookup: works for any index (no positional .loc writes) and
    # avoids the rows x classes Python loop.
    encoded = df[col].map(classes)

    if classes_:
        return encoded, classes

    return encoded

# Encode categorical features

In [31]:
neighbourhood_group_encoded, neighbourhood_group_classes_ = encode_funct(df1, 'neighbourhood_group',  classes_ = True)

In [32]:
neighbourhood_encoded, neighbourhood_classes_ = encode_funct(df1, 'neighbourhood',  classes_ = True)

In [33]:
room_type_encoded, room_type_classes_ = encode_funct(df1, 'room_type', classes_  = True)

#### encoded features are object type --> change to int

In [34]:
neighbourhood_group_encoded = neighbourhood_group_encoded.astype(dtype= int)
neighbourhood_encoded = neighbourhood_encoded.astype(dtype = int)
room_type_encoded = room_type_encoded.astype(dtype = int)

In [35]:
print('[ENCODED CLASSES_]: ',neighbourhood_group_classes_,neighbourhood_classes_, room_type_classes_, sep = '\n \n')

In [36]:
df2 = df1.copy()
df2

In [37]:
df2['neighbourhood_group'] = neighbourhood_group_encoded
df2['neighbourhood'] = neighbourhood_encoded
df2['room_type'] = room_type_encoded

In [38]:
neighbourhood_group_encoded

In [39]:
df2

### last_review --> separate year & month

In [41]:
# Turn the last_review date into numeric year / month features and drop the
# raw date column; df2 itself is left unchanged.
review_dates = pd.to_datetime(df2['last_review'])
df3 = (
    df2
    .drop(columns='last_review')
    .assign(
        last_review_year=review_dates.dt.year,
        last_review_month=review_dates.dt.month,
    )
)

In [42]:
df3.info()

* object>> numerical
* correlation
* model

In [43]:
df3.head(3)

### normalization

In [53]:
def preprocess_final(ds, scaler = False):
    """Split ``ds`` into features and target, optionally scaling the features.

    Parameters
    ----------
    ds : pandas.DataFrame
        Fully numeric frame containing a 'price' column. It is not modified.
    scaler : object or False, default False
        Any object with a ``fit_transform(X)`` method (e.g. StandardScaler(),
        MinMaxScaler()). A falsy value leaves the features unscaled.

    Returns
    -------
    (X, y) : (pandas.DataFrame, pandas.Series)
        ``X`` is every column except 'price'; ``y`` is the 'price' column.
        Both share the index of ``ds``.

    Notes
    -----
    The scaler is fitted on every row passed in. When the caller splits into
    train/test afterwards, test-set statistics have already influenced the
    scaling.
    """
    X = ds.drop(columns=['price'])
    y = ds['price'].copy()

    if scaler:
        # fit_transform returns a bare array; re-attach the original index so
        # X stays row-aligned with y even when ds has a non-default index.
        X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns, index = X.index)

    return X, y

In [58]:
# Without scaling

# Build the unscaled features in this cell: no earlier cell defines X, so a
# fresh "Restart & Run All" would otherwise stop here with a NameError.
X, y = preprocess_final(df3)

plt.figure(figsize = (18,14))

# One boxplot per feature on a 3x4 grid (holds at most 12 panels).
for i, col in enumerate(X.columns):
    
    plt.subplot(3,4, i+1)
    sns.boxplot(X[col])

In [60]:
X,y = preprocess_final(df3, scaler = StandardScaler())

In [61]:
# With standard scaling

# One boxplot per column of the current X, placed on a 3x4 subplot grid
# (the grid holds at most 12 panels).
plt.figure(figsize = (18,14))

for i, col in enumerate(X.columns):
    
    plt.subplot(3,4, i+1)  # subplot positions are 1-based
    sns.boxplot(X[col])

In [63]:
# Rebuild X with every feature passed through MinMaxScaler; y is the price column.
X,y = preprocess_final(df3, scaler = MinMaxScaler())

# With MinMax scaling

# One boxplot per column of the rescaled X, placed on a 3x4 subplot grid
# (the grid holds at most 12 panels).
plt.figure(figsize = (18,14))

for i, col in enumerate(X.columns):
    
    plt.subplot(3,4, i+1)  # subplot positions are 1-based
    sns.boxplot(X[col])

# Train test

### 1. without scaling

In [68]:
X,y = preprocess_final(df3)

X_train, X_test,  y_train,y_test = train_test_split(X,y, train_size = 0.8, random_state = 2)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

## Regression problem

In [66]:
# Candidate regressors, keyed by a right-aligned display name.
# Every estimator with a stochastic component gets a fixed seed so that a
# re-run reproduces the same scores; the value matches the random_state
# used for train_test_split in this notebook.
SEED = 2

models = {
    '                    Linear Regression': LinearRegression(),
    '                  KNeighborsRegressor':KNeighborsRegressor(n_neighbors=10),
    'Linear Regression (L2 Regularization)': Ridge(),
    # Lasso is the L1-penalised model; the key previously read "L!".
    'Linear Regression (L1 Regularization)': Lasso(),
    '                       Neural Network': MLPRegressor(random_state=SEED),
    'Support Vector Machine(Linear Kernel)': LinearSVR(random_state=SEED),
    '   Support Vector Machine(RBF Kernel)': SVR(),
    '                        Decision Tree': DecisionTreeRegressor(random_state=SEED),
    '                        Random Forest': RandomForestRegressor(random_state=SEED),
    '                    Gradient Boosting': GradientBoostingRegressor(random_state=SEED),
    '                              XGBoost': XGBRegressor(random_state=SEED),
    '                             LightGBM': LGBMRegressor(random_state=SEED),
    '                             CatBoost': CatBoostRegressor(random_state=SEED),
    
}

In [69]:
# Fit every candidate regressor on the unscaled training split.
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    print(f'{label} trained ')

In [72]:
from sklearn.metrics import r2_score

# The printed "accuracy" is the R^2 score on the held-out split, times 100.
for label, estimator in models.items():
    r2 = r2_score(y_test, estimator.predict(X_test))
    print(f'{label} accuracy: {r2 * 100:.2f}')

### 2. Standard Scaling

In [73]:
X,y = preprocess_final(df3, StandardScaler())

X_train, X_test,  y_train,y_test = train_test_split(X,y, train_size = 0.8, random_state = 2)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [74]:
# Refit every candidate regressor on the standard-scaled training split.
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    print(f'{label} trained ')

In [75]:
from sklearn.metrics import r2_score

# The printed "accuracy" is the R^2 score on the held-out split, times 100.
for name, model in models.items():
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)
    # '%%' emits a literal percent sign; the previous lone trailing '%'
    # raised "ValueError: incomplete format" before anything was printed.
    print('%s accuracy: %.2f %%'%(name, score*100))

### 3. MinMax Scaling

In [77]:
X,y = preprocess_final(df3, MinMaxScaler())

X_train, X_test,  y_train,y_test = train_test_split(X,y, train_size = 0.8, random_state = 2)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [78]:
# Refit every candidate regressor on the MinMax-scaled training split.
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    print(f'{label} trained ')

In [80]:
from sklearn.metrics import r2_score

# The printed "accuracy" is the R^2 score on the held-out split, times 100.
for label, estimator in models.items():
    r2 = r2_score(y_test, estimator.predict(X_test))
    print(f'{label} accuracy: {r2 * 100:.2f} %')

# Result:
1. Without scaling
> * Random Forest accuracy: 14.13
> * LightGBM accuracy: 17.20

2. Standard scaling
> *   LightGBM accuracy: 17.42
> * CatBoost accuracy: 15.21
3. MinMax scaling
> * LightGBM accuracy: 16.76 %
> * CatBoost accuracy: 15.21 %