In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#sci kit
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [4]:
df = pd.read_csv("AB_NYC_2019.csv")
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [5]:
print(df.shape)

(48895, 16)


In [6]:
# checking the columns for the null values for the EDA and preprocessing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     

In [7]:
df.isnull().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [8]:
losses_mean = df["reviews_per_month"].mean()
# replacing nan values with mean values
df["reviews_per_month"].fillna(losses_mean,inplace=True)

# replacing nan values with another categorical value
df["last_review"].fillna('unknown',inplace=True)

In [9]:
df.isnull().sum()

id                                 0
name                              16
host_id                            0
host_name                         21
neighbourhood_group                0
neighbourhood                      0
latitude                           0
longitude                          0
room_type                          0
price                              0
minimum_nights                     0
number_of_reviews                  0
last_review                        0
reviews_per_month                  0
calculated_host_listings_count     0
availability_365                   0
dtype: int64

In [10]:
# creating a two new dataframes, seperating the numeric values and objects
df_num = df.select_dtypes(["int64","float64"])
df_cat = df.select_dtypes("object")

In [11]:
df_num.drop(["id","host_id"],axis=1,inplace=True)
df_num.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,40.64749,-73.97237,149,1,9,0.21,6,365
1,40.75362,-73.98377,225,1,45,0.38,2,355
2,40.80902,-73.9419,150,3,0,1.373221,1,365
3,40.68514,-73.95976,89,1,270,4.64,1,194
4,40.79851,-73.94399,80,10,9,0.1,1,0


In [12]:
df_cat.drop(["name","host_name",'last_review'],axis=1,inplace=True)
df_cat.head()

Unnamed: 0,neighbourhood_group,neighbourhood,room_type
0,Brooklyn,Kensington,Private room
1,Manhattan,Midtown,Entire home/apt
2,Manhattan,Harlem,Private room
3,Brooklyn,Clinton Hill,Entire home/apt
4,Manhattan,East Harlem,Entire home/apt


In [13]:
for feature in df_cat.columns:
    print('The feature is {} and number of categories are {}'.format(feature,len(df[feature].unique())))

The feature is neighbourhood_group and number of categories are 5
The feature is neighbourhood and number of categories are 221
The feature is room_type and number of categories are 3


In [14]:
from sklearn.preprocessing import LabelEncoder

# Looping each column name for encoding purpose
for col in df_cat:
    le = LabelEncoder()
    # fit_transform function converts all the words into numbers
    df_cat[col] = le.fit_transform(df_cat[col])

In [15]:
# concating both the dataframes with columns into a new dataframe 
df_new = pd.concat([df_num,df_cat],axis=1)
df_new.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,neighbourhood_group,neighbourhood,room_type
0,40.64749,-73.97237,149,1,9,0.21,6,365,1,108,1
1,40.75362,-73.98377,225,1,45,0.38,2,355,2,127,0
2,40.80902,-73.9419,150,3,0,1.373221,1,365,2,94,1
3,40.68514,-73.95976,89,1,270,4.64,1,194,1,41,0
4,40.79851,-73.94399,80,10,9,0.1,1,0,2,61,0


In [16]:
# here x axis will use all the columns apart from price.
X = df_new.drop("price",axis=1)
y = df["price"]

In [17]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.3,
                                                 random_state=1)

In [23]:
from sklearn.preprocessing import PolynomialFeatures

# Here the parameter passed in the PolynomialFeature is the degree
pf = PolynomialFeatures(3)
x_poly = pf.fit_transform(X)
x_train, x_test, y_train, y_test = train_test_split(x_poly, 
                        y, random_state=1,test_size=0.3)

In [24]:
lr= LinearRegression()
lr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [25]:
y_pred = lr.predict(x_test)
print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

54596.25723251098
0.07434846007880513


In [None]:
# # Implementing libraries for Lasso and Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [None]:
for i in range(100,500,50):
    l1 = Lasso(alpha=i)
    l1.fit(x_train,y_train)
    print(i,":",l1.score(x_test,y_test))

100 : -119.60670625159693



KeyboardInterrupt



In [65]:
#Fitting of Weak Classifier
# Weak Learner or weak classifier means the algorithm that consists of error or mistakes
from sklearn.tree import DecisionTreeRegressor

dt_clf = DecisionTreeRegressor(criterion='mse',max_features='log2',max_depth=16,random_state=0)
dt_clf.fit(x_train,y_train)
dt_score = dt_clf.score(x_test,y_test)
print("decision Tree score: ",dt_score)
dt_clf.predict(x_test)
y_test

xgb_score = xgb_clf.score(x_test,y_test)
print("XGBoost Score: ",xgb_score)

decision Tree score:  -0.8267806725000804


18907     60
46663    250
19757     80
9705      95
3322     450
        ... 
12360     85
46855    150
37553    110
15102     72
34586    140
Name: price, Length: 14669, dtype: int64

In [71]:
from sklearn import datasets, ensemble


params = {'n_estimators': 200,
          'max_depth': 100,
          'min_samples_split': 10,
          'learning_rate': 0.000001,
          'loss': 'ls'}

reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(x_train, y_train)

mse = mean_squared_error(y_test, reg.predict(x_test))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

The mean squared error (MSE) on test set: 58976.3527


In [72]:
from sklearn.svm import SVR

svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
svr_rbf.fit(x_train,y_train)
svr_score =svr_rbf.score(x_test,y_test)
print(svr_score)

KeyboardInterrupt: ignored