<a href="https://colab.research.google.com/github/arpitchittora/arpit-ml-zoomcamp/blob/main/Classification_(ML_Zoomcamp).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [3]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv'

In [4]:
!wget $data -O data-week-3.csv

--2021-09-25 11:07:15--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7077973 (6.8M) [text/plain]
Saving to: ‘data-week-3.csv’


2021-09-25 11:07:16 (173 MB/s) - ‘data-week-3.csv’ saved [7077973/7077973]



In [5]:
df = pd.read_csv('data-week-3.csv')

In [6]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [7]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [8]:
base = ['neighbourhood_group',
'room_type',
'latitude',
'longitude',
'price',
'minimum_nights',
'number_of_reviews',
'reviews_per_month',
'calculated_host_listings_count',
'availability_365']

In [9]:
df[base].isnull().sum()

neighbourhood_group                   0
room_type                             0
latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [10]:
df = df[base].copy()

In [11]:
df.reviews_per_month = df.reviews_per_month.fillna(0).values

In [12]:
df.isnull().sum()

neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [13]:
df.groupby('neighbourhood_group').price.agg(['count'])

Unnamed: 0_level_0,count
neighbourhood_group,Unnamed: 1_level_1
bronx,1091
brooklyn,20104
manhattan,21661
queens,5666
staten_island,373


In [14]:
df['above_average'] = np.where(df.price >= 152, 1, 0)
df['above_average']

0        0
1        1
2        0
3        0
4        0
        ..
48890    0
48891    0
48892    0
48893    0
48894    0
Name: above_average, Length: 48895, dtype: int64

**Setting up the validation framework**

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [17]:
len(df_train), len(df_val), len(df_test)

(29337, 9779, 9779)

In [18]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [19]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [20]:
df_full_train.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,above_average
32645,brooklyn,entire_home/apt,40.71577,-73.9553,295,3,11,0.87,1,1,1
23615,manhattan,private_room,40.84917,-73.94048,70,2,2,0.16,1,0,0
31183,brooklyn,private_room,40.68993,-73.95947,58,2,0,0.0,2,0,0
29260,brooklyn,entire_home/apt,40.68427,-73.93118,75,3,87,4.91,1,267,0
7275,queens,private_room,40.74705,-73.89564,38,5,13,0.25,1,0,0


Feature importance: Correlation

In [21]:
numerical = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365'
]

In [22]:
corrM = df_train[numerical].corr()
corrM

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


Feature importance: Mutual information

In [23]:
from sklearn.metrics import mutual_info_score

In [24]:
categorical = [
    'neighbourhood_group',
    'room_type',
]

In [25]:
def mutual_info_price_score(series):
    return round(mutual_info_score(series, y_train), 2)

In [26]:
mi = df_train[categorical].apply(mutual_info_price_score)
mi.sort_values(ascending=False)

room_type              0.14
neighbourhood_group    0.05
dtype: float64

**One-hot encoding**

In [27]:
from sklearn.feature_extraction import DictVectorizer

In [28]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

**Logistic regression**

In [29]:
from sklearn.linear_model import LogisticRegression

In [30]:
model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)

In [31]:
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
y_pred = model.predict_proba(X_val)[:, 1]

In [33]:
price_decision = (y_pred >= 0.5)

In [34]:
accuracy = round((price_decision == y_val).mean(),2)

In [35]:
accuracy

0.79

**Feature elimination technique**

In [36]:
features = [
            'neighbourhood_group',
            'room_type',
            'number_of_reviews',
            'reviews_per_month',
]

In [37]:
def lg_reg_technique(exclue_feature):
  dv = DictVectorizer(sparse=False)

  df_temp = df_train[categorical + numerical]
  df_temp.drop([exclue_feature], axis=1, inplace=True)

  train_dict = df_temp.to_dict(orient='records')
  X_train = dv.fit_transform(train_dict)

  df_val_temp = df_val[categorical + numerical]
  df_val_temp.drop([exclue_feature], axis=1, inplace=True)

  val_dict = df_val_temp.to_dict(orient='records')
  X_val = dv.transform(val_dict)

  lg_model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
  lg_model.fit(X_train, y_train)

  lg_y_pred = lg_model.predict_proba(X_val)[:, 1]

  lg_price_decision = (lg_y_pred >= 0.5)
  lg_accuracy = round((lg_price_decision == y_val).mean(),2)
  print()
  print(exclue_feature, lg_accuracy)
  print()
  return lg_accuracy

In [38]:
accuracy_arr = []
for f in features:
  print(f)
  model_accuracy = lg_reg_technique(f)
  accuracy_arr.append({'feature':f,'model_accuracy':model_accuracy,'accuracy_diff':(model_accuracy-accuracy)})

neighbourhood_group


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,



neighbourhood_group 0.75

room_type


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,



room_type 0.73

number_of_reviews


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,



number_of_reviews 0.79

reviews_per_month


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,



reviews_per_month 0.79



In [39]:
accuracy_arr

[{'accuracy_diff': -0.040000000000000036,
  'feature': 'neighbourhood_group',
  'model_accuracy': 0.75},
 {'accuracy_diff': -0.06000000000000005,
  'feature': 'room_type',
  'model_accuracy': 0.73},
 {'accuracy_diff': 0.0,
  'feature': 'number_of_reviews',
  'model_accuracy': 0.79},
 {'accuracy_diff': 0.0,
  'feature': 'reviews_per_month',
  'model_accuracy': 0.79}]

**Ridge regression model**

In [40]:
df.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,above_average
0,brooklyn,private_room,40.64749,-73.97237,149,1,9,0.21,6,365,0
1,manhattan,entire_home/apt,40.75362,-73.98377,225,1,45,0.38,2,355,1
2,manhattan,private_room,40.80902,-73.9419,150,3,0,0.0,1,365,0
3,brooklyn,entire_home/apt,40.68514,-73.95976,89,1,270,4.64,1,194,0
4,manhattan,entire_home/apt,40.79851,-73.94399,80,10,9,0.1,1,0,0


In [41]:
y_train = np.log1p(df_train.price.values)
y_val = np.log1p(df_val.price.values)
y_test = np.log1p(df_test.price.values)

del df_train['price']
del df_val['price']
del df_test['price']

In [44]:
from sklearn.linear_model import Ridge

In [53]:
def calculate_ridge_rsme(alpha):
  train_dict = df_train[categorical + numerical].to_dict(orient='records')
  X_train = dv.fit_transform(train_dict)

  val_dict = df_val[categorical + numerical].to_dict(orient='records')
  X_val = dv.transform(val_dict)

  model = Ridge(alpha=alpha)
  model.fit(X_train,y_train)

  y_pred = model.predict(X_val)
  print(y_pred)

  se = (y_val - y_pred) ** 2
  mse = se.mean()
  error = np.sqrt(mse)

  print()
  print(alpha, error)
  print()
  return error

In [56]:
rsme_err = []
for a in [0, 0.01, 0.1, 1, 10]:
  err = calculate_ridge_rsme(a)
  rsme_err.append(round(err,3))

rsme_err

[4.11694336 5.21264648 5.00561523 ... 4.56420898 4.19702148 5.14038086]

0 0.4971901903517505

[4.11405448 5.21395699 5.01012603 ... 4.5595196  4.19404933 5.14435076]

0.01 0.4971173046190633

[4.1142478  5.21378968 5.01017209 ... 4.55940994 4.19400898 5.14469281]

0.1 0.49711832446943977

[4.116111   5.21218878 5.01060186 ... 4.55832635 4.19362144 5.14801035]

1 0.49713953633200486

[4.12973827 5.20108005 5.01293916 ... 4.54893373 4.19086998 5.17337529]

10 0.4978866015876555



[0.497, 0.497, 0.497, 0.497, 0.498]