# Classification Homework

## Import Libraries

In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, mean_squared_error
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge

## Get the dataset

In [2]:
data = 'Homework_Dataset.csv'

df = pd.read_csv(data)
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


## Prepare Dataset

### change all categorical value into the same format: lowercase and _ instead of space 

In [3]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

df.head().T

Unnamed: 0,0,1,2,3,4
id,2539,2595,3647,3831,5022
name,clean_&_quiet_apt_home_by_the_park,skylit_midtown_castle,the_village_of_harlem....new_york_!,cozy_entire_floor_of_brownstone,entire_apt:_spacious_studio/loft_by_central_park
host_id,2787,2845,4632,4869,7192
host_name,john,jennifer,elisabeth,lisaroxanne,laura
neighbourhood_group,brooklyn,manhattan,manhattan,brooklyn,manhattan
neighbourhood,kensington,midtown,harlem,clinton_hill,east_harlem
latitude,40.64749,40.75362,40.80902,40.68514,40.79851
longitude,-73.97237,-73.98377,-73.9419,-73.95976,-73.94399
room_type,private_room,entire_home/apt,private_room,entire_home/apt,entire_home/apt
price,149,225,150,89,80


### subset new df for only columns we gonna use

In [4]:
new_df = df.loc[:, ['neighbourhood_group',
                    'room_type',
                    'latitude',
                    'longitude',
                    'price',
                    'minimum_nights',
                    'number_of_reviews',
                    'reviews_per_month',
                    'calculated_host_listings_count',
                    'availability_365']]

new_df.head().T

Unnamed: 0,0,1,2,3,4
neighbourhood_group,brooklyn,manhattan,manhattan,brooklyn,manhattan
room_type,private_room,entire_home/apt,private_room,entire_home/apt,entire_home/apt
latitude,40.64749,40.75362,40.80902,40.68514,40.79851
longitude,-73.97237,-73.98377,-73.9419,-73.95976,-73.94399
price,149,225,150,89,80
minimum_nights,1,1,3,1,10
number_of_reviews,9,45,0,270,9
reviews_per_month,0.21,0.38,,4.64,0.1
calculated_host_listings_count,6,2,1,1,1
availability_365,365,355,365,194,0


### Replace Missing values with 0

In [5]:
new_df.isnull().sum()

neighbourhood_group                   0
room_type                             0
latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [6]:
new_df.reviews_per_month = new_df.reviews_per_month.fillna(0)

In [7]:
new_df.isnull().sum()

neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

#### Question 1: 
What is the most frequent observation (mode) for the column 'neighbourhood_group'?

In [9]:
new_df.neighbourhood_group.value_counts()

manhattan        21661
brooklyn         20104
queens            5666
bronx             1091
staten_island      373
Name: neighbourhood_group, dtype: int64

## Split Data

In [11]:
df_full_train, df_test = train_test_split(new_df, test_size=0.2, random_state=42)
df_train, df_validate = train_test_split(df_full_train, test_size=0.25, random_state=42)

print(len(df_train), len(df_validate), len(df_test))

29337 9779 9779


In [12]:
df_train = df_train.reset_index(drop=True)
df_validate = df_validate.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [13]:
y_train = df_train.price.values
y_validate = df_validate.price.values
y_test = df_test.price.values

del df_train['price']
del df_validate['price']
del df_test['price']

## Feature Importance Analysis

#### Question 2:
What are the two features that have the biggest correlation in this dataset?

In [16]:
numerical_features = ['latitude',
                    'longitude',
                    'minimum_nights',
                    'number_of_reviews',
                    'reviews_per_month',
                    'calculated_host_listings_count',
                    'availability_365']

In [23]:
# Create Correlation Matrix
corrMatrix = df_train[numerical_features].corr()
print(corrMatrix)

                                latitude  longitude  minimum_nights  \
latitude                        1.000000   0.080301        0.027441   
longitude                       0.080301   1.000000       -0.060660   
minimum_nights                  0.027441  -0.060660        1.000000   
number_of_reviews              -0.006246   0.055084       -0.076020   
reviews_per_month              -0.007159   0.134642       -0.120703   
calculated_host_listings_count  0.019375  -0.117041        0.118647   
availability_365               -0.005891   0.083666        0.138901   

                                number_of_reviews  reviews_per_month  \
latitude                                -0.006246          -0.007159   
longitude                                0.055084           0.134642   
minimum_nights                          -0.076020          -0.120703   
number_of_reviews                        1.000000           0.590374   
reviews_per_month                        0.590374           1.000000   

In [26]:
# Find the biggest correlation
corrMatrix.unstack().sort_values().drop_duplicates()

reviews_per_month               minimum_nights                   -0.120703
calculated_host_listings_count  longitude                        -0.117041
number_of_reviews               minimum_nights                   -0.076020
calculated_host_listings_count  number_of_reviews                -0.073167
longitude                       minimum_nights                   -0.060660
calculated_host_listings_count  reviews_per_month                -0.048767
latitude                        reviews_per_month                -0.007159
number_of_reviews               latitude                         -0.006246
availability_365                latitude                         -0.005891
latitude                        calculated_host_listings_count    0.019375
minimum_nights                  latitude                          0.027441
number_of_reviews               longitude                         0.055084
latitude                        longitude                         0.080301
longitude                

## Make Price becomes Binary

In [34]:
y_train_bi = (y_train >= 152).astype(int)
y_validate_bi = (y_validate >= 152).astype(int)
y_test_bi = (y_test >= 152).astype(int)

#### Question 3:
Which of these two variables has bigger score?

In [37]:
categorical_features = ['neighbourhood_group', 'room_type']

In [38]:
def mutual_info_price_score(series):
    return mutual_info_score(series, y_train_bi)

In [42]:
mu_in = df_train[categorical_features].apply(mutual_info_price_score)

print(round(mu_in, 2))

neighbourhood_group    0.05
room_type              0.14
dtype: float64


## Train Logistic Regression

### One-hot Encoding

In [44]:
dv = DictVectorizer(sparse=False)

# Encoding categorical variables in training set
train_dict = df_train[categorical_features + numerical_features].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# Encoding categorical variables in validate set
validate_dict = df_validate[categorical_features + numerical_features].to_dict(orient='records')
X_validate = dv.transform(validate_dict)

### Training the model

In [48]:
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)

model.fit(X_train, y_train_bi)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=42)

In [52]:
y_pred_val = model.predict_proba(X_validate)[:, 1]
y_pred_val

array([0.03282454, 0.56879181, 0.39520619, ..., 0.10037044, 0.0342655 ,
       0.64130469])

#### Question 4:
What is the accuracy on the validation dataset

In [54]:
threshold = (y_pred_val >= 0.5)
acc = (y_validate_bi == threshold).mean()
print(round(acc, 2))

0.79


## Feature Elimination

In [70]:
dv = DictVectorizer(sparse=False)
features = categorical_features + numerical_features
diff_dict = {}

for f in features:
    f_li = categorical_features + numerical_features
    # remove 1 feature
    f_li.remove(f)

    # Encoding categorical variables in training set
    train_dict = df_train[f_li].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    # Encoding categorical variables in validate set
    validate_dict = df_validate[f_li].to_dict(orient='records')
    X_validate = dv.transform(validate_dict)

    # Train model
    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)

    model.fit(X_train, y_train_bi)

    # get y prediction from validate set
    y_pred_val = model.predict_proba(X_validate)[:, 1]

    # calculate accuracy
    threshold = (y_pred_val >= 0.5)
    acc_without_f = (y_validate_bi == threshold).mean()
    # calculate the difference
    diff = acc - acc_without_f

    diff_dict[f] = diff

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

#### Question 5:
Which feature has the smallest difference?

In [76]:
for di in sorted(diff_dict, key=diff_dict.get, reverse=False):
    print(f'Model without feaure: {di} got difference ==> {diff_dict[di]}')

Model without feaure: number_of_reviews got difference ==> -0.0005112997238981842
Model without feaure: longitude got difference ==> -0.00030677983433891054
Model without feaure: calculated_host_listings_count got difference ==> -0.0002045198895592737
Model without feaure: latitude got difference ==> 0.00010225994477963685
Model without feaure: minimum_nights got difference ==> 0.0008180795582369838
Model without feaure: reviews_per_month got difference ==> 0.0010225994477962574
Model without feaure: availability_365 got difference ==> 0.004908477349422236
Model without feaure: neighbourhood_group got difference ==> 0.03548420083853154
Model without feaure: room_type got difference ==> 0.07015032211882599


# Regression Model with Scikit-Learn

In [81]:
# Apply log to price

y_train_log = np.log1p(y_train)
y_validate_log = np.log1p(y_validate)
y_test_log = np.log1p(y_test)

In [86]:
# Encoding categorical variables in training set
train_dict = df_train[features].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# Encoding categorical variables in validate set
validate_dict = df_validate[features].to_dict(orient='records')
X_validate = dv.transform(validate_dict)

alpha_li = [0, 0.01, 0.1, 1, 10]
rmse_li = []

# Train model & Calculate RMSE
for a in alpha_li:
    model = Ridge(alpha=a)
    model.fit(X_train, y_train_log)

    y_pred = model.predict(X_validate)

    rmse = mean_squared_error(y_validate_log, y_pred)

    rmse_li.append(rmse)

  return linalg.solve(A, Xy, sym_pos=True,


#### Question 6:
What is the best alpha?

In [89]:
for al, mse in list(zip(alpha_li, rmse_li)):
    print(f'Alpha = {al} ===> RMSE = {round(mse, 3)}')

Alpha = 0 ===> RMSE = 0.247
Alpha = 0.01 ===> RMSE = 0.247
Alpha = 0.1 ===> RMSE = 0.247
Alpha = 1 ===> RMSE = 0.247
Alpha = 10 ===> RMSE = 0.248
