# DS-7331 Machine Learning Mini Project
## Airbnb Price Data - Logistic and SVM
### Allen Miller, Ana Glaser, Jake Harrison, Lola Awodipe

https://nbviewer.jupyter.org/github/allenmiller17/SMU_7331_ML1_Project_1/blob/main/Project_1_Final_Write_Up.ipynb

In [1]:
import warnings
#warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from sklearn.model_selection import train_test_split

import numpy as np
from scipy import stats

from sklearn import metrics

In [2]:
# Read the Airbnb listings CSV into a DataFrame and print its
# column / non-null count / dtype summary.
AIRBNB_CSV = "airbnb.csv"
pdata = pd.read_csv(AIRBNB_CSV)
pdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74111 entries, 0 to 74110
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      74111 non-null  int64  
 1   price                   74111 non-null  float64
 2   log_price               74111 non-null  float64
 3   property_type           74111 non-null  object 
 4   room_type               74111 non-null  object 
 5   accommodates            74111 non-null  int64  
 6   bathrooms               73911 non-null  float64
 7   bed_type                74111 non-null  object 
 8   cancellation_policy     74111 non-null  object 
 9   cleaning_fee            74111 non-null  bool   
 10  city                    74111 non-null  object 
 11  description             74111 non-null  object 
 12  first_review            58247 non-null  object 
 13  host_has_profile_pic    73923 non-null  object 
 14  host_identity_verified  73923 non-null

### Business Understanding

In [3]:
#unlogging the logged price data field for easier interpretation
#pdata['price'] = np.exp(pdata.log_price)
# Keep only fully populated rows: dropna() with default arguments removes every
# row that has a missing value in any column.
pdata = pdata.dropna()

In [4]:
# Property types that keep their own label; every other type is collapsed
# into a single "other" bucket. Alternative lists (disabled):
#value_list = ["Apartment","House","Condominium","Loft","Townhouse"]
#value_list = ["Apartment","House"]
value_list = ["Apartment"]

# True for rows whose property_type is NOT in value_list.
boolean_series = ~pdata.property_type.isin(value_list)
# Rows about to be relabelled, captured before the relabel so this frame still
# holds their original property_type values.
filtered_df = pdata[boolean_series]

# Relabel all non-listed rows with one vectorized, mask-based assignment
# instead of issuing a separate .loc write for every row index.
pdata.loc[boolean_series, 'property_type'] = "other"

# Bucket listings by longitude: (-200, -100] -> "West", (-100, 0] -> "East".
# `right=True` (intervals closed on the right) is now passed by keyword; the
# original passed a bare positional 2 in that slot, which is truthy and so
# selected the same behaviour.
pdata["region"] = pd.cut(pdata.longitude, [-200, -100, 0], right=True, labels=["West", "East"])

In [None]:
pdata.head()

# Integer encodings for the boolean / "t"-"f" flag columns and for
# property_type. After the relabelling step property_type only contains
# "Apartment" and "other", so the mapping uses the same two codes (1 and 2)
# as the executed encoding cell that follows. The previous mapping here
# ("House": 2, "other": 3) had a key that can no longer match and produced
# labels {1, 3} when the notebook was run top to bottom.
replaceStruct = {
                "cleaning_fee":     {True: 1, False: 0},
                "instant_bookable":     {"t": 1, "f": 0},
                "host_has_profile_pic":     {"t": 1, "f": 0},
                "host_identity_verified":     {"t": 1, "f": 0},
                 "property_type":     {"Apartment": 1, "other": 2},
                    }

pdata=pdata.replace(replaceStruct)
pdata.head()

In [5]:
# Per-column value -> integer code mappings, applied in one replace() call.
# The three host / booking flags share the same "t"/"f" -> 1/0 encoding.
_tf_to_int = {"t": 1, "f": 0}
replaceStruct = {
    "cleaning_fee": {True: 1, False: 0},
    "instant_bookable": dict(_tf_to_int),
    "host_has_profile_pic": dict(_tf_to_int),
    "host_identity_verified": dict(_tf_to_int),
    "property_type": {"Apartment": 1, "other": 2},
}

pdata = pdata.replace(to_replace=replaceStruct)
pdata.head()

Unnamed: 0,id,price,log_price,property_type,room_type,accommodates,bathrooms,bed_type,cancellation_policy,cleaning_fee,...,last_review,latitude,longitude,neighbourhood,number_of_reviews,review_scores_rating,zipcode,bedrooms,beds,region
1,6304928,169.0,5.129899,1,Entire home/apt,7,1.0,Real Bed,strict,1,...,9/23/2017,40.766115,-73.98904,Hell's Kitchen,6,93.0,10019,3.0,3.0,East
2,7919400,145.0,4.976734,1,Entire home/apt,5,1.0,Real Bed,moderate,1,...,9/14/2017,40.80811,-73.943756,Harlem,10,92.0,10027,1.0,3.0,East
4,3808709,115.0,4.744932,1,Entire home/apt,2,1.0,Real Bed,moderate,1,...,1/22/2017,38.925627,-77.034596,Columbia Heights,4,40.0,20009,0.0,1.0,East
5,12422935,85.0,4.442651,1,Private room,2,1.0,Real Bed,strict,1,...,9/5/2017,37.753164,-122.429526,Noe Valley,3,100.0,94131,1.0,1.0,West
7,13971273,120.0,4.787492,2,Entire home/apt,2,1.0,Real Bed,moderate,1,...,4/12/2017,34.046737,-118.260439,Downtown,9,93.0,90015,1.0,1.0,West


In [None]:
# Number of distinct values in each column of pdata.
pdata.nunique()

In [6]:


# Categorical columns that get expanded into one indicator column per level.
oneHotCols=["room_type","bed_type","neighbourhood","city","cancellation_policy","cleaning_fee","region"]
#oneHotCols=["room_type","bed_type","city","cancellation_policy","region","neighbourhood"]

# Columns removed before encoding (free text, dates, identifiers, raw coordinates).
#dropCols = ['description','neighbourhood','host_response_rate','first_review','host_since','last_review','zipcode','id','latitude','longitude']
dropCols = ['description','host_response_rate','first_review','host_since','last_review','zipcode','id','latitude','longitude']

# Rebind to the returned frames rather than mutating in place.
pdata = pdata.drop(columns=dropCols)
pdata = pd.get_dummies(pdata, columns=oneHotCols)
pdata.head(10)

Unnamed: 0,price,log_price,property_type,accommodates,bathrooms,host_has_profile_pic,host_identity_verified,instant_bookable,number_of_reviews,review_scores_rating,...,city_SF,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,cleaning_fee_0,cleaning_fee_1,region_West,region_East
1,169.0,5.129899,1,7,1.0,1,0,1,6,93.0,...,0,0,0,1,0,0,0,1,0,1
2,145.0,4.976734,1,5,1.0,1,1,1,10,92.0,...,0,0,1,0,0,0,0,1,0,1
4,115.0,4.744932,1,2,1.0,1,1,1,4,40.0,...,0,0,1,0,0,0,0,1,0,1
5,85.0,4.442651,1,2,1.0,1,1,1,3,100.0,...,1,0,0,1,0,0,0,1,1,0
7,120.0,4.787492,2,2,1.0,1,1,0,9,93.0,...,0,0,1,0,0,0,0,1,1,0
8,120.0,4.787492,2,2,1.0,1,0,0,159,99.0,...,1,0,1,0,0,0,0,1,1,0
10,100.0,4.60517,1,2,1.0,1,1,0,82,93.0,...,0,0,0,1,0,0,0,1,0,1
11,150.0,5.010635,2,4,1.5,1,1,0,29,97.0,...,0,0,0,1,0,0,0,1,1,0
13,200.0,5.298317,1,6,1.5,1,1,1,13,89.0,...,0,0,0,1,0,0,0,1,0,1
17,99.0,4.59512,2,2,2.0,1,1,0,12,88.0,...,0,0,0,1,0,0,0,1,0,1


In [None]:
# Print the plain-text repr of the encoded frame.
# NOTE(review): a bare `pdata.head()` would give the notebook's rich table display instead.
print(pdata)

In [7]:

# Target: the encoded property_type column. Features: every other column.
Y = pdata['property_type']
X = pdata.drop(columns=['property_type'])

# 80 / 20 train-test split with a fixed seed so the partition is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

x_train.head()

Unnamed: 0,price,log_price,accommodates,bathrooms,host_has_profile_pic,host_identity_verified,instant_bookable,number_of_reviews,review_scores_rating,bedrooms,...,city_SF,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,cleaning_fee_0,cleaning_fee_1,region_West,region_East
36694,92.0,4.521789,2,1.0,1,1,0,153,99.0,1.0,...,0,1,0,0,0,0,0,1,1,0
50548,150.0,5.010635,3,1.0,1,1,0,10,100.0,1.0,...,0,0,1,0,0,0,0,1,0,1
7142,60.0,4.094345,2,1.0,1,1,0,34,92.0,1.0,...,0,0,1,0,0,0,1,0,0,1
36775,280.0,5.63479,2,1.0,1,0,1,3,87.0,0.0,...,0,1,0,0,0,0,1,0,0,1
65349,220.0,5.393628,4,1.0,1,1,0,3,100.0,1.0,...,0,0,1,0,0,0,0,1,0,1


In [8]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply those same
# statistics to both splits.
ScaleObj = StandardScaler()
X_train_scaled = ScaleObj.fit_transform(x_train)
X_test_scaled = ScaleObj.transform(x_test)

In [None]:
# Share of the encoded rows that ended up in each split, as a percentage.
total_rows = len(pdata.index)
train_share = (len(x_train) / total_rows) * 100
test_share = (len(x_test) / total_rows) * 100
print("{0:0.2f}% data is in training set".format(train_share))
print("{0:0.2f}% data is in test set".format(test_share))

In [9]:

# Baseline logistic regression (liblinear solver) trained on the scaled
# training split; fit() returns the estimator itself.
modelFIRST = LogisticRegression(solver="liblinear").fit(X_train_scaled, y_train)

# Class predictions for the held-out split.
y_predict = modelFIRST.predict(X_test_scaled)

# One row of coefficients (one column per feature position) with the
# intercept appended as a final column.
coef_df = pd.DataFrame(modelFIRST.coef_).assign(intercept=modelFIRST.intercept_)
print(coef_df)

          0         1         2         3         4         5         6  \
0  0.205204  0.279383 -0.003497  0.319713  0.002386 -0.038592 -0.014984   

          7         8         9  ...       615       616       617       618  \
0  0.101226  0.073043  0.110159  ... -0.008543  0.010876 -0.004949 -0.016471   

        619       620       621       622       623  intercept  
0  0.116089 -0.006857  0.006857  0.268395 -0.268395   -0.83368  

[1 rows x 625 columns]


In [None]:
# Show the first rows of the coefficient table.
coef_df.head()

In [10]:
# Test-set accuracy of the baseline model: the fraction of held-out rows
# whose predicted class matches the observed class.
model_score = metrics.accuracy_score(y_test, modelFIRST.predict(X_test_scaled))
print(model_score)

0.7691408533021625


In [None]:
# Show the hyperparameters the baseline model was constructed with.
modelFIRST.get_params()

In [None]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score
from sklearn.linear_model import LogisticRegression

## function to get confusion matrix in a proper format
def draw_cm( actual, predicted ):
    """Plot the confusion matrix of `actual` vs `predicted` as an annotated heatmap.

    Parameters
    ----------
    actual : array-like
        Observed class labels (heatmap rows).
    predicted : array-like
        Predicted class labels (heatmap columns), same length as `actual`.

    Returns
    -------
    None
        The figure is rendered with `plt.show()`; nothing is returned.
    """
    # Sorted union of the labels that actually occur, so the axis ticks show
    # the real class codes rather than a hard-coded [0, 1].
    class_labels = np.union1d(actual, predicted)
    cm = confusion_matrix( actual, predicted, labels=class_labels)
    # Cells are integer counts, so annotate them as integers.
    sns.heatmap(cm, annot=True,  fmt='d', xticklabels = class_labels , yticklabels = class_labels )
    plt.ylabel('Observed')
    plt.xlabel('Predicted')
    plt.show()

# Accuracy of the baseline model on both splits, followed by the test-set
# confusion matrix.
print("Training accuracy",modelFIRST.score(X_train_scaled,y_train))
print()
print("Testing accuracy",modelFIRST.score(X_test_scaled, y_test))
print()
print('Confusion Matrix')
# draw_cm renders the plot itself and returns None, so it is called directly;
# wrapping it in print() only emitted a stray "None" after the figure.
draw_cm(y_test,y_predict)


In [22]:
# Estimator to tune. The grid below includes the stochastic 'sag' / 'saga'
# solvers, so the seed is fixed (same value as the per-solver comparison
# loop) to make the search results repeatable between runs.
modelGRID = LogisticRegression(random_state=42)

# Search space: L2 penalty only, three regularisation strengths, four
# solvers and three iteration caps.
parameters = {'penalty': ['l2'],
             'C': [.001,.75,10],
             'solver': ['newton-cg','liblinear','sag','saga'],
              'max_iter':[500,1000,5000]
            }

In [23]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `parameters` with 3-fold cross-validation, scored by
# accuracy and run on 8 parallel workers.
gs = GridSearchCV(
    estimator=modelGRID,
    param_grid=parameters,
    scoring='accuracy',
    cv=3,
    n_jobs=8,
)
gs.fit(X_train_scaled, y_train)

# Best-scoring parameter combination.
gs.best_params_

{'C': 0.75, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'sag'}

In [24]:
# Parameter combinations evaluated by the grid search, one dict per candidate.
gs.cv_results_['params']

[{'C': 0.75, 'max_iter': 500, 'penalty': 'l2', 'solver': 'newton-cg'},
 {'C': 0.75, 'max_iter': 500, 'penalty': 'l2', 'solver': 'liblinear'},
 {'C': 0.75, 'max_iter': 500, 'penalty': 'l2', 'solver': 'sag'},
 {'C': 0.75, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'},
 {'C': 0.75, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'newton-cg'},
 {'C': 0.75, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'},
 {'C': 0.75, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'sag'},
 {'C': 0.75, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'},
 {'C': 10, 'max_iter': 500, 'penalty': 'l2', 'solver': 'newton-cg'},
 {'C': 10, 'max_iter': 500, 'penalty': 'l2', 'solver': 'liblinear'},
 {'C': 10, 'max_iter': 500, 'penalty': 'l2', 'solver': 'sag'},
 {'C': 10, 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'},
 {'C': 10, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'newton-cg'},
 {'C': 10, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'liblinear'},
 {'C': 10, 'max_iter': 1000, 'penalty': 'l2',

In [25]:
# Mean cross-validated score of each candidate, in the same order as cv_results_['params'].
gs.cv_results_['mean_test_score']

array([0.77019278, 0.77039733, 0.77028045, 0.77030968, 0.77019278,
       0.77039733, 0.77042656, 0.77028045, 0.77016356, 0.77033889,
       0.77030967, 0.77025124, 0.77016356, 0.77033889, 0.77042656,
       0.77030967])

In [None]:
#Starting with L2
# Compare train / test accuracy of an L2-penalised logistic regression
# (C = 0.75, fixed seed) across the four solvers.
train_score=[]
test_score=[]
solver = ['newton-cg','liblinear','sag','saga']
for solver_name in solver:
    model = LogisticRegression(random_state=42,penalty='l2', C = 0.75,solver=solver_name)  # changing values of solver
    model.fit(X_train_scaled, y_train)
    # score() predicts internally, so no separate predict() call is made here.
    # The previous `y_predict = model.predict(...)` was never read and
    # overwrote the baseline model's `y_predict` used by the confusion-matrix cell.
    train_score.append(round(model.score(X_train_scaled, y_train),3))
    test_score.append(round(model.score(X_test_scaled, y_test),3))

# Solver names, then the matching train and test accuracies (same order).
print(solver)
print()
print(train_score)
print()
print(test_score)

In [None]:
# Number of distinct values in each column of pdata.
pdata.nunique()

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the scaled
# training split.
svm_model = SVC(kernel="linear")
svm_model.fit(X=X_train_scaled, y=y_train)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

#Evaluate the SVM Model
# Predict the held-out split, then print the raw confusion matrix followed by
# the per-class precision / recall / F1 report.
y_pred = svm_model.predict(X_test_scaled)
svm_reports = (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
)
for svm_report in svm_reports:
    print(svm_report)