# Regression Analysis

### Step 1: Importing Libraries

In [1]:
#Importing all relevant libraries

import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

### Step 2: Cleaning Control Variables

In [2]:
#Load listings dataset

df = pd.read_csv('C:/Users/chris/Desktop/Listings+Frequency_Reduced.csv')

#Cleaning variables

#Bedrooms
df['bedrooms'].fillna(int(df['bedrooms'].mean()), inplace = True)

#Price
df['price'] = df['price'].str.replace(',','')
df['price'] = df['price'].str.replace('$','')
df['price'] = df.price.astype(float)

#Bathrooms
df['bathrooms_text'] = df['bathrooms_text'].str.replace('[abcdefghijklmnopqrstuvwxyzPHS-]','')
df['bathrooms_text'] = df['bathrooms_text'].str.replace(' ','')
df = df[df.bathrooms_text != '']
df['bathrooms_text'] = df.bathrooms_text.astype(float)
df['bathrooms_text'].fillna(int(df['bathrooms_text'].mean()), inplace = True)


  df['price'] = df['price'].str.replace('$','')
  df['bathrooms_text'] = df['bathrooms_text'].str.replace('[abcdefghijklmnopqrstuvwxyzPHS-]','')


In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,0,2601191,1,5,15,15,10,17,10,19,...,5.0,4.8,4.6,,f,1,1,0,0,0.06
1,1,10805394,3,6,15,18,8,19,8,18,...,4.4,4.6,4.4,,f,1,0,1,0,0.14
2,2,51465104,3,9,14,13,11,17,6,18,...,5.0,4.75,4.75,,f,1,1,0,0,1.18
3,3,49286229,3,4,10,17,12,15,5,15,...,5.0,5.0,4.5,,f,1,1,0,0,0.64
4,4,30439280,5,10,12,16,12,17,7,17,...,5.0,4.67,4.67,,t,1,1,0,0,0.08


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24251 entries, 0 to 24302
Data columns (total 96 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Unnamed: 0                                    24251 non-null  int64  
 1   id                                            24251 non-null  int64  
 2   Topic 0                                       24251 non-null  int64  
 3   Topic 1                                       24251 non-null  int64  
 4   Topic 2                                       24251 non-null  int64  
 5   Topic 3                                       24251 non-null  int64  
 6   Topic 4                                       24251 non-null  int64  
 7   Topic 5                                       24251 non-null  int64  
 8   Topic 6                                       24251 non-null  int64  
 9   Topic 7                                       24251 non-null 

### Step 3: Defining X and Y variables

In [5]:

X = df[["Topic 0","Topic 1","Topic 2","Topic 3","Topic 4",
        "Topic 5","Topic 6","Topic 7","Topic 8","Topic 9",
        "Topic 10","Topic 11","Topic 12","Topic 13","Topic 14",
        "Topic 15","Topic 16","Topic 17","Topic 18","Topic 19",
        "bedrooms","price","bathrooms_text"]]
y = df["review_scores_rating"]


### Step 4: Running Linear Regression

In [6]:
#Conduct regression on independent variables (topics, bedrooms, bathrooms, price) 
#& dependent variable (review_score_rating)


regr = linear_model.LinearRegression()
regr.fit(X, y)

LinearRegression()

In [7]:
regr.coef_

array([-4.70395727e-03,  2.72700227e-04, -5.63342836e-04,  1.29928867e-03,
        1.84548853e-04, -1.02855275e-03, -2.78083957e-03,  1.79473238e-03,
        1.13019636e-03,  2.91891920e-04, -4.30866485e-04, -1.36883831e-03,
        1.07631256e-03, -1.16969750e-03,  1.54822320e-03, -8.18279503e-04,
       -2.77366383e-04,  7.56767540e-05, -1.48344805e-03,  1.06137673e-04,
       -5.65533929e-03, -7.61808805e-07,  6.95693231e-03])

In [8]:
#Installing Stats Model library for better respresentation of Regression Results

import statsmodels.api as sm
from scipy import stats

In [9]:
est = sm.OLS(y, X)

In [10]:
est.fit().summary()

0,1,2,3
Dep. Variable:,review_scores_rating,R-squared (uncentered):,0.87
Model:,OLS,Adj. R-squared (uncentered):,0.87
Method:,Least Squares,F-statistic:,7045.0
Date:,"Wed, 16 Mar 2022",Prob (F-statistic):,0.0
Time:,18:39:08,Log-Likelihood:,-47425.0
No. Observations:,24251,AIC:,94900.0
Df Residuals:,24228,BIC:,95080.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Topic 0,-0.0102,0.001,-9.035,0.000,-0.012,-0.008
Topic 1,-0.0017,0.001,-1.436,0.151,-0.004,0.001
Topic 2,0.0011,0.001,1.137,0.255,-0.001,0.003
Topic 3,-0.0050,0.001,-4.214,0.000,-0.007,-0.003
Topic 4,-0.0137,0.001,-10.911,0.000,-0.016,-0.011
Topic 5,0.0068,0.001,6.409,0.000,0.005,0.009
Topic 6,0.0057,0.001,5.053,0.000,0.003,0.008
Topic 7,0.0078,0.001,6.094,0.000,0.005,0.010
Topic 8,-0.0011,0.001,-0.814,0.415,-0.004,0.001

0,1,2,3
Omnibus:,18266.978,Durbin-Watson:,1.569
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1158326.515
Skew:,-3.065,Prob(JB):,0.0
Kurtosis:,36.298,Cond. No.,1060.0
