# Chapter 9

## 9.1.2

In [None]:
import statsmodels.api as sm
import pandas as pd
housing_prices = pd.read_csv("housing_prices.csv")    #A

predictor_cols = ["sq_footage", "n_bedrooms"]
y = housing_prices["price"]
X = housing_prices[predictor_cols]    #B
X = sm.add_constant(X)    #C

model = sm.OLS(y, X).fit()    #D
print(model.summary())

#A Load the housing prices CSV into a dataframe (statsmodels and pandas are imported above)
#B Pick the predictor columns as X and the price column as the outcome y
#C Add a constant column so the model estimates a y-intercept
#D Fit the OLS model, then print its summary

In [None]:
import matplotlib.pyplot as plt    #A

housing_prices["residuals"] = model.resid
housing_prices["predicted_values"] = model.predict()    #B

plt.scatter(
    housing_prices["predicted_values"],
    housing_prices["residuals"]
)    #C
plt.axhline(y=0, color="black", linestyle="-")    #D

#A Import matplotlib's pyplot interface; without it every plt call in the notebook raises NameError
#B Save the residuals and predicted values as new columns in the dataframe
#C Create a scatterplot of the predicted values against the residuals
#D Add a horizontal line at 0 to better display the residuals centered around 0

In [None]:
residuals = housing_prices["residuals"]
plt.hist(residuals, bins=30)    #A

#A Draw a 30-bin histogram of the housing prices model's residuals

# 9.2

In [5]:
import pandas as pd    #A

rats = pd.read_csv("rat_sightings.csv")    #B
weather = pd.read_csv("weather.csv")

rats_weather = pd.merge(
    weather,
    rats,
    on="day",
    how="left"
)  #C

rats_weather.select_dtypes(include="number").corr().round(2)    #D

#A Import pandas
#B Import the rats and weather dataframes
#C Left-join the rat sightings onto the weather data by day, keeping every weather row
#D Generate correlations between the numeric columns only; "day" is still text here and would make corr() raise on pandas 2.0+

Unnamed: 0,high_temp,low_temp,humidity,wind_speed,precip,rat_sightings
high_temp,1.0,0.96,0.15,-0.23,-0.04,0.6
low_temp,0.96,1.0,0.18,-0.26,-0.03,0.62
humidity,0.15,0.18,1.0,0.03,0.23,0.15
wind_speed,-0.23,-0.26,0.03,1.0,0.21,-0.24
precip,-0.04,-0.03,0.23,0.21,1.0,-0.03
rat_sightings,0.6,0.62,0.15,-0.24,-0.03,1.0


In [12]:
day = pd.to_datetime(rats_weather["day"], format="%m/%d/%y")
rats_weather["day"] = day
rats_weather["dow"] = day.dt.dayofweek    #A
rats_weather["month_num"] = day.dt.month    #B
rats_weather.head()    #C

#A Store the day-of-week number (0 through 6, where 0 is Monday) in a new column
#B Store the calendar month number (1 through 12) in a new column
#C Show the first 5 rows of the dataset

Unnamed: 0,day,high_temp,low_temp,humidity,wind_speed,precip,rat_sightings,dow,month_num
0,2018-01-01,19.0,8.0,67.0,22.0,0.0,15,0,1
1,2018-01-02,26.0,14.0,59.0,21.0,0.0,36,1,1
2,2018-01-03,30.0,18.0,53.0,16.0,0.0,36,2,1
3,2018-01-04,29.0,20.0,92.0,37.0,0.02,14,3,1
4,2018-01-05,19.0,11.0,56.0,31.0,6.54,18,4,1


In [None]:
import seaborn as sns    #A
sns.boxplot(data=rats_weather, x="month_num", y="rat_sightings")    #B

#A Import seaborn library
#B Create boxplots to visualize the distributions of rat sightings by month number (the "month_num" column)

In [None]:
sns.boxplot(data=rats_weather, x="dow", y="rat_sightings")    #A

#A Create boxplots to visualize the distributions of rat sightings by day of the week

In [None]:
is_weekday = rats_weather["dow"] < 5
rats_weather["weekday"] = is_weekday.astype(int)    #A

weekday_corr_cols = ["high_temp", "wind_speed", "weekday", "rat_sightings"]
rats_weather[weekday_corr_cols].corr()    #B

#A Flag rows whose day-of-week number is below 5 as 1 and all other rows as 0
#B Compute correlations again, now including the new dummy variable

In [None]:
pairplot_cols = ["high_temp", "wind_speed", "weekday", "rat_sightings"]
sns.pairplot(rats_weather[pairplot_cols])    #A

#A Draw a pairplot of the selected columns of the dataframe

In [None]:
import numpy as np    #A

rats_weather["wind_speed_sq"] = rats_weather["wind_speed"]**2
rats_weather["wind_speed_sqrt"] = np.sqrt(rats_weather["wind_speed"])    #B

rats_weather[
    ["wind_speed_sq", "wind_speed_sqrt", "wind_speed", "rat_sightings"]
].corr()   #C

#A Import the numpy library
#B Create new columns with the square and square root of the wind speed column
#C Compare Pearson's correlations

## 9.2.2

In [None]:
feature_cols = ["high_temp", "wind_speed", "weekday"]
y = rats_weather["rat_sightings"]
X = rats_weather[feature_cols]    #A
X = sm.add_constant(X)    #B

model = sm.OLS(y, X).fit()    #C
print(model.summary())

#A Select the predictor columns as X and the rat sightings column as the outcome y
#B Add a constant column so the model estimates a y-intercept
#C Fit the OLS model, then print its summary

In [None]:
rats_weather["residuals"] = model.resid
rats_weather["predicted_values"] = model.predict()    #A

predicted = rats_weather["predicted_values"]
residuals = rats_weather["residuals"]
plt.scatter(predicted, residuals, s=5, color="gray")    #B
plt.axhline(y=0, linestyle="-", color="black")    #C

#A Store the model's residuals and predicted values as new rats_weather columns
#B Scatter the predicted values (x-axis) against the residuals (y-axis)
#C Draw a horizontal reference line at 0, around which the residuals are centered

In [None]:
has_low_prediction = rats_weather["predicted_values"] < 10
rats_weather[has_low_prediction]    #A

#A Show only the records whose predicted value is below 10

In [None]:
rats_weather = rats_weather[rats_weather["high_temp"] != 0].copy()    #A

#A Remove the row with incorrect weather data; copy the result so that adding columns later does not raise SettingWithCopyWarning

In [None]:
sqrt_feature_cols = ["high_temp", "wind_speed", "weekday"]
y = np.sqrt(rats_weather["rat_sightings"])    #A
X = sm.add_constant(rats_weather[sqrt_feature_cols])

model = sm.OLS(y, X).fit()
print(model.summary())    #B

#A Use the square root of the rat sightings count as the new outcome variable
#B Refit the model on the transformed outcome and print the summary

# 9.3

In [None]:
is_high_precip = rats_weather["precip"] > 0.1
rats_weather["high_precip"] = is_high_precip.astype(int)    #A

precip_corr_cols = [
    "high_temp", "wind_speed", "weekday", "high_precip", "rat_sightings"
]
rats_weather[precip_corr_cols].corr()    #B

#A Add a 0/1 indicator column marking rows where precipitation is higher than 0.1 inches
#B Look at the correlations again with the new indicator included

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression    #A

split_feature_cols = ["high_temp", "wind_speed", "weekday"]
X = rats_weather[split_feature_cols]
y = np.sqrt(rats_weather["rat_sightings"])    #B

split = train_test_split(X, y, test_size=0.2, random_state=99)
X_train, X_test, y_train, y_test = split    #C

model = LinearRegression()
model.fit(X_train, y_train)    #D

#A Import the scikit-learn helpers used for splitting, scoring, and modeling
#B Build the predictor matrix X and the square-root-transformed outcome y
#C Randomly hold out 20% of the rows as a test set (seeded with random_state=99), training on the remaining 80%
#D Fit a linear regression model on the training data

In [None]:
from sklearn.metrics import mean_squared_error    #A

y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)    #B

r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)    #C

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred)
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)    #D

print(f"Training Set RMSE: {rmse_train}")
print(f"Test Set RMSE: {rmse_test}")
print(f"Training Set R-squared: {r2_train}")
print(f"Test Set R-squared: {r2_test}")

#A Import the mean squared error evaluation metric
#B Generate predictions for the X training and test set inputs
#C Score the model (R2) on both the training and the test set
#D Compute the mean squared error (MSE) of each set of predictions, then take its square root (RMSE)