# k-Nearest Neighbors

In [1]:
# Suppress all warnings so that cell outputs stay uncluttered
import warnings
warnings.filterwarnings('ignore')
# Have the notebook front end autosave every 5 seconds
%autosave 5

Autosaving every 5 seconds


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
%matplotlib inline

# Import iris data; later cells read its .data, .target, .feature_names
# and .target_names attributes
iris = datasets.load_iris()

In [None]:
# Build a DataFrame of the iris measurements (one column per feature) and
# attach a 'target' column holding the species name of every row
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    target=iris.target_names[iris.target]
)

iris_df.head()

In [None]:
# Import estimator
from sklearn.neighbors import KNeighborsClassifier
np.random.seed(0)   # setting random seed for reproducibility

# Create an instance of the estimator, configured to use 3 neighbors
flower_classifier = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Fit the classifier on the iris measurements (features) and their class codes (labels)
flower_classifier.fit(iris.data, iris.target)

In [None]:
# Measurements of two new flowers, each stored as a two-dimensional array
# with a single row (one row per flower, one column per measurement)
new_flower1 = np.array([
    [5.1, 3.0, 1.1, 0.5],
])
new_flower2 = np.array([
    [6.0, 2.9, 4.5, 1.1],
])

In [None]:
# Make predictions. The classifier was trained on the integer codes in
# iris.target, so predict() returns an array of integer class codes.
new_flower_pred1 = flower_classifier.predict(new_flower1)
new_flower_pred2 = flower_classifier.predict(new_flower2)

# Translate the class codes into species names before printing, so the output
# matches its "Predicted species" label instead of showing raw codes
print(f'Predicted species of new_flower1: {iris.target_names[new_flower_pred1]}')
print(f'Predicted species of new_flower2: {iris.target_names[new_flower_pred2]}')

In [None]:
# Make more predictions: several flowers can be classified in one call,
# one row per flower
new_flowers = np.array([[5.5, 2.1, 2.0, 0.9],
                        [7.2, 3.8, 9.0, 1.9]])

new_flowers_pred = flower_classifier.predict(new_flowers)

# Show species names rather than the integer class codes returned by predict()
print(f'Predicted species of new flowers: {iris.target_names[new_flowers_pred]}')

In [None]:
# TASK --- Predict species of new flowers


# Logistic Regression

In [None]:
# Import data ('student_alc.csv' is resolved relative to the notebook's working directory)
data = pd.read_csv('student_alc.csv')

# Show first five rows
data.head()

In [None]:
# Names of the input columns (a list) and of the column to predict (a string)
features = [
    'sex', 'famsize', 'age', 'studytime',
    'famrel', 'goout', 'freetime', 'G3',
]
target = 'alc'

In [None]:
# Binary-encode the two-level categorical columns as 0/1.
# (This is a label mapping, not one-hot encoding: each column stays a single column.)
binary_codes = {
    'sex': {'F': 0, 'M': 1},
    'famsize': {'LE3': 0, 'GT3': 1},
    'alc': {'Low': 0, 'High': 1},
}
for column, codes in binary_codes.items():
    # Skip columns that are already numeric: mapping them a second time would
    # match none of the original labels and overwrite every value with NaN
    # if this cell is re-run
    if not pd.api.types.is_numeric_dtype(data[column]):
        data[column] = data[column].map(codes)

In [None]:
# Pull the feature columns and the target column out of the DataFrame as arrays
X = data.loc[:, features].values
y = data.loc[:, target].values

In [None]:
# Calculate a benchmark: the share of rows in each 'alc' class.
# Always guessing the most common class would score the larger of these proportions.
data['alc'].value_counts(normalize=True)

In [None]:
# Import necessary model
from sklearn.linear_model import LogisticRegression
np.random.seed(0)   # setting random seed for reproducibility

# Create instance of Logistic Regression object
student_classifier_logreg = LogisticRegression(C=2, solver='liblinear')

# Train logistic regression model on the full dataset (no train/test split is made here);
# as the last expression in the cell, the fitted estimator is also displayed
student_classifier_logreg.fit(X, y)

In [None]:
# Predict on the same rows the model was trained on and keep the result
# next to the true labels
data['predictions_logreg'] = student_classifier_logreg.predict(X)

# Cross-tabulate predicted classes (rows) against actual classes (columns)
conf_matrix = pd.crosstab(index=data['predictions_logreg'], columns=data['alc'])
conf_matrix

In [None]:
# Calculate accuracy from confusion matrix.
# Reindex to the full 2x2 layout first: if the model never predicts one of the
# classes, crosstab omits that row and a positional lookup such as
# iloc[1, 1] would raise IndexError.
class_labels = [0, 1]
conf_matrix_full = conf_matrix.reindex(index=class_labels, columns=class_labels, fill_value=0)

# Correct predictions sit on the diagonal (predicted class == actual class)
accuracy = np.trace(conf_matrix_full.values) / data.shape[0]

print(f"The model's accuracy was {(accuracy * 100):.4f}%")

# Random Forests

In [None]:
# Import RandomForestClassifier object
from sklearn.ensemble import RandomForestClassifier
np.random.seed(0)   # setting random seed for reproducibility (run this cell before training the forest)

In [None]:
# TASK --- Create instance of object and train random forest model


In [None]:
# Store the random forest's predictions for the training rows in a new column.
# NOTE: student_classifier_rf must first be created and trained in the TASK cell above.
data['predictions_rf'] = student_classifier_rf.predict(X)

# Cross-tabulate predicted classes (rows) against actual classes (columns)
conf_matrix = pd.crosstab(index=data['predictions_rf'], columns=data['alc'])
conf_matrix

In [None]:
# TASK --- Calculate accuracy


In [None]:
# Features: [sex, famsize, age, studytime, famrel, goout, freetime, G3]
# (same order as the `features` list; sex and famsize use the 0/1 codes from the encoding cell)
new_student1 = np.array([[1, 1, 18, 2, 1, 5, 5, 10]])

# Make prediction (requires student_classifier_rf from the TASK cell above)
new_student1_pred = student_classifier_rf.predict(new_student1)

# Decode prediction
def decode(prediction):
    """
    Translate an encoded alcohol-consumption prediction into its label.

    Returns 'High' when `prediction` equals 1 and 'Low' for anything else.
    `prediction` may be a plain number or a single-element array, such as
    the result of predict() for one student.
    """
    return 'High' if prediction == 1 else 'Low'

# Show the decoded label for new_student1 as the cell's output
decode(new_student1_pred)

In [None]:
# Two more students to classify, each a single-row 2-D array with values in
# the same order as the `features` list
new_student2 = np.array([
    [0, 1, 19, 5, 5, 1, 1, 18],
])
new_student3 = np.array([
    [1, 0, 17, 2, 0, 3, 1, 8],
])

In [None]:
# TASK --- Predict alcohol consumption level for new_student2


In [None]:
# TASK --- Predict alcohol consumption level for new_student3


# Linear Regression

In [None]:
# Load the housing data and derive each house's age at the time of sale
# ('Age' variable as in 'EDA with `seaborn`' lesson)
housing = pd.read_csv('house.csv').assign(
    Age=lambda frame: frame['YrSold'] - frame['YearBuilt']
)

# Show first five rows
housing.head()

In [None]:
# Keep only neighborhoods with more than 30 observations
# (i.e. drop every neighborhood that has 30 or fewer rows)
counts = housing['Neighborhood'].value_counts()
well_sampled = counts[counts > 30].index

# .copy() makes the filtered frame independent of the original, so the column
# assignments made in later cells do not trigger pandas' SettingWithCopyWarning
housing = housing.loc[housing['Neighborhood'].isin(well_sampled)].copy()

housing.shape

In [None]:
# Input columns and target column for the house-price model
# (these reuse the names `features` and `target` from the student section)
features = [
    'CentralAir', 'LotArea', 'OverallQual', 'OverallCond',
    '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'Age',
]
target = 'SalePrice'

In [None]:
# Binary-encode CentralAir as 0/1 (a label mapping on a single column, not one-hot encoding).
# Skip the mapping when the column is already numeric: re-running this cell on
# encoded values would match neither 'N' nor 'Y' and overwrite the column with NaN
if not pd.api.types.is_numeric_dtype(housing['CentralAir']):
    housing['CentralAir'] = housing['CentralAir'].map({'N': 0, 'Y': 1})

In [None]:
# One-hot encode Neighborhood: one 0/1 column per neighborhood.
# dtype=int keeps the dummies numeric; recent pandas versions otherwise return
# boolean columns, which turn the mixed feature matrix into an object array
dummies_nb = pd.get_dummies(housing['Neighborhood'], dtype=int)
dummies_nb.head()

In [None]:
# Compare dummy variables to original neighborhood data:
# each row's neighborhood should match the dummy column that holds its 1
housing['Neighborhood'].head()

In [None]:
# Join dummy variables to original dataset.
# Drop any dummy columns left over from an earlier run of this cell first, so
# re-running it does not append a second copy of every neighborhood column
housing = pd.concat(
    [housing.drop(columns=dummies_nb.columns, errors='ignore'), dummies_nb],
    axis=1,
)
housing.head()

In [None]:
# Add dummy variables to feature list, skipping any that are already present
# so that re-running this cell does not list the same column twice
features += [column for column in dummies_nb.columns if column not in features]
features

In [None]:
# Feature matrix, target vector, and the number of houses (rows)
X = housing.loc[:, features].values
y = housing.loc[:, target].values
n = len(housing)

In [None]:
# Benchmark prediction: the average sale price across all houses
y_mean = y.mean()
y_mean

In [None]:
# Benchmark RMSE: the error made by always predicting the mean sale price
benchmark_squared_errors = (y - y_mean) ** 2
RMSE_benchmark = np.sqrt(benchmark_squared_errors.sum() / n)
RMSE_benchmark

In [None]:
# Build linear regression model
from sklearn.linear_model import LinearRegression
np.random.seed(0)   # setting random seed for reproducibility

# Create and fit in one step (fit() hands back the fitted estimator)
lr = LinearRegression().fit(X, y)

# Predict on the training rows; keep the result as a column and as an array
housing['predictions'] = lr.predict(X)
y_pred = housing['predictions'].to_numpy()

In [None]:
# RMSE of the linear regression model, for comparison with the benchmark RMSE
lr_squared_errors = (y - y_pred) ** 2
RMSE_lr = np.sqrt(lr_squared_errors.sum() / n)
RMSE_lr

In [None]:
# Scatter plot comparing predictions to actual sale prices
## Set style
sns.set(style='white', palette='deep', rc={'figure.figsize': [8, 8]})

## Actual prices on the x-axis, predicted prices on the y-axis
fig, ax = plt.subplots()
ax.scatter(x=housing['SalePrice'], y=housing['predictions'], edgecolor='white')
ax.set(
    title='Predicted House Price vs Actual House Price',
    xlabel='Actuals',
    ylabel='Predictions',
)

## Gray dashed diagonal line (markers lie on it wherever prediction == actual),
## drawn from the smallest to the largest of the current axis limits
axis_limits = [*ax.get_xlim(), *ax.get_ylim()]
lims = [min(axis_limits), max(axis_limits)]
ax.plot(lims, lims, color='0.5', linestyle='--')

## Remove top and right spines
sns.despine()
plt.show()

In [None]:
# TASK --- Predict price of new_house1
## Remember, features are ['CentralAir', 'LotArea', 'OverallQual', 'OverallCond',
## '1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'Age'] followed by one 0/1 dummy column per neighborhood


In [None]:
# TASK --- Predict price of new_house2
