# Lab 2

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the data
housing = pd.read_csv('https://raw.githubusercontent.com/thomouvic/SENG474/main/data/housing.csv')
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [8]:
# Check for any invalid values
housing.isna().sum()
# this meas there's 207 null values in bedrooms, but that's it
# We will need to fix this at some point, replace them with median values

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_house,bedrooms_ratio,population_per_house
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909,5.429,0.213039,3.070655
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874,2.474173,0.057983,10.38605
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0,0.846154,0.1,0.692308
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0,4.440716,0.175427,2.429741
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0,5.229129,0.203162,2.818116
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0,6.052381,0.239821,3.282261
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0,141.909091,1.0,1243.333333


# Attribute Combinations

In [6]:
# Experimenting with attribute combinations
housing["rooms_per_house"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_ratio"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_house"]=housing["population"]/housing["households"]

In [7]:
# Make a correlation matrix and see how our new features correlate to what we want to predict
corr_matrix = housing.corr(numeric_only=True)
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value      1.000000
median_income           0.688075
rooms_per_house         0.151948
total_rooms             0.134153
housing_median_age      0.105623
households              0.065843
total_bedrooms          0.049686
population_per_house   -0.023737
population             -0.024650
longitude              -0.045967
latitude               -0.144160
bedrooms_ratio         -0.255880
Name: median_house_value, dtype: float64

# Prepare data for ML

In [10]:
# Reload the original data
housing = pd.read_csv('https://raw.githubusercontent.com/thomouvic/SENG474/main/data/housing.csv')

# Create the stratified sampling of the data based on income categories
from sklearn.model_selection import train_test_split

housing["income_cat"] = pd.cut(housing["median_income"], 
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf], 
                               labels=[1, 2, 3, 4, 5]) #separates median_income into discrete categories

strat_train_set, strat_test_set = train_test_split(housing, test_size=0.2, random_state=42,
                                                   stratify=housing["income_cat"]) # stratify ensures proportions of income category
                                                                                   # between training and test sets

# Drop income_cat column. We won't be needing it anymore. It was only for sorting the data to split
strat_train_set.drop("income_cat", axis=1, inplace=True)
strat_test_set.drop("income_cat", axis=1, inplace=True)

# Split the data frames into features and labels
# Note that features are still Data Frame objects but the labels are Series objects
housing_train = strat_train_set.drop("median_house_value", axis=1)
housing_labels_train = strat_train_set["median_house_value"].copy()

housing_test = strat_test_set.drop("median_house_value", axis=1)
housing_labels_test = strat_test_set["median_house_value"].copy()

print(housing_train.head())
print("\n")
print(housing_labels_train.head())
print("\n")
print(housing_test.head())
print("\n")
print(housing_labels_test.head())


       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
13096    -122.42     37.80                52.0       3321.0          1115.0   
14973    -118.38     34.14                40.0       1965.0           354.0   
3785     -121.98     38.36                33.0       1083.0           217.0   
14689    -117.11     33.75                17.0       4174.0           851.0   
20507    -118.15     33.77                36.0       4366.0          1211.0   

       population  households  median_income ocean_proximity  
13096      1576.0      1034.0         2.0987        NEAR BAY  
14973       666.0       357.0         6.0876       <1H OCEAN  
3785        562.0       203.0         2.4330          INLAND  
14689      1845.0       780.0         2.2618          INLAND  
20507      1912.0      1172.0         3.5292      NEAR OCEAN  


13096    458300.0
14973    483800.0
3785     101700.0
14689     96100.0
20507    361800.0
Name: median_house_value, dtype: float64


       longi

### Clean the Data
Most machine learning algorithms cannot work with missing features, so you’ll need to take
care of these. Imputation is a common way of filling in missing values.

In [11]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median") # replaces NA values with median values

In [15]:
# Make a numerical only version of the training data and fit the imputer to that
housing_num = housing_train.select_dtypes(include=[np.number])
imputer.fit(housing_num)
# this drops ocean_proximity feature

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
13096,-122.42,37.8,52.0,3321.0,1115.0,1576.0,1034.0,2.0987,NEAR BAY
14973,-118.38,34.14,40.0,1965.0,354.0,666.0,357.0,6.0876,<1H OCEAN
3785,-121.98,38.36,33.0,1083.0,217.0,562.0,203.0,2.433,INLAND
14689,-117.11,33.75,17.0,4174.0,851.0,1845.0,780.0,2.2618,INLAND
20507,-118.15,33.77,36.0,4366.0,1211.0,1912.0,1172.0,3.5292,NEAR OCEAN


In [17]:
# As you can see we have fit the imputer to the median of the training data
print(housing_num.median(), "\n")
print(imputer.statistics_)

longitude             -118.5100
latitude                34.2600
housing_median_age      29.0000
total_rooms           2125.0000
total_bedrooms         434.0000
population            1167.0000
households             408.0000
median_income            3.5385
dtype: float64 

[-118.51     34.26     29.     2125.      434.     1167.      408.
    3.5385]


In [18]:
# Finally lets take our numerical training data and impute the missing values
X = imputer.transform(housing_num)

# This transforms our data frame into an numpy array of feature vectors with imputation
print(X.shape)

# For comparision
print(housing_num.head(1))
print(X[0])

(16512, 8)
       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
13096    -122.42      37.8                52.0       3321.0          1115.0   

       population  households  median_income  
13096      1576.0      1034.0         2.0987  
[-1.2242e+02  3.7800e+01  5.2000e+01  3.3210e+03  1.1150e+03  1.5760e+03
  1.0340e+03  2.0987e+00]


In [19]:
# Assuming our data was nice we could now convert our labels and begin working with models
y = np.asarray(housing_labels_train)

print(X[:5])
print(y[:5])

[[-1.2242e+02  3.7800e+01  5.2000e+01  3.3210e+03  1.1150e+03  1.5760e+03
   1.0340e+03  2.0987e+00]
 [-1.1838e+02  3.4140e+01  4.0000e+01  1.9650e+03  3.5400e+02  6.6600e+02
   3.5700e+02  6.0876e+00]
 [-1.2198e+02  3.8360e+01  3.3000e+01  1.0830e+03  2.1700e+02  5.6200e+02
   2.0300e+02  2.4330e+00]
 [-1.1711e+02  3.3750e+01  1.7000e+01  4.1740e+03  8.5100e+02  1.8450e+03
   7.8000e+02  2.2618e+00]
 [-1.1815e+02  3.3770e+01  3.6000e+01  4.3660e+03  1.2110e+03  1.9120e+03
   1.1720e+03  3.5292e+00]]
[458300. 483800. 101700.  96100. 361800.]


# Transformation Pipelines
We still have more preproccessing of the data we would like to perform, so in order to make things more managable, we should make a transformation pipeline that can apply multiple transformations in sequence.

In [20]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

num_pipeline = Pipeline([("imputer", SimpleImputer(strategy="median")), ("standardize", StandardScaler())])

# If you don't want to name transformers you can do it this way as well
#num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
num_pipeline


In [68]:
# Apply the transformations via the pipeline and view the results
num_pipeline.fit(housing_num)
housing_num_tr = num_pipeline.transform(housing_num)

# fit_transform() is a convenience function that calls the above two functions in order
housing_num_tr = num_pipeline.fit_transform(housing_num)

print(housing_num_tr[:5])

[[-1.42303652  1.0136059   1.86111875  0.31191221  1.35909429  0.13746004
   1.39481249 -0.93649149]
 [ 0.59639445 -0.702103    0.90762971 -0.30861991 -0.43635598 -0.69377062
  -0.37348471  1.17194198]
 [-1.2030985   1.27611874  0.35142777 -0.71224036 -0.75958421 -0.78876841
  -0.77572662 -0.75978881]
 [ 1.23121557 -0.88492444 -0.91989094  0.70226169  0.73623112  0.38317548
   0.73137454 -0.85028088]
 [ 0.71136206 -0.87554898  0.58980003  0.79012465  1.58558998  0.44437597
   1.75526303 -0.18036472]]


In [22]:
# Convert numpy array back to a data frame and print the head
df_housing_num_tr = pd.DataFrame(housing_num_tr, columns=housing_num.columns)
print(df_housing_num_tr.head())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0  -1.423037  1.013606            1.861119     0.311912        1.368167   
1   0.596394 -0.702103            0.907630    -0.308620       -0.435925   
2  -1.203098  1.276119            0.351428    -0.712240       -0.760709   
3   1.231216 -0.884924           -0.919891     0.702262        0.742306   
4   0.711362 -0.875549            0.589800     0.790125        1.595753   

   population  households  median_income  
0    0.137460    1.394812      -0.936491  
1   -0.693771   -0.373485       1.171942  
2   -0.788768   -0.775727      -0.759789  
3    0.383175    0.731375      -0.850281  
4    0.444376    1.755263      -0.180365  


In [23]:
# Compare the two data frames
print(housing_num.head())
print()
print(df_housing_num_tr.head())

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
13096    -122.42     37.80                52.0       3321.0          1115.0   
14973    -118.38     34.14                40.0       1965.0           354.0   
3785     -121.98     38.36                33.0       1083.0           217.0   
14689    -117.11     33.75                17.0       4174.0           851.0   
20507    -118.15     33.77                36.0       4366.0          1211.0   

       population  households  median_income  
13096      1576.0      1034.0         2.0987  
14973       666.0       357.0         6.0876  
3785        562.0       203.0         2.4330  
14689      1845.0       780.0         2.2618  
20507      1912.0      1172.0         3.5292  

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0  -1.423037  1.013606            1.861119     0.311912        1.368167   
1   0.596394 -0.702103            0.907630    -0.308620       -0.435925   
2  -1.203098  1.2

# STUDENT SECTION

In [24]:
# Load the data set and print the first 5 entries
# https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset
from sklearn.datasets import load_diabetes
data = load_diabetes(as_frame=True, scaled = False)
diabetes = data['frame']
diabetes.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,59.0,2.0,32.1,101.0,157.0,93.2,38.0,4.0,4.8598,87.0,151.0
1,48.0,1.0,21.6,87.0,183.0,103.2,70.0,3.0,3.8918,69.0,75.0
2,72.0,2.0,30.5,93.0,156.0,93.6,41.0,4.0,4.6728,85.0,141.0
3,24.0,1.0,25.3,84.0,198.0,131.4,40.0,5.0,4.8903,89.0,206.0
4,50.0,1.0,23.0,101.0,192.0,125.4,52.0,4.0,4.2905,80.0,135.0


In [26]:
# Check for any invalid values
diabetes.isna().sum()
# no null values

age       0
sex       0
bmi       0
bp        0
s1        0
s2        0
s3        0
s4        0
s5        0
s6        0
target    0
dtype: int64

# Attribute Combinations

In [55]:
# Experiment with different attribute combinations (Please make at least 3)
# You are welcome to experiment with these however you wish
# Just don't include target as an attribute as it is what we are trying to predict
diabetes["Combination_1"] = diabetes["age"]/diabetes["s4"]
diabetes["Combination_2"] = diabetes["bmi"]/diabetes["s3"]
diabetes["Combination_3"] = diabetes["bp"]/diabetes["s3"]

target           1.000000
bmi              0.586450
s5               0.565883
Combination_2    0.561842
Combination_3    0.512720
bp               0.441482
s4               0.430453
s6               0.382483
s1               0.212022
age              0.187889
s2               0.174054
sex              0.043062
Combination_1   -0.220875
s3              -0.394789
Name: target, dtype: float64

In [56]:
# Make a correlation matrix and see how our new features correlate to what we want to predict ("target")
corr_matrix = diabetes.corr(numeric_only=True)
corr_matrix["target"].sort_values(ascending=False)

target           1.000000
bmi              0.586450
s5               0.565883
Combination_2    0.561842
Combination_3    0.512720
bp               0.441482
s4               0.430453
s6               0.382483
s1               0.212022
age              0.187889
s2               0.174054
sex              0.043062
Combination_1   -0.220875
s3              -0.394789
Name: target, dtype: float64

# Prepare data for ML

In [58]:
# Reload the original data
data = load_diabetes(as_frame=True, scaled = False)
diabetes = data['frame']

# Create stratified train and test sets as in Lab 1
diabetes["bmi_cat"] = pd.cut(diabetes["bmi"],
                               bins=[0., 18.5, 25., 30., np.inf],
                               labels=[1, 2, 3, 4])

strat_train_set, strat_test_set = train_test_split(diabetes, test_size=0.2, random_state=42,
                                                   stratify=diabetes["bmi_cat"])

# Drop the bmi_cat column once the training and test sets have been made
strat_train_set.drop("bmi_cat", axis=1, inplace=True)
strat_test_set.drop("bmi_cat", axis=1, inplace=True)

# Split the data frames into features and labels
diabetes_train = strat_train_set.drop("target", axis=1, inplace=False)
diabetes_label_train = strat_train_set["target"].copy()

diabetes_test = strat_test_set.drop("target", axis=1, inplace=False)
diabetes_label_test = strat_test_set["target"].copy()

# Print the head of each data frame
# Note that features are still Data Frame objects but the labels are Series objects
print(diabetes_train.head(), "\n")
print (diabetes_label_train.head(), "\n\n\n")

print(diabetes_test.head(), "\n")
print (diabetes_label_test.head())

      age  sex   bmi      bp     s1     s2    s3    s4      s5     s6
70   48.0  1.0  19.9   91.00  189.0  109.6  69.0  3.00  3.9512  101.0
180  38.0  2.0  26.8  105.00  181.0  119.2  37.0  5.00  4.8203   91.0
199  60.0  1.0  22.2  104.67  221.0  105.4  60.0  3.68  5.6276   93.0
349  49.0  2.0  21.0   82.00  119.0   85.4  23.0  5.00  3.9703   74.0
426  57.0  2.0  23.2  107.33  231.0  159.4  41.0  5.63  5.0304  112.0 

70      48.0
180    107.0
199     90.0
349     88.0
426    120.0
Name: target, dtype: float64 



      age  sex   bmi     bp     s1     s2    s3   s4      s5     s6
161  36.0  2.0  32.3  115.0  286.0  199.4  39.0  7.0  5.4723  112.0
266  34.0  2.0  20.6   98.0  183.0   92.0  83.0  2.0  3.6889   92.0
262  44.0  2.0  38.2  123.0  201.0  126.6  44.0  5.0  5.0239   92.0
290  65.0  2.0  33.5  102.0  190.0  126.2  35.0  5.0  4.9698  102.0
424  49.0  2.0  27.4   89.0  177.0  113.0  37.0  5.0  4.9053   97.0 

161    217.0
266     45.0
262    308.0
290    332.0
424    111.0
Name:

# Transformation Pipelines

In [63]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# Create a pipeline with a StandardScaler and print the pipeline
# all numerical and no missing values, do not need to impute

num_pipeline= Pipeline([("standardize", StandardScaler())])
num_pipeline

In [82]:
# Transform the training set using the new pipeline and print the first 5 values of the results
diabetes_num_tr = num_pipeline.fit_transform(diabetes_train)
# this turns the dataframe into a numpy array
print(diabetes_num_tr[:5])

[[ 0.01692276 -0.93683487 -1.44231005 -0.24835229 -0.01731729 -0.186248
   1.45956808 -0.82553249 -1.35869016  0.84297513]
 [-0.74894059  1.06742397  0.0908289   0.76322514 -0.25130788  0.13480456
  -0.97703593  0.71340376  0.29404175 -0.02508269]
 [ 0.93595877 -0.93683487 -0.93126373  0.73938082  0.91864505 -0.3267085
   0.7742732  -0.30229416  1.82925112  0.14852887]
 [ 0.09350909  1.06742397 -1.19789659 -0.89865207 -2.06473491 -0.995568
  -2.04305018  0.71340376 -1.32236847 -1.50078098]
 [ 0.70619977  1.06742397 -0.70906968  0.93158053  1.21113328  1.47921216
  -0.67246043  1.19816868  0.69358032  1.79783873]]


In [89]:
# Convert the numpy array back to a data frame and print the head
df_diabetes_num_tr = pd.DataFrame(diabetes_num_tr, columns=diabetes_train.columns)

df_diabetes_num_tr.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.016923,-0.936835,-1.44231,-0.248352,-0.017317,-0.186248,1.459568,-0.825532,-1.35869,0.842975
1,-0.748941,1.067424,0.090829,0.763225,-0.251308,0.134805,-0.977036,0.713404,0.294042,-0.025083
2,0.935959,-0.936835,-0.931264,0.739381,0.918645,-0.326708,0.774273,-0.302294,1.829251,0.148529
3,0.093509,1.067424,-1.197897,-0.898652,-2.064735,-0.995568,-2.04305,0.713404,-1.322368,-1.500781
4,0.7062,1.067424,-0.70907,0.931581,1.211133,1.479212,-0.67246,1.198169,0.69358,1.797839
