Working with Neural Network Models

&copy; Hans Nieminen, Satakunta University of Applied Sciences

# Exercise 1.1

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer

In [None]:
# Toy data set for the imputation exercise: four numeric columns,
# each of which contains at least one missing value (NaN).
data = dict(
    A=[1, 4, 7, 10, np.nan, 16, 18, 20],
    B=[2, np.nan, 10, 11, 14, 17, 12, np.nan],
    C=[6, 6, 9, 12, 15, np.nan, 12, 23],
    D=[4, 4, 5, np.nan, 5, 2, 5, 2],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,6.0,4.0
1,4.0,,6.0,4.0
2,7.0,10.0,9.0,5.0
3,10.0,11.0,12.0,
4,,14.0,15.0,5.0
5,16.0,17.0,,2.0
6,18.0,12.0,12.0,5.0
7,20.0,,23.0,2.0


In [None]:
# One imputer per column of df; each one is applied in its own cell below.
# A: replace NaN with the mean of the observed values.
imputerA = SimpleImputer(missing_values=np.nan, strategy='mean')
# B and C: k-nearest-neighbours imputers configured with k=2 and k=3.
imputerB = KNNImputer(n_neighbors=2)
imputerC = KNNImputer(n_neighbors=3)
# D: replace NaN with the most frequent observed value.
imputerD = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [None]:
# Fill the missing values of column B with the 2-nearest-neighbours imputer.
# NOTE(review): the imputer is fitted on column B alone, so there are no other
# features on which to measure neighbour distance; the filled value shown in
# the output (11.0) equals the plain mean of the observed B values (66 / 6).
# Confirm whether fitting on the whole frame was intended.
df[['B']] = imputerB.fit_transform(df[['B']])
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,6.0,4.0
1,4.0,11.0,6.0,4.0
2,7.0,10.0,9.0,5.0
3,10.0,11.0,12.0,
4,,14.0,15.0,5.0
5,16.0,17.0,,2.0
6,18.0,12.0,12.0,5.0
7,20.0,11.0,23.0,2.0


In [None]:
# Fill the missing value of column C with the 3-nearest-neighbours imputer.
# NOTE(review): as with column B, only the single column is passed in, and the
# filled value shown in the output (11.857143) equals the mean of the observed
# C values (83 / 7). Confirm whether fitting on the whole frame was intended.
df[['C']] = imputerC.fit_transform(df[['C']])
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,6.0,4.0
1,4.0,11.0,6.0,4.0
2,7.0,10.0,9.0,5.0
3,10.0,11.0,12.0,
4,,14.0,15.0,5.0
5,16.0,17.0,11.857143,2.0
6,18.0,12.0,12.0,5.0
7,20.0,11.0,23.0,2.0


In [None]:
# Fill the missing value of column A with the column mean, then round.
# round() is applied to the whole transformed column; the observed values are
# already whole numbers, so only the imputed entry changes (76 / 7 -> 11).
df[['A']] = imputerA.fit_transform(df[['A']]).round()
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,6.0,4.0
1,4.0,11.0,6.0,4.0
2,7.0,10.0,9.0,5.0
3,10.0,11.0,12.0,
4,11.0,14.0,15.0,5.0
5,16.0,17.0,11.857143,2.0
6,18.0,12.0,12.0,5.0
7,20.0,11.0,23.0,2.0


In [None]:
# Fill the missing value of column D with its most frequent observed value
# (5 occurs three times among the observed entries).
df[['D']] = imputerD.fit_transform(df[['D']])
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,6.0,4.0
1,4.0,11.0,6.0,4.0
2,7.0,10.0,9.0,5.0
3,10.0,11.0,12.0,5.0
4,11.0,14.0,15.0,5.0
5,16.0,17.0,11.857143,2.0
6,18.0,12.0,12.0,5.0
7,20.0,11.0,23.0,2.0


In [None]:
# Mean over every cell of the fully imputed frame, rounded to 3 decimals.
# NOTE(review): getting a single scalar from axis=None relies on pandas >= 2.0
# semantics (reduce over both axes) - confirm the pinned pandas version.
round(df.mean(axis=None),3)

9.433

# Exercise 1.2

In [None]:
import numpy as np

In [None]:
def is_vector(array):
    """Tell whether ``array`` is a one-dimensional numeric NumPy array.

    Accepted dtype kinds are 'b' (boolean), 'u' (unsigned integer),
    'i' (signed integer) and 'f' (float).

    Returns:
        bool: True only when ``array`` is an ``np.ndarray``, has exactly one
        dimension and has one of the accepted dtype kinds.
    """
    if not isinstance(array, np.ndarray):
        return False
    if array.ndim != 1:
        return False
    return array.dtype.kind in {'b', 'u', 'i', 'f'}

In [None]:
def distance(vector1, vector2, distance_type='euclidean'):
  """Return the distance between two numeric vectors of equal length.

  Args:
    vector1: 1-D NumPy array with a boolean, integer or float dtype
      (validated with ``is_vector``).
    vector2: 1-D NumPy array of the same length as ``vector1``.
    distance_type: 'euclidean' (default) or 'manhattan'; matched
      case-insensitively.

  Returns:
    The Euclidean (L2) or Manhattan (L1) distance as a NumPy scalar.

  Raises:
    TypeError: if either argument is not a numeric 1-D NumPy array.
    ValueError: if the lengths differ or ``distance_type`` is not supported.
  """
  if not (is_vector(vector1) and is_vector(vector2)):
    raise TypeError('Parameters are not vectors.')
  if len(vector1) != len(vector2):
    raise ValueError('Vector length is not same.')

  metric = distance_type.lower()
  if metric not in ('euclidean', 'manhattan'):
    raise ValueError("distance_type has to be one of the following: 'euclidean', 'manhattan'")

  # Subtract in a signed (or float) dtype. Unsigned inputs would otherwise wrap
  # around (uint8: 1 - 3 == 254) and boolean inputs do not support '-' at all.
  # Promoting together with int8 leaves signed-integer and float inputs in
  # their original common dtype, so existing results are unchanged.
  signed_dtype = np.result_type(vector1.dtype, vector2.dtype, np.int8)
  diff = vector1.astype(signed_dtype) - vector2.astype(signed_dtype)

  if metric == 'euclidean':
    return np.linalg.norm(diff)
  return np.sum(np.abs(diff))

In [None]:
# Two sample vectors for a quick check of distance().
a = np.array([1,2,3])
b = np.array([3,1,5])

# The element-wise differences are (-2, 1, -2):
# Euclidean sqrt(4 + 1 + 4) = 3, Manhattan 2 + 1 + 2 = 5.
print(distance(a,b,'euclidean'))
print(distance(a,b,'manhattan'))

3.0
5


In [None]:
def distance_matrix(matrix, distance_type='euclidean'):
  """Build the symmetric matrix of pairwise distances between rows.

  Args:
    matrix: indexable collection of vectors; ``matrix[i]`` is passed to
      ``distance`` as one vector.
    distance_type: metric name forwarded unchanged to ``distance``.

  Returns:
    A square float array of shape (len(matrix), len(matrix)) in which the
    entry [i, j] is the distance between ``matrix[i]`` and ``matrix[j]``.
  """
  size = len(matrix)
  result = np.zeros((size, size))
  for row in range(size):
    # Only the upper triangle (diagonal included) is computed; each value is
    # mirrored into the lower triangle.
    for col in range(row, size):
      result[row, col] = result[col, row] = distance(matrix[row], matrix[col], distance_type)
  return result

In [None]:
# Six sample vectors of length five, one per row; the last row is the origin.
input_matrix = np.array([
    [1, 2, 3, 5, 3],
    [3, 1, 5, 7, -1],
    [2, 7, 1, 8, -1],
    [4, 6, 1, -2, 0],
    [3, 0, -1, 2, 2],
    [0, 0, 0, 0, 0],
])
input_matrix

array([[ 1,  2,  3,  5,  3],
       [ 3,  1,  5,  7, -1],
       [ 2,  7,  1,  8, -1],
       [ 4,  6,  1, -2,  0],
       [ 3,  0, -1,  2,  2],
       [ 0,  0,  0,  0,  0]])

In [None]:
# Pairwise distances between the rows of input_matrix, once per metric.
d1 = distance_matrix(input_matrix, 'euclidean')
d2 = distance_matrix(input_matrix, 'manhattan')

In [None]:
# Euclidean distance matrix.
d1

array([[ 0.        ,  5.38516481,  7.41619849,  9.32737905,  5.83095189,
         6.92820323],
       [ 5.38516481,  0.        ,  7.34846923, 11.13552873,  8.42614977,
         9.21954446],
       [ 7.41619849,  7.34846923,  0.        , 10.29563014,  9.94987437,
        10.90871211],
       [ 9.32737905, 11.13552873, 10.29563014,  0.        ,  7.81024968,
         7.54983444],
       [ 5.83095189,  8.42614977,  9.94987437,  7.81024968,  0.        ,
         4.24264069],
       [ 6.92820323,  9.21954446, 10.90871211,  7.54983444,  4.24264069,
         0.        ]])

In [None]:
# Manhattan distance matrix.
d2

array([[ 0., 11., 15., 19., 12., 14.],
       [11.,  0., 12., 20., 15., 17.],
       [15., 12.,  0., 14., 19., 19.],
       [19., 20., 14.,  0., 15., 13.],
       [12., 15., 19., 15.,  0.,  8.],
       [14., 17., 19., 13.,  8.,  0.]])

In [None]:
# indexes of the max value in matrix
max_indexes = np.unravel_index(d2.argmax(), d2.shape)
max_indexes

(1, 3)

In [None]:
# vectors that have the max distance between each other
print(input_matrix[max_indexes[0]])
print(input_matrix[max_indexes[1]])

[ 3  1  5  7 -1]
[ 4  6  1 -2  0]


In [None]:
# Largest Manhattan distance, rounded to 2 decimals.
print(d2.max().round(2))

20.0


# Exercise 1.3

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Location of the Airbnb listings CSV used in this exercise.
url = 'https://raw.githubusercontent.com/haniemi/deeplearning/main/data/airbnb.csv'

In [None]:
# Load the listings; the first CSV column is used as the row index.
# Note that this rebinds df, which held the Exercise 1.1 frame before.
df = pd.read_csv(url, index_col=0)
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [None]:
# Number of rows in the data set.
len(df)

48895

In [None]:
# The target is the room type; every remaining column is kept as a feature.
X = df.drop(columns = ['room_type']).copy()
y = df['room_type']

In [None]:
# First split: 75 % training data, 25 % held out for validation and test.
# Stratifying on y keeps the room-type proportions equal in both parts;
# the fixed random_state makes the split repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state = 125,
                                                    stratify=y)

In [None]:
# Second split of the held-out part: 20 % validation and 80 % test,
# i.e. roughly 5 % and 20 % of the entire data set.
X_val, X_test, y_val, y_test = train_test_split(X_temp,
                                                y_temp,
                                                test_size=0.8,
                                                random_state = 125,
                                                stratify=y_temp)

In [None]:
# Print the size and the room-type proportions of the full data and of each
# subset, so that the effect of stratification can be compared side by side.
print('Entire data', len(y), df['room_type'].value_counts()/len(df))
print()
print('Train data', len(y_train), y_train.value_counts()/len(y_train))
print()
print('Validation data',len(y_val), y_val.value_counts()/len(y_val))
print()
print('Test data', len(y_test), y_test.value_counts()/len(y_test))

Entire data 48895 room_type
Entire home/apt    0.519665
Private room       0.456611
Shared room        0.023724
Name: count, dtype: float64

Train data 36671 room_type
Entire home/apt    0.519675
Private room       0.456601
Shared room        0.023724
Name: count, dtype: float64

Validation data 2444 room_type
Entire home/apt    0.519640
Private room       0.456628
Shared room        0.023732
Name: count, dtype: float64

Test data 9780 room_type
Entire home/apt    0.519632
Private room       0.456646
Shared room        0.023722
Name: count, dtype: float64


In [None]:
# Number of 'Private room' listings in the test set.
y_test.value_counts()['Private room']

4466

In [None]:
# Listing count per room type in the test set.
y_test.value_counts()

Unnamed: 0_level_0,count
room_type,Unnamed: 1_level_1
Entire home/apt,5082
Private room,4466
Shared room,232
