In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
path = "/content/gdrive/MyDrive/Takeo/Housing California/housing.csv"
df = pd.read_csv(path)
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [4]:
# How many rows and columns are there
print(df.head())
print()
print("Shape : ", df.shape)

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  

Shape :  (20640, 10)


In [5]:
# summary of statistics
print(df.describe())

          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.631861           28.639486   2635.763081   
std        2.003532      2.135952           12.585558   2181.615252   
min     -124.350000     32.540000            1.000000      2.000000   
25%     -121.800000     33.930000           18.000000   1447.750000   
50%     -118.490000     34.260000           29.000000   2127.000000   
75%     -118.010000     37.710000           37.000000   3148.000000   
max     -114.310000     41.950000           52.000000  39320.000000   

       total_bedrooms    population    households  median_income  \
count    20433.000000  20640.000000  20640.000000   20640.000000   
mean       537.870553   1425.476744    499.539680       3.870671   
std        421.385070   1132.462122    382.329753       1.899822   
min          1.000000      3.000000      1.000000       0.499900   
25%        296.00000

In [6]:
# Finding the datatypes
print(df.dtypes)

longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
ocean_proximity        object
dtype: object


In [7]:
# Check for missing values
missing_values = df.isnull().sum()
print(missing_values)

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


In [8]:
#Calculate the missing value as median because there are outliers and the outliers would have a majaor impact is we use mean()
median_total_bedrooms = df['total_bedrooms'].median()
df['total_bedrooms'].fillna(median_total_bedrooms, inplace=True)


In [9]:
# Check for missing values
missing_values = df.isnull().sum()
print(missing_values)

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64


In [10]:
# Check if any row has 'total_rooms' smaller than 'total_bedrooms'
rows_with_rooms_smaller_than_bedrooms = df[df['total_rooms'] < df['total_bedrooms']]

# Check if there are any such rows
if not rows_with_rooms_smaller_than_bedrooms.empty:
    print("Rows with 'total_rooms' smaller than 'total_bedrooms':")
    print(rows_with_rooms_smaller_than_bedrooms)
else:
    print("No rows with 'total_rooms' smaller than 'total_bedrooms' found.")

Rows with 'total_rooms' smaller than 'total_bedrooms':
       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
2826     -119.11     35.42                52.0        154.0           435.0   
9877     -121.85     36.61                38.0        238.0           435.0   
9942     -122.23     38.17                45.0        350.0           435.0   
13597    -117.28     34.09                44.0        376.0           435.0   
19391    -120.85     37.78                25.0        421.0           435.0   

       population  households  median_income  median_house_value  \
2826         37.0        16.0        10.0263            200000.0   
9877        191.0        67.0         1.3897            125000.0   
9942        225.0        72.0         1.8942            216700.0   
13597       273.0       107.0         2.2917             90800.0   
19391       303.0       106.0         2.2679             71300.0   

      ocean_proximity  
2826           INLAND  
9877        <

In [11]:
# Moving on to dropping duplicated becuase it affects ther data quality and data size
df.drop_duplicates()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [12]:
# Next step identifying the target variable which is the median_house_value
# Dependent variable is our target variable X :
# Indetpendent variables which we are going to use to predict the Y:
      # "longitude"
      # "latitude"
      # "housing_median_age"
      # "total_rooms"
      # "total_bedrooms"
      # "population"
      # "households"
      # "median_income"
      # "ocean_proximity"

In [13]:
X = df.iloc[:, [1,2,3,4,5,6,7,9]].values
y = df['median_house_value']
y = df.iloc[:,-7]
print("x")
print(X)
print()
print("y")
print(y)

x
[[37.88 41.0 880.0 ... 126.0 8.3252 'NEAR BAY']
 [37.86 21.0 7099.0 ... 1138.0 8.3014 'NEAR BAY']
 [37.85 52.0 1467.0 ... 177.0 7.2574 'NEAR BAY']
 ...
 [39.43 17.0 2254.0 ... 433.0 1.7 'INLAND']
 [39.43 18.0 1860.0 ... 349.0 1.8672 'INLAND']
 [39.37 16.0 2785.0 ... 530.0 2.3886 'INLAND']]

y
0         880.0
1        7099.0
2        1467.0
3        1274.0
4        1627.0
          ...  
20635    1665.0
20636     697.0
20637    2254.0
20638    1860.0
20639    2785.0
Name: total_rooms, Length: 20640, dtype: float64


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Apply one-hot encoding to categorical features
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [8])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

# # Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


ValueError: ignored

In [None]:
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder

# # Check the number of columns in your DataFrame
# num_columns = df.shape[1]

# # Set up the ColumnTransformer
# ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(sparse=False), ["ocean_proximity"])], remainder='passthrough')
# X = ct.fit_transform(X)

# # Now, 'ocean_proximity' should be encoded as binary values, and the rest of the columns are included as is.


In [15]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


#**Train and Test Data**

In [16]:
from sklearn.model_selection import train_test_split

In [21]:
# Splitting the data
# X = df.drop(columns=['median_house_value'])  # Features
# y = df['median_house_value']  # Target

# Split the data into training and test sets 80% training, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#**Feature Selection**

In [19]:
from sklearn.preprocessing import StandardScaler

In [22]:
sc = StandardScaler()
# Fit and transform the training data
X_train = sc.fit_transform(X_train)
# Transform the test data using the same scaler
X_test = sc.transform(X_test)

ValueError: ignored

In [23]:
X_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,NEAR OCEAN
2271,-119.80,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND
...,...,...,...,...,...,...,...,...,...
11284,-117.96,33.78,35.0,1330.0,201.0,658.0,217.0,6.3700,<1H OCEAN
11964,-117.43,34.02,33.0,3084.0,570.0,1753.0,449.0,3.0500,INLAND
5390,-118.38,34.03,36.0,2101.0,569.0,1756.0,527.0,2.9344,<1H OCEAN
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,<1H OCEAN


#**Part II: Artificial Neural Network**

In [24]:
import tensorflow as tf

In [25]:
tf.__version__

'2.14.0'

#**Building ANN**

#**Initializing ANN**

In [26]:
# sets up a foundation for creating a neural network model using TensorFlow.
# You can then add layers, configure the model architecture, compile the model,
# and train it for a specific task
ann = tf.keras.models.Sequential()

#**Adding first hidden layer**

In [27]:
# ann.add(): This function is used to add a layer to the Sequential model (ann in this case).

# tf.keras.layers.Dense(): This line adds a fully connected layer (Dense layer) to the model.

# units=6: Specifies that this layer will have 6 units.
# In a Dense layer, each unit (or neuron) is connected to each neuron in the previous and the next layers.

# activation='relu': Defines the activation function for this layer, which is Rectified Linear Unit (ReLU).
# ReLU is a common activation function used in hidden layers of neural networks,
# helping introduce non-linearity.

ann.add(tf.keras.layers.Dense(units=6, activation='relu'))
# RELU FUNCTION GOES FROM 0-1
# lINEAR IN NATURE
# Adding a Dense hidden layer with 6 units and ReLU activation

#**Adding the second hidden layer**

In [28]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

#**Adding the third hidden layer**

In [29]:

# ann.add(): This function is used to add a layer to the Sequential model (ann in this case).

# tf.keras.layers.Dense(): This line adds a fully connected layer (Dense layer) to the model.

# units=1: Specifies that this layer will have 1 unit.
# In this case, since it's the output layer, a single unit is used to represent the output.

# activation='sigmoid': Defines the activation function for this layer,
#  which is the sigmoid activation function.
#  Sigmoid is commonly used in the output layer of a binary classification model to produce
#  probabilities that sum to 1. It's suitable for binary classification problems.
ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

#**Compile ANN**

In [30]:
# ann.compile():
  # This function configures the model for training.

# optimizer='adam':
    # Adam is a popular optimization algorithm used for training neural networks.
    # It adapts the learning rates for each parameter, leading to faster convergence.

# loss='binary_crossentropy':
    # Binary crossentropy is the loss function used for binary classification problems.
    # It measures the difference between the true labels and the predicted probabilities
    # for each sample.

# metrics=['accuracy']:
# During training, it calculates and displays the accuracy of the model as one of the metrics.
# Accuracy is a commonly used metric for classification tasks, indicating the proportion of
# correctly classified samples.

ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

#**Training the ANN on the Training set**

In [31]:
# ann.fit():
#   This function trains the model using the specified training data.

# X_train:
#   The input features for training the model.

# y_train:
#   The corresponding target labels for the training data.

# batch_size=32:
#   This is the number of samples per gradient update.
#   The model's weights are updated after processing each batch of 32 samples.

# epochs=15:
#   An epoch is one pass through the entire training dataset. In this case, the model will be trained for 15 epochs, meaning it will go through the entire training dataset 15 times during the training process.

ann.fit(X_train, y_train, batch_size=32, epochs=5)

ValueError: ignored