<a href="https://colab.research.google.com/github/ashwinkhapre/FMML-Projects_Ashwin_Khapre/blob/main/Module2FMML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt

# Seeded NumPy generator. The split() helper defined below draws from this
# module-level `rng`, so the random partitions are reproducible when the cell
# is run from top to bottom.
rng = np.random.default_rng(seed=42)

# Load the California Housing dataset
dataset = datasets.fetch_california_housing()

# Dataset description
print(dataset.DESCR)
print("Original target values:", dataset.target)

# Convert target values to integers. astype(int) discards the fractional part,
# turning the continuous target into a small set of integer class labels for
# the nearest-neighbour classifier below.
dataset.target = dataset.target.astype(int)

print("Target values after conversion:", dataset.target)
print("Input variables shape:", dataset.data.shape)
print("Output variables shape:", dataset.target.shape)

def NN3(traindata, trainlabel, query, k=3):
    """Classify one query point by majority vote among its k nearest neighbours.

    Args:
        traindata: 2-D array, one training sample per row.
        trainlabel: 1-D array with one label per row of ``traindata``. Any
            sortable label type works (negative ints, floats, strings).
        query: 1-D feature vector, broadcast against the rows of ``traindata``.
        k: Number of neighbours that vote. Defaults to 3. If fewer than ``k``
            training samples exist, all of them vote.

    Returns:
        The most frequent label among the selected neighbours. When several
        labels are tied, the smallest one is returned.

    Raises:
        ValueError: If ``k`` is less than 1 or there are no training samples.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    traindata = np.asarray(traindata)
    trainlabel = np.asarray(trainlabel)
    if len(trainlabel) == 0:
        raise ValueError("NN3 needs at least one training sample")

    # Squared Euclidean distance; the square root is skipped because it does
    # not change the ordering of the neighbours.
    diff = traindata - query
    dist = (diff * diff).sum(1)

    # Indices of the k smallest distances
    nearest_indices = np.argsort(dist)[:k]
    nearest_labels = trainlabel[nearest_indices]

    # np.unique returns the labels sorted, so argmax (first maximum) breaks
    # ties in favour of the smallest label. Unlike np.bincount it also accepts
    # negative and non-integer labels.
    values, counts = np.unique(nearest_labels, return_counts=True)
    return values[counts.argmax()]

def NN(traindata, trainlabel, testdata):
    """Label every sample in ``testdata`` with the NN3 classifier.

    Args:
        traindata: Training samples passed through to NN3.
        trainlabel: Labels belonging to ``traindata``.
        testdata: Iterable of query samples; each one is classified on its own.

    Returns:
        A NumPy array holding one predicted label per query sample.
    """
    predictions = []
    for sample in testdata:
        predictions.append(NN3(traindata, trainlabel, sample))
    return np.array(predictions)

def split(data, label, percent):
    """Randomly partition samples and labels into two groups.

    One random number is drawn per sample from the module-level ``rng``.
    Samples whose draw is below ``percent`` go to the first group, the rest
    to the second. Because each sample is assigned independently, ``percent``
    is the expected fraction of the first group, not an exact one.

    Args:
        data: 2-D array with one sample per row.
        label: 1-D array with one label per row of ``data``.
        percent: Threshold compared against the random draws (a fraction such
            as 0.75, despite the name).

    Returns:
        Tuple ``(first_data, first_label, second_data, second_label)``.
    """
    draws = rng.random(len(label))
    in_first = draws < percent
    in_second = draws >= percent
    return (
        data[in_first, :],
        label[in_first],
        data[in_second, :],
        label[in_second],
    )

def Accuracy(true_labels, predicted_labels):
    """Return the fraction of predictions that match the true labels.

    Args:
        true_labels: Array-like of ground-truth labels.
        predicted_labels: Array-like of predicted labels.

    Returns:
        Mean of the element-wise equality, a value between 0 and 1 (NaN for
        empty input).
    """
    # Coerce to arrays first: comparing two plain lists with == yields a single
    # bool, which would silently give an accuracy of exactly 0.0 or 1.0.
    true_labels = np.asarray(true_labels)
    predicted_labels = np.asarray(predicted_labels)
    return np.mean(true_labels == predicted_labels)

# Split data into train/validation and test sets (30% test data).
# split() assigns each sample independently at random, so the resulting
# fractions are approximate; the prints below report the actual sizes.
alltraindata, alltrainlabel, testdata, testlabel = split(
    dataset.data, dataset.target, 70 / 100
)

print("Number of test samples:", len(testlabel))
print("Number of train samples:", len(alltrainlabel))
print("Percent of test data:", len(testlabel) * 100 / len(dataset.target), "%")

# Split the train/validation set further into training and validation sets (75% train data)
traindata, trainlabel, valdata, vallabel = split(
    alltraindata, alltrainlabel, 75 / 100
)

# Predict on the training data. Every query point is itself part of the
# training set, so this measures fit on already-seen samples rather than
# generalisation.
trainpred = NN(traindata, trainlabel, traindata)

# Calculate training accuracy
trainAccuracy = Accuracy(trainlabel, trainpred)
print("Training accuracy using nearest neighbor algorithm:", trainAccuracy * 100, "%")

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per census block group.

In [None]:
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt

# Seeded NumPy generator. The split() helper defined below draws from this
# module-level `rng`, so the random partitions are reproducible when the cell
# is run from top to bottom.
rng = np.random.default_rng(seed=42)

# Load the California Housing dataset
dataset = datasets.fetch_california_housing()

# Dataset description
print(dataset.DESCR)
print("Original target values:", dataset.target)

# Convert target values to integers. astype(int) discards the fractional part,
# turning the continuous target into a small set of integer class labels for
# the nearest-neighbour classifier below.
dataset.target = dataset.target.astype(int)

print("Target values after conversion:", dataset.target)
print("Input variables shape:", dataset.data.shape)
print("Output variables shape:", dataset.target.shape)

def NN3(traindata, trainlabel, query, k=3):
    """Classify one query point by majority vote among its k nearest neighbours.

    Args:
        traindata: 2-D array, one training sample per row.
        trainlabel: 1-D array with one label per row of ``traindata``. Any
            sortable label type works (negative ints, floats, strings).
        query: 1-D feature vector, broadcast against the rows of ``traindata``.
        k: Number of neighbours that vote. Defaults to 3. If fewer than ``k``
            training samples exist, all of them vote.

    Returns:
        The most frequent label among the selected neighbours. When several
        labels are tied, the smallest one is returned.

    Raises:
        ValueError: If ``k`` is less than 1 or there are no training samples.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    traindata = np.asarray(traindata)
    trainlabel = np.asarray(trainlabel)
    if len(trainlabel) == 0:
        raise ValueError("NN3 needs at least one training sample")

    # Squared Euclidean distance; the square root is skipped because it does
    # not change the ordering of the neighbours.
    diff = traindata - query
    dist = (diff * diff).sum(1)

    # Indices of the k smallest distances
    nearest_indices = np.argsort(dist)[:k]
    nearest_labels = trainlabel[nearest_indices]

    # np.unique returns the labels sorted, so argmax (first maximum) breaks
    # ties in favour of the smallest label. Unlike np.bincount it also accepts
    # negative and non-integer labels.
    values, counts = np.unique(nearest_labels, return_counts=True)
    return values[counts.argmax()]

def NN(traindata, trainlabel, testdata):
    """Label every sample in ``testdata`` with the NN3 classifier.

    Args:
        traindata: Training samples passed through to NN3.
        trainlabel: Labels belonging to ``traindata``.
        testdata: Iterable of query samples; each one is classified on its own.

    Returns:
        A NumPy array holding one predicted label per query sample.
    """
    predictions = []
    for sample in testdata:
        predictions.append(NN3(traindata, trainlabel, sample))
    return np.array(predictions)

def split(data, label, percent):
    """Randomly partition samples and labels into two groups.

    One random number is drawn per sample from the module-level ``rng``.
    Samples whose draw is below ``percent`` go to the first group, the rest
    to the second. Because each sample is assigned independently, ``percent``
    is the expected fraction of the first group, not an exact one.

    Args:
        data: 2-D array with one sample per row.
        label: 1-D array with one label per row of ``data``.
        percent: Threshold compared against the random draws (a fraction such
            as 0.75, despite the name).

    Returns:
        Tuple ``(first_data, first_label, second_data, second_label)``.
    """
    draws = rng.random(len(label))
    in_first = draws < percent
    in_second = draws >= percent
    return (
        data[in_first, :],
        label[in_first],
        data[in_second, :],
        label[in_second],
    )

def Accuracy(true_labels, predicted_labels):
    """Return the fraction of predictions that match the true labels.

    Args:
        true_labels: Array-like of ground-truth labels.
        predicted_labels: Array-like of predicted labels.

    Returns:
        Mean of the element-wise equality, a value between 0 and 1 (NaN for
        empty input).
    """
    # Coerce to arrays first: comparing two plain lists with == yields a single
    # bool, which would silently give an accuracy of exactly 0.0 or 1.0.
    true_labels = np.asarray(true_labels)
    predicted_labels = np.asarray(predicted_labels)
    return np.mean(true_labels == predicted_labels)

# Split data into train/validation and test sets (30% test data).
# split() assigns each sample independently at random, so the resulting
# fractions are approximate; the prints below report the actual sizes.
alltraindata, alltrainlabel, testdata, testlabel = split(
    dataset.data, dataset.target, 70 / 100
)

print("Number of test samples:", len(testlabel))
print("Number of train samples:", len(alltrainlabel))
print("Percent of test data:", len(testlabel) * 100 / len(dataset.target), "%")

# Split the train/validation set further into training and validation sets (75% train data)
traindata, trainlabel, valdata, vallabel = split(
    alltraindata, alltrainlabel, 75 / 100
)

# Predict on the training data. Every query point is itself part of the
# training set, so this measures fit on already-seen samples rather than
# generalisation.
trainpred = NN(traindata, trainlabel, traindata)

# Calculate training accuracy
trainAccuracy = Accuracy(trainlabel, trainpred)
print("Training accuracy using nearest neighbor algorithm:", trainAccuracy * 100, "%")

def AverageAccuracy(alldata, alllabel, splitpercent, iterations, classifier=NN):
    """Average a classifier's validation accuracy over repeated random splits.

    Each round re-partitions the data with split(), fits/predicts with
    ``classifier`` on the first part and scores the predictions for the
    second part with Accuracy().

    Args:
        alldata: Samples to be re-split on every round.
        alllabel: Labels belonging to ``alldata``.
        splitpercent: Threshold forwarded to split() for the training part.
        iterations: Number of random splits to average over. Zero raises
            ZeroDivisionError in the final division.
        classifier: Callable ``(traindata, trainlabel, testdata) -> labels``;
            defaults to NN.

    Returns:
        The mean of the per-round validation accuracies.
    """
    total = 0
    for _ in range(iterations):
        fit_x, fit_y, held_x, held_y = split(alldata, alllabel, splitpercent)
        # Score this round's held-out predictions and add them to the running sum
        total += Accuracy(held_y, classifier(fit_x, fit_y, held_x))
    return total / iterations

# Calculate average validation accuracy over 10 random re-splits of the
# train/validation pool (threshold 0.75 for the training part) using NN
avg_acc = AverageAccuracy(alltraindata, alltrainlabel, 75 / 100, 10, classifier=NN)

print("Average validation accuracy:", avg_acc * 100, "%")

# Predict on the held-out test data, using the whole train/validation pool
# as the reference set
testpred = NN(alltraindata, alltrainlabel, testdata)

# Print test accuracy
print("Test accuracy:", Accuracy(testlabel, testpred) * 100, "%")

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived