In [15]:
from csv import reader
from math import sqrt
from operator import itemgetter
from random import randrange


# Load a CSV file
def load_csv(filename):
    dataset = []
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            try:
                row.append(float(row[8]) * float(row[9]) * float(row[10]))
            except ValueError:
                row.append("volume")
            dataset.append(row[1:])
    return dataset


# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())


# Convert string column to int
def str_column_to_int(dataset, column):
    for row in dataset:
        row[column] = int(row[column].strip())


# Convert depth column to numerical value
def convert_depth_to_int(dataset, column):
    depths = {"Premium": 1, "Ideal": 2, "Excellent": 3, "Very Good": 4, "Good": 5, "Fair": 6, "Poor": 7}
    for row in dataset:
        value = depths[row[column].strip()]
        row[column] = int(value)


# Convert clarity column to numerical value
def convert_clarity_to_int(dataset, column):
    clarities = {"IF": 1, "VVS1": 2, "VVS2": 3, "VS1": 4, "VS2": 5, "SI1": 6, "SI2": 7, "I1": 8, "I2": 9}
    for row in dataset:
        value = clarities[row[column].strip()]
        row[column] = int(value)


# Convert color column to numerical value
def convert_color_to_int(dataset, column):
    for row in dataset:
        value = ord(row[column].strip())
        row[column] = int(value)


# Find the min and max values for each column
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax


# normalize dataset to the range 0-1
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])


# convert test row to the format of dataset
def adjust_test_row(test_row):
    test_row = test_row[1:]
    str_column_to_float([test_row], 0)
    convert_depth_to_int([test_row], 1)
    convert_color_to_int([test_row], 2)
    convert_clarity_to_int([test_row], 3)
    str_column_to_float([test_row], 4)
    str_column_to_float([test_row], 5)
    str_column_to_int([test_row], 6)
    str_column_to_float([test_row], 7)
    str_column_to_float([test_row], 8)
    str_column_to_float([test_row], 9)
    test_row.append(float(test_row[7]) * float(test_row[8]) * float(test_row[9]))
    return test_row

# Make a prediction with coefficients
def predict(row, coefficients):
    pred = coefficients[0]
    for i in range(len(row) - 1):
        pred += coefficients[i + 1] * row[i]
    return pred

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split


# Calculate root mean squared error
def rmse_metric(actual, predicted):
    sum_error = 0.0
    for i in range(len(actual)):
        prediction_error = predicted[i] - actual[i]
        sum_error += (prediction_error ** 2)
    mean_error = sum_error / float(len(actual))
    return sqrt(mean_error)


# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        rmse = rmse_metric(actual, predicted)
        scores.append(rmse)
    return scores


# Estimate linear regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
    coef = [0.0 for i in range(len(train[0]))]
    for epoch in range(n_epoch):
        for row in train:
            pred = predict(row, coef)
            error = pred - row[-1]
            coef[0] = coef[0] - l_rate * error
            for i in range(len(row) - 1):
                coef[i + 1] = coef[i + 1] - l_rate * error * row[i]
        # print(l_rate, n_epoch, error)
    return coef


# Linear Regression Algorithm With Stochastic Gradient Descent
def linear_regression_sgd(train, test, l_rate, n_epoch):
    predictions = list()
    coef = coefficients_sgd(train, l_rate, n_epoch)
    for row in test:
        pred = predict(row, coef)
        predictions.append(pred)
    return (predictions)

if __name__ == "__main__":
    filename = "diamonds.csv"
    dataset = load_csv(filename)
    column_names = dataset.pop(0)

    # convert values of dataset

    str_column_to_float(dataset, 0)
    convert_depth_to_int(dataset, 1)
    convert_color_to_int(dataset, 2)
    convert_clarity_to_int(dataset, 3)
    str_column_to_float(dataset, 4)
    str_column_to_float(dataset, 5)
    str_column_to_int(dataset, 6)
    str_column_to_float(dataset, 7)
    str_column_to_float(dataset, 8)
    str_column_to_float(dataset, 9)
    # column with volume is already float

    # normalize dataset
    minmax_values = dataset_minmax(dataset)
    normalize_dataset(dataset, minmax_values)

    # for row in dataset:
    #     print(row)

    # evaluate algorithm
    n_folds = 5
    l_rate = 0.01
    n_epoch = 50
    scores = evaluate_algorithm(dataset, linear_regression_sgd, n_folds, l_rate, n_epoch)
    print('Scores: %s' % scores)
    print('Mean RMSE: %.3f' % (sum(scores) / float(len(scores))))
    print('price for the 1000th diamond is: '+ str(predict(dataset[1000], coeff)))


Scores: [0.0011861333359474327, 0.0015462442812448475, 0.0013700745991601922, 0.008969640753563081, 0.0017267586162888983]
Mean RMSE: 0.003


NameError: name 'coefficients' is not defined