In [3]:
import pandas as pd

df = pd.read_csv('data/diamonds.csv', index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [5]:
cut_class_dict = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}

In [6]:
df['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [7]:
# From the dataset descriptions, these are the order from worst to best.
clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}
color_dict = {"J": 1,"I": 2,"H": 3,"G": 4,"F": 5,"E": 6,"D": 7}

In [8]:
df['cut'] = df['cut'].map(cut_class_dict)
df['clarity'] = df['clarity'].map(clarity_dict)
df['color'] = df['color'].map(color_dict)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [9]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.5 MB)
Collecting joblib>=1.0.0
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting scipy>=1.3.2
  Using cached scipy-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.7 MB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.1.3 scipy-1.9.3 threadpoolctl-3.1.0


In [10]:
import sklearn
from sklearn.linear_model import SGDRegressor

df = sklearn.utils.shuffle(df)

X = df.drop('price', axis=1).values  # .values converts dataframe to a numpy array
y = df['price'].values

In [14]:
test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

In [15]:
clf = SGDRegressor(max_iter=1000)
clf.fit(X_train, y_train)

print(clf.score(X_test, y_test))

-27956899.920587152


In [16]:
for X, y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

1860320.9642400742 478
-1645825.97908926 789
9872928.83842516 6274
-35028960.3707006 2855
-4320000.621200323 5544
3768297.2292284966 827
-5940217.2981693745 12320
16709478.383116245 776
11932831.690441608 5814
-6830232.503458738 858


In [17]:
from sklearn import svm

clf = svm.SVR()

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

-0.14246021414654653


In [18]:
for X, y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

2292.6774244755293 478
2307.9273151421503 789
2448.800449480451 6274
2532.8514190501214 2855
2476.7518626608453 5544
2322.4891328456556 827
2484.183185017938 12320
2330.2925845892933 776
2436.7262976925103 5814
2356.8404456498865 858


In [19]:
clf = SGDRegressor(max_iter=10000)

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

for X, y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

-12725430.002435746
10142082.069859505 478
12957922.170935512 789
3106096.4399757385 6274
39193560.71692777 2855
14349560.400743365 5544
8466260.543792248 827
15318150.91146779 12320
-1627444.559006691 776
1587641.1072716713 5814
15460537.630450726 858


In [20]:
import sklearn
from sklearn import svm, preprocessing

df = sklearn.utils.shuffle(df)

X = df.drop('price', axis=1).values
X = preprocessing.scale(X)
y = df['price'].values

test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

clf = svm.SVR()

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

for X, y in list(zip(X_test, y_test))[:10]:
    print(f'model predicts {clf.predict([X])[0]}, real value: {y}')

0.5931205160528983
model predicts 931.8926877440103, real value: 710
model predicts 1880.011576336652, real value: 1666
model predicts 1547.482566680807, real value: 1689
model predicts 727.3846528872987, real value: 529
model predicts 981.9982626764995, real value: 874
model predicts 2379.4970709682884, real value: 1202
model predicts 1445.137315713883, real value: 1197
model predicts 4287.235356870057, real value: 4308
model predicts 7218.070635786597, real value: 10162
model predicts 1441.6613266069205, real value: 633
