In [1]:
#Pandas Library Import
import pandas as pd

#Dataset Link = https://www.kaggle.com/shivam2503/diamonds

df = pd.read_csv("datasets/diamonds.csv", index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
#As this is a regression model not classification, we will have to convert categorical variable to numerical
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [3]:
cut_class_dict = {"Fair": 1, "Good": 2, "Very Good": 3, "Premium": 4, "Ideal": 5}
# We will have to convert the Categorical variables as per increasing quality of diamond as this bias affected the price of diamond

Above graph Dates are not in proper order.

In [4]:
# We will do similar conversion for other categorical variables
df['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [5]:
# FL,IF, VVS1, VVS2, VS1, VS2, SI1, SI2, I1, I2, I3 - Taken from the dataset page, this is ordered best to worst, so now we need this in a dict too.

#We also have color. D is the best, J is the worst.

clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}
color_dict = {"J": 1,"I": 2,"H": 3,"G": 4,"F": 5,"E": 6,"D": 7}

In [6]:
df['cut'] = df['cut'].map(cut_class_dict)
df['clarity'] = df['clarity'].map(clarity_dict)
df['color'] = df['color'].map(color_dict)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [7]:
import sklearn
from sklearn.linear_model import SGDRegressor

df = sklearn.utils.shuffle(df) # always shuffle your data to avoid any biases that may emerge b/c of some order.

X = df.drop("price", axis=1).values
y = df["price"].values

In [8]:
test_size = 200

#Splitting Train and Test data
X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

In [9]:
# We train and test our classifier!

clf = SGDRegressor(max_iter=1000)
clf.fit(X_train, y_train)

print(clf.score(X_test, y_test))

#Apparaently score should be between from 0 to 1. But we are getting in negative very high value.



-807007084.7522919


In [10]:
for X,y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)

105816989.54326248 675
51172131.42722511 449
156460468.2879095 3620
104937729.85040951 948
109021749.91700077 3816
-12760113.240434647 18757
161678778.96797943 3199
259964329.13995552 5809
33404864.44312668 614
130603480.97911644 3262


In [11]:
# SVR
from sklearn import svm

clf = svm.SVR()

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))



0.326010585649429


In [12]:
# Difference between svm.SVR() and the SGDRegressor according to the docs is that svm.SVR() by default has an unlimited number of iterations.
# Let's try that with the SGDRegressor to be fair, by setting it to something quite large like say 10,000.

clf = SGDRegressor(max_iter=10000)

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

for X,y in list(zip(X_test, y_test))[:10]:
    print(clf.predict([X])[0], y)



-237.4277457894025
8065.98655039724 675
59711.70486540999 449
-46598.49655639287 3620
8949.663886048831 948
11978.883507827297 3816
156898.76105364505 18757
-50519.03423909098 3199
-156877.1032398399 5809
88092.38108220883 614
-15380.85628866218 3262


In [14]:
# We will do modelling by scaling the variables
import sklearn
from sklearn import svm, preprocessing

df = sklearn.utils.shuffle(df) # always shuffle your data to avoid any biases that may emerge b/c of some order.

X = df.drop("price", axis=1).values
X = preprocessing.scale(X)
y = df["price"].values

test_size = 200

X_train = X[:-test_size]
y_train = y[:-test_size]

X_test = X[-test_size:]
y_test = y[-test_size:]

clf = svm.SVR()

clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

for X,y in list(zip(X_test, y_test))[:10]:
    print(f"model predicts {clf.predict([X])[0]}, real value: {y}")



0.5071461893058302
model predicts 1584.4192923297137, real value: 596
model predicts 2008.0510694284412, real value: 1556
model predicts 279.6343837016525, real value: 596
model predicts 4386.293360414904, real value: 3808
model predicts 932.0689201164537, real value: 489
model predicts 5543.854544376053, real value: 7885
model predicts 2711.9201828615232, real value: 2351
model predicts 735.5180168591269, real value: 828
model predicts 1998.756795401752, real value: 714
model predicts 3971.074153588113, real value: 3959
