In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Load the diamond dataset from a CSV in the working directory and preview the first rows.
df = pd.read_csv('diamond.csv')
df.head()

Unnamed: 0,Carat Weight,Cut,Color,Clarity,Polish,Symmetry,Report,Price
0,1.1,Ideal,H,SI1,VG,EX,GIA,5169
1,0.83,Ideal,H,VS1,ID,ID,AGSL,3470
2,0.85,Ideal,H,SI1,EX,EX,GIA,3183
3,0.91,Ideal,E,SI1,VG,VG,GIA,4370
4,0.83,Ideal,G,SI1,EX,EX,GIA,3171


In [4]:
# Count how many rows fall into each Cut category.
df['Cut'].value_counts()

Ideal              2482
Very Good          2428
Good                708
Signature-Ideal     253
Fair                129
Name: Cut, dtype: int64

In [5]:
# Count how many rows fall into each Color category.
df['Color'].value_counts()

G    1501
H    1079
F    1013
I     968
E     778
D     661
Name: Color, dtype: int64

In [6]:
# List the distinct Clarity labels present in the data.
df['Clarity'].unique()

array(['SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'IF', 'FL'], dtype=object)

In [7]:
# List the distinct Polish labels present in the data.
df['Polish'].unique()

array(['VG', 'ID', 'EX', 'G'], dtype=object)

In [8]:
# List the distinct Symmetry labels present in the data.
df['Symmetry'].unique()

array(['EX', 'ID', 'VG', 'G'], dtype=object)

In [9]:
# List the distinct Report labels present in the data.
df['Report'].unique()

array(['GIA', 'AGSL'], dtype=object)

In [10]:
def encode(df, *args):
    """Integer-encode the named columns of ``df`` in place.

    Every column gets its own ``LabelEncoder`` so that the label-to-code
    mapping of each column survives the call: the encoder's ``classes_``
    lists the labels in code order and ``inverse_transform`` restores them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; each listed column is overwritten with integer codes.
    *args : str
        Names of the columns to encode.

    Returns
    -------
    dict
        Maps each encoded column name to its fitted ``LabelEncoder``.

    Notes
    -----
    ``LabelEncoder`` numbers labels by their sorted order, so the codes do
    not reflect any quality ranking of the categories.
    """
    encoders = {}
    for col in args:
        # A fresh encoder per column: refitting one shared instance would
        # throw away the previous column's classes_.
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        encoders[col] = le
    return encoders

In [11]:
# Names of the categorical columns that are integer-encoded before modelling.
unique = ['Report', 'Symmetry', 'Polish', 'Clarity', 'Color', 'Cut']

In [12]:
# Reuse the column list defined in `unique` so the columns that get encoded
# and the columns inspected afterwards cannot drift apart.
encode(df, *unique)

In [13]:
# Preview the frame after encoding; the categorical columns now hold integer codes.
df.head()

Unnamed: 0,Carat Weight,Cut,Color,Clarity,Polish,Symmetry,Report,Price
0,1.1,2,4,2,3,0,1,5169
1,0.83,2,4,3,2,2,0,3470
2,0.85,2,4,2,0,0,1,3183
3,0.91,2,1,2,3,3,1,4370
4,0.83,2,3,2,0,0,1,3171


In [14]:
# Print the distinct integer codes that now appear in each encoded column.
for col in unique:
    print(f'Unique elements of {col} are {df[col].unique()}')

Unique elements of Report are [1 0]
Unique elements of Symmetry are [0 2 3 1]
Unique elements of Polish are [3 2 0 1]
Unique elements of Clarity are [2 3 4 6 5 1 0]
Unique elements of Color are [4 1 3 0 2 5]
Unique elements of Cut are [2 4 0 1 3]


In [15]:
# Separate features and target by column name rather than by position, so the
# split keeps working if the column order of the CSV ever changes.
X = df.drop(columns='Price').values
y = df['Price'].values

In [16]:
# Inspect the target array (the Price column).
y

array([ 5169,  3470,  3183, ...,  6157, 11206, 30507])

In [17]:
# Inspect the feature matrix (every column except Price).
X

array([[1.1 , 2.  , 4.  , ..., 3.  , 0.  , 1.  ],
       [0.83, 2.  , 4.  , ..., 2.  , 2.  , 0.  ],
       [0.85, 2.  , 4.  , ..., 0.  , 0.  , 1.  ],
       ...,
       [1.02, 2.  , 0.  , ..., 0.  , 0.  , 1.  ],
       [1.27, 3.  , 3.  , ..., 0.  , 0.  , 1.  ],
       [2.19, 2.  , 1.  , ..., 0.  , 0.  , 1.  ]])

In [18]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [25]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
# NOTE: X_train / X_test are overwritten, so re-running this cell rescales
# data that has already been scaled.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [26]:
# Inspect the training features after scaling.
X_train

array([[-0.7 , -0.71, -0.51, ...,  1.04,  1.05,  0.09],
       [-0.08, -0.28,  0.66, ...,  1.58, -1.17,  0.66],
       [-0.37,  0.82, -0.59, ...,  0.92,  0.92, -0.04],
       ...,
       [ 0.44,  0.46,  0.39, ...,  0.7 , -0.55, -0.28],
       [ 0.01, -0.76, -0.56, ..., -1.09, -1.17,  0.01],
       [-0.56, -1.5 ,  0.31, ...,  1.21, -0.37,  0.26]])

In [27]:
# Baseline model: ordinary least squares fitted on the training split.
reg = LinearRegression().fit(X_train, y_train)

In [28]:
# Predict on the held-out test features with the linear model.
y_pred = reg.predict(X_test)

In [29]:
np.set_printoptions(precision=2)
# Actual values (left column) beside the linear model's predictions (right column).
np.column_stack((y_test, y_pred))

array([[ 5042.  ,  5946.83],
       [ 5829.  ,  9365.06],
       [ 7508.  , 10191.42],
       ...,
       [35955.  , 25711.82],
       [ 7031.  ,  6168.38],
       [13631.  , 18106.34]])

In [30]:
# R^2 of the linear model on the test split, expressed as a percentage.
r2_score(y_test, y_pred) * 100

66.34023073850823

In [31]:
# Seed the tree: scikit-learn permutes the candidate features at every split,
# so without a fixed random_state the fitted tree (and its score) can change
# from one run to the next.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train);

In [33]:
# R^2 of the decision tree on the test split, expressed as a percentage.
r2_score(y_test, regressor.predict(X_test)) * 100

88.82806797341613

In [34]:
# Keep the decision tree's test-set predictions for inspection.
newpred = regressor.predict(X_test)

In [35]:
# Display the decision tree's predictions.
newpred

array([ 5253.5,  5991. ,  7639. , ..., 30348. ,  6855. , 13357. ])

In [37]:
# Seed the forest: bootstrap sampling and per-split feature selection are
# random, so a fixed random_state is needed for the score to be reproducible.
forest = RandomForestRegressor(random_state=0)
forest.fit(X_train, y_train);

In [38]:
# R^2 of the random forest on the test split, expressed as a percentage.
r2_score(y_test, forest.predict(X_test)) * 100

93.3454257003599

In [39]:
np.set_printoptions(precision=2)
# Predict once and reuse the result instead of running the forest twice.
forest_pred = forest.predict(X_test)
# Actual prices (left column) beside the forest's predictions (right column).
np.column_stack((y_test, forest_pred))

array([[ 5042.  ,  5268.69],
       [ 5829.  ,  6307.15],
       [ 7508.  ,  7529.83],
       ...,
       [35955.  , 37780.47],
       [ 7031.  ,  6786.83],
       [13631.  , 13762.18]])