In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
t_data  = pd.read_csv('data/train.csv')

In [3]:
p_data = pd.read_csv('data/predict.csv')

In [4]:
# Checking null values
print(f"t_nulls:{t_data.isna().sum().sum()}")
print(f"p_nulls:{p_data.isna().sum().sum()}")

t_nulls:0
p_nulls:0


## Model 1: Plain Vanila ML Linear Regression

In [5]:
# Plain Vanilla ML Regression model

# I. Developing the model with train

# X and Y splitting
y = t_data['price']
X = t_data.drop(columns=['price','id','cut','color', 'clarity'])

# 1. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state=1)

# 2. LR Call & Fit
lr = LinearRegression()
lr.fit(X_train, y_train)

# 3. Prediction and outputs
train_y_pred = lr.predict(X_train)
test_y_pred = lr.predict(X_test)


print(f'\n train_y_pred:{train_y_pred}')
print(f'\n test_y_pred:{test_y_pred}')

#1.RMSE
# -train 
rmse_train = mean_squared_error(y_true= y_train, y_pred= train_y_pred, squared=False)
print(f'rmse_train:{rmse_train}')

# Plain Vanilla ML Regression model

# X and Y splitting

X = p_data.drop(columns=['id','cut','color', 'clarity'])

y_pred = lr.predict(X)





 train_y_pred:[6613.17213277  741.42472256 7199.99145705 ... 1401.04099596 8425.39094533
  960.48102513]

 test_y_pred:[  214.0648779  16870.23089717  1565.04808358 ...  3698.37328803
  1819.72192038  5730.42376739]
rmse_train:1502.3429346641055


## Model 2: Random Forest

In [6]:
# Import Model specific modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Calling Scaler
sc = StandardScaler()

# X and Y & Scalling
y = t_data['price']
X = sc.fit_transform(pd.get_dummies(t_data.drop(columns=['price','id'])))

# 1. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state=1)

# Fitting & Predicting in Train 
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train, y_train)

# Predicting in Train
train_y_pred = regressor.predict(X_train)
test_y_pred = regressor.predict(X_test)
global_y_pred = regressor.predict(X)

# Evaluating Train Prediction
rmse_train = mean_squared_error(y_true= y_train, y_pred= train_y_pred, squared=False)
print(f'rmse_train:{rmse_train}')

rmse_test = mean_squared_error(y_true= y_test, y_pred= test_y_pred, squared=False)
print(f'rmse_test:{rmse_test}')

rmse_global = mean_squared_error(y_true= y, y_pred= global_y_pred, squared=False)
print(f'rmse_global:{rmse_global}')


# Generating output on predict.csv_data
regressor.fit(X, y)
regressor.fit(X, y)


X = sc.fit_transform(pd.get_dummies(t_data.drop(columns=['price','id'])))


y_pred = regressor.predict(X)

X = sc.fit_transform(pd.get_dummies(p_data.drop(columns=['id'])))


y_pred = regressor.predict(X)

rmse_train:227.6638111861182
rmse_test:557.9186600978602
rmse_global:322.0548012312642


In [7]:
# Looks like the model is overfitting

## Model 3: Lasso Linear Regression

In [8]:
from sklearn.linear_model import Lasso

# Calling Scaler
sc = StandardScaler()

# X and Y & Scalling
y = t_data['price']
X = sc.fit_transform(pd.get_dummies(t_data.drop(columns=['price','id'])))

# Train Test Split
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.3, random_state=31)

# Lasso Call & Fit
lasso = Lasso()
lasso.fit(X_train,y_train)

# RMSE
rmse_train=mean_squared_error(y_train,lasso.predict(X_train),squared=False)
print(f'rmse_train:{rmse_train}')

rmse_test=mean_squared_error(y_test,lasso.predict(X_test),squared=False)
print(f'rmse_test:{rmse_test}')




rmse_train:1139.5514861366867
rmse_test:1113.131335053029
