In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score,explained_variance_score

# read in the excel file
df = pd.read_excel('data/Pumpkin_Seeds_Dataset.xlsx')
df.head(5)

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Convex_Area,Equiv_Diameter,Eccentricity,Solidity,Extent,Roundness,Aspect_Ration,Compactness,Class
0,56276,888.242,326.1485,220.2388,56831,267.6805,0.7376,0.9902,0.7453,0.8963,1.4809,0.8207,Çerçevelik
1,76631,1068.146,417.1932,234.2289,77280,312.3614,0.8275,0.9916,0.7151,0.844,1.7811,0.7487,Çerçevelik
2,71623,1082.987,435.8328,211.0457,72663,301.9822,0.8749,0.9857,0.74,0.7674,2.0651,0.6929,Çerçevelik
3,66458,992.051,381.5638,222.5322,67118,290.8899,0.8123,0.9902,0.7396,0.8486,1.7146,0.7624,Çerçevelik
4,66107,998.146,383.8883,220.4545,67117,290.1207,0.8187,0.985,0.6752,0.8338,1.7413,0.7557,Çerçevelik


In [4]:
# prepare for experiment, only numeric X's
numeric_cols = df.columns[df.dtypes != 'object']

# drop y
numeric_cols = numeric_cols.drop('Perimeter')



# drop X cols with nulls
cols_with_nulls = numeric_cols[df[numeric_cols].isnull().sum(axis=0) > 0]
numeric_cols = numeric_cols.drop(cols_with_nulls)

cols = numeric_cols.to_list()




#getting dummies for y-class
y = df['Perimeter']
y = pd.get_dummies(y, drop_first=True)
X = df[cols]

# X.head(2),y.head(2)
cols


['Area',
 'Major_Axis_Length',
 'Minor_Axis_Length',
 'Convex_Area',
 'Equiv_Diameter',
 'Eccentricity',
 'Solidity',
 'Extent',
 'Roundness',
 'Aspect_Ration',
 'Compactness']

In [5]:
#spliting for train/test

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size =0.3)

In [6]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((1750, 11), (750, 11), (1750, 2489), (750, 2489))

In [7]:
#create the random forest model
model =  RandomForestRegressor(n_estimators = 100, random_state =42) 



In [8]:
param_grid = {'max_depth': [4, 10, 15, None],
              'max_features': ['auto', 'sqrt'],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}

In [9]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2,verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 72 candidates, totalling 144 fits


In [10]:
print("Best hyperparameters:", grid_search.best_params_)

print("Best score:", grid_search.best_score_)

Best hyperparameters: {'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5}
Best score: 0.30012205567162886


In [None]:
#fitting the model
# fitModel = model.fit(X_train,y_train)


# .values will give the values in a numpy array (shape: (n,1))

# .ravel will convert that array shape to (n, ) (i.e. flatten it)

fitModel = model.fit(X_train,y_train.values.ravel())

### Doing Metrics

In [None]:
#The training r_sq
print("score:%.2f" %fitModel.score(X_train,y_train))

In [None]:
#Prediction on training dataset
y_trainPred = fitModel.predict(X_train)
print("mae: %.2f" %mean_absolute_error(y_train,y_trainPred))

In [None]:
#r2
print("R_sq:%.2f" %r2_score(y_train,y_trainPred))

In [None]:
print("mse: %.2f" %mean_squared_error(y_train,y_trainPred))

In [None]:
#explained variance score
print("evs: %.2f" %explained_variance_score(y_train,y_trainPred))

### Predictions on Test data

In [None]:
y_testPred = fitModel.predict(X_test)
y_testPred[:10]

In [None]:
print("R_sq:%.2f" %r2_score(y_test,y_testPred))

In [None]:
print("mse: %.2f" %mean_squared_error(y_test,y_testPred))

In [None]:
print("rmse: %.2f" %np.sqrt(mean_squared_error(y_test,y_testPred)))

In [None]:
print("evs: %.2f" %explained_variance_score(y_test,y_testPred))

### Visualizations of Observed and Predicted


In [None]:
#setting boundaries and parameters
plt.rcParams['figure.figsize'] = (10,6)
x_ax = range(len(X_test))
# plt.plot( linestyle='--', marker='o', color='b', label='line with marker')

#plotting
plt.scatter(x =y_test,y= y_testPred,label ="P vs O",color = '--ob')
# plt.(x_ax,y_testPred,label='Predicted',marker = 'o',color = 'orange',linestyle='-')



# # Add labels to your graph
plt.xlabel('Observed')
plt.ylabel('Predictions')
plt.title("Observed vs Predict")
plt.legend()
plt.show()


In [None]:
plt.figure(figsize  = (10,10))
data = X.corr()
sns.heatmap(data, annot = True)

In [None]:
param_grid = {'max_depth': [2, 5, 10, None],

              'max_features': ['auto', 'sqrt'],

              'min_samples_split': [2, 5, 10],

              'min_samples_leaf': [1, 2, 4]}




grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=5)

grid_search.fit(X_train, y_train)




print("Best hyperparameters:", grid_search.best_params_)

print("Best score:", grid_search.best_score_)