In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
from sklearn import linear_model, svm, tree, model_selection, preprocessing, pipeline

In [3]:
df = pd.read_csv('compressive.csv')
X, y = np.split( df.to_numpy(), [3], axis=1)
scaled_X = preprocessing.StandardScaler().fit_transform(X,y)
normed_X = preprocessing.Normalizer().fit_transform(X,y)

In [96]:
y = y.flatten()

## Linear models

### Ordinary

In [138]:
lin_reg = linear_model.LinearRegression()
lin_pipe = pipeline.Pipeline(steps=[('polynomial_features',preprocessing.PolynomialFeatures()), ('linear_regression', lin_reg)])
lin_param_grid = { # not really any params to test for ordinary linear
    'polynomial_features__degree':[2,3,4,5]
}
lin_model = model_selection.GridSearchCV(lin_pipe, lin_param_grid, cv=model_selection.LeaveOneOut(), scoring="neg_mean_squared_error")

In [139]:
lin_model.fit(X, y)
lin_results = pd.DataFrame(lin_model.cv_results_)
lin_results # 2nd degree best

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_polynomial_features__degree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split7_test_score,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000929,0.000457,0.000143,0.00035,2,{'polynomial_features__degree': 2},-305809.0,-401956.0,-964324.0,-66564.0,...,-341056.0,-322624.0,-38809.0,-758641.0,-2217121.0,-2601.0,-75625.0,-599754.2,620381.6,1
1,0.000572,0.000495,0.000357,0.000479,3,{'polynomial_features__degree': 3},-380689.0,-3364.0,-407044.0,-3481956.0,...,-17205900.0,-4466882.0,-3404025.0,-1979649.0,-109725600.0,-211600.0,-68306090.0,-18173510.0,31860810.0,2
2,0.000643,0.000479,0.000357,0.000479,4,{'polynomial_features__degree': 4},-149082.223818,-606355.829268,-780396.762122,-6674057.0,...,-8538238.0,-1448044.0,-24239.95,-400741.1,-90656240.0,-215762.004659,-96996060.0,-25731710.0,47705570.0,4
3,0.000786,0.00041,0.000286,0.000452,5,{'polynomial_features__degree': 5},-139903.681639,-558520.081382,-948165.30936,-8107877.0,...,-8266350.0,-1396355.0,-84370.62,-388632.3,-64122390.0,-198537.573979,-78339130.0,-20266280.0,36966620.0,3


In [109]:
print(lin_model.best_params_)
lin_model.best_estimator_.score(X, y)

{'polynomial_features__degree': 2}


0.8107183108801805

### Ridge

In [53]:
ridge_reg = linear_model.Ridge(max_iter=5000)
ridge_pipe = pipeline.Pipeline(steps=[('polynomial_features',preprocessing.PolynomialFeatures()), ('ridge', ridge_reg)])
ridge_param_grid = {
    'polynomial_features__degree':[2,3,4,5],
    'ridge__alpha':[10**k for k in range(0,-6,-1)]
}
ridge_model = model_selection.GridSearchCV(ridge_pipe, ridge_param_grid, cv=model_selection.LeaveOneOut(), scoring="neg_mean_absolute_error")

In [140]:
ridge_model.fit(scaled_X, y)
ridge_results = pd.DataFrame(ridge_model.cv_results_)
ridge_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_polynomial_features__degree,param_ridge__alpha,params,split0_test_score,split1_test_score,split2_test_score,...,split7_test_score,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000714,0.000452,0.000214,0.000411,2,1.0,"{'polynomial_features__degree': 2, 'ridge__alp...",-503.173782,-675.765512,-908.153114,...,-483.942216,-555.243613,-201.308376,-923.578653,-1026.057235,-3.06253,-478.526801,-638.086426,337.102935,1
1,0.000643,0.000479,0.000286,0.000452,2,0.1,"{'polynomial_features__degree': 2, 'ridge__alp...",-556.217651,-635.591263,-976.738828,...,-581.359656,-567.594355,-198.994446,-892.617528,-1426.670351,-49.237731,-279.295922,-662.845489,386.080833,2
2,0.000786,0.00041,0.000286,0.000452,2,0.01,"{'polynomial_features__degree': 2, 'ridge__alp...",-562.120255,-630.951027,-984.59106,...,-591.618838,-569.419947,-199.014581,-888.789662,-1477.487596,-55.172666,-254.025843,-665.828707,394.684793,3
3,0.000714,0.000452,0.000286,0.000452,2,0.001,"{'polynomial_features__degree': 2, 'ridge__alp...",-562.717287,-630.479821,-985.387701,...,-592.650112,-569.60944,-199.019548,-888.398124,-1482.707159,-55.774216,-251.430904,-666.132784,395.594032,4
4,0.000643,0.000479,0.000214,0.00041,2,0.0001,"{'polynomial_features__degree': 2, 'ridge__alp...",-562.777059,-630.432627,-985.467481,...,-592.753293,-569.62846,-199.020074,-888.358881,-1483.230533,-55.834452,-251.170712,-666.163251,395.685457,5
5,0.000643,0.000479,0.000286,0.000452,2,1e-05,"{'polynomial_features__degree': 2, 'ridge__alp...",-562.783036,-630.427907,-985.47546,...,-592.763612,-569.630363,-199.020127,-888.354956,-1483.282885,-55.840477,-251.144686,-666.166298,395.694605,6
6,0.000643,0.000479,0.000357,0.000479,3,1.0,"{'polynomial_features__degree': 3, 'ridge__alp...",-652.591934,-540.019502,-798.55562,...,-1505.827878,-1256.666266,-99.38293,-1551.70579,-491.016492,-15.933458,-1239.296752,-894.528951,531.446265,7
7,0.000643,0.000479,0.000357,0.000479,3,0.1,"{'polynomial_features__degree': 3, 'ridge__alp...",-639.614926,-210.622584,-702.315567,...,-3220.278097,-1856.994203,-968.315637,-1451.877577,-3479.595788,-322.83496,-4466.869846,-1718.193165,1300.679467,15
8,0.000786,0.00041,0.000214,0.00041,3,0.01,"{'polynomial_features__degree': 3, 'ridge__alp...",-625.491144,-77.993459,-646.406224,...,-4014.147849,-2077.123443,-1696.992716,-1392.592624,-8862.290427,-458.006279,-7604.303013,-2659.43253,2722.396476,21
9,0.000857,0.00035,0.000143,0.00035,3,0.001,"{'polynomial_features__degree': 3, 'ridge__alp...",-623.639478,-60.049872,-638.699026,...,-4133.749293,-2109.857317,-1830.612822,-1384.347122,-10282.813701,-476.919075,-8192.855828,-2868.103662,3082.943093,22


In [84]:
print(ridge_model.best_params_)
ridge_model.best_estimator_.score(scaled_X, y) #R^2 of .81

{'polynomial_features__degree': 2, 'ridge__alpha': 1}


0.8105513730287419

### Lasso

In [69]:
lasso_reg = linear_model.Lasso(max_iter=5000)
lasso_pipe = pipeline.Pipeline(steps=[('polynomial_features',preprocessing.PolynomialFeatures()), ('lasso', ridge_reg)])
lasso_param_grid = {
    'polynomial_features__degree':[2,3,4,5],
    'lasso__alpha':[10**k for k in range(2,-6,-1)]
}
lasso_model = model_selection.GridSearchCV(lasso_pipe, lasso_param_grid, cv=model_selection.LeaveOneOut(), scoring="neg_mean_absolute_error")

In [75]:
lasso_model.fit(normed_X, y)
lasso_results = pd.DataFrame(lasso_model.cv_results_)
lasso_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lasso__alpha,param_polynomial_features__degree,params,split0_test_score,split1_test_score,split2_test_score,...,split7_test_score,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000572,0.000495,0.000214,0.000411,100.0,2,"{'lasso__alpha': 100, 'polynomial_features__de...",-202.338198,-901.426295,-408.20179,...,-1459.056234,-1295.334806,-932.574114,-917.70778,-1202.089422,-455.507724,-1447.214786,-928.319882,455.884751,20
1,0.000714,0.000452,0.000286,0.000452,100.0,3,"{'lasso__alpha': 100, 'polynomial_features__de...",-202.241744,-901.447398,-408.241034,...,-1458.945308,-1295.211547,-932.55897,-917.799275,-1202.237782,-455.483374,-1447.113628,-928.322809,455.875666,22
2,0.000714,0.000452,0.000214,0.000411,100.0,4,"{'lasso__alpha': 100, 'polynomial_features__de...",-202.1156,-901.469056,-408.292729,...,-1458.812201,-1295.068922,-932.546523,-917.896673,-1202.404677,-455.451559,-1446.99669,-928.32145,455.865291,21
3,0.000572,0.000495,0.000214,0.00041,100.0,5,"{'lasso__alpha': 100, 'polynomial_features__de...",-201.95122,-901.492173,-408.360322,...,-1458.649532,-1294.90002,-932.536976,-918.002892,-1202.597798,-455.410346,-1446.858433,-928.314955,455.853216,19
4,0.000714,0.000452,0.000286,0.000452,10.0,2,"{'lasso__alpha': 10, 'polynomial_features__deg...",-201.238412,-901.799726,-408.628318,...,-1457.490021,-1293.508775,-932.198766,-919.229114,-1204.259382,-455.233164,-1445.689154,-928.428333,455.729964,24
5,0.000572,0.000495,0.000286,0.000452,10.0,3,"{'lasso__alpha': 10, 'polynomial_features__deg...",-200.291852,-902.008275,-409.012161,...,-1456.389379,-1292.287622,-932.041132,-920.140357,-1205.713136,-454.993437,-1444.683061,-928.456323,455.643164,26
6,0.000572,0.000495,0.000357,0.000479,10.0,4,"{'lasso__alpha': 10, 'polynomial_features__deg...",-199.063063,-902.220845,-409.513711,...,-1455.075075,-1290.881924,-931.906716,-921.107981,-1207.333372,-454.682328,-1443.524837,-928.441666,455.547281,25
7,0.000714,0.000452,0.000214,0.00041,10.0,5,"{'lasso__alpha': 10, 'polynomial_features__deg...",-197.474503,-902.445796,-410.16388,...,-1453.478512,-1289.22781,-931.79645,-922.159437,-1209.189479,-454.282395,-1442.163165,-928.376637,455.44076,23
8,0.000714,0.000452,0.0,0.0,1.0,2,"{'lasso__alpha': 1, 'polynomial_features__degr...",-191.187163,-905.375992,-412.440595,...,-1442.267242,-1275.89451,-928.098939,-934.191135,-1224.162648,-452.67854,-1430.706382,-929.396148,454.386874,29
9,0.000608,0.00047,0.000357,0.000479,1.0,3,"{'lasso__alpha': 1, 'polynomial_features__degr...",-183.297977,-907.241237,-415.539025,...,-1432.054682,-1264.716446,-926.016658,-942.935282,-1236.08919,-450.610787,-1421.156377,-929.563368,453.857902,30


In [85]:
print(lasso_model.best_params_)
lasso_model.best_estimator_.score(normed_X, y) # .75-.8

{'lasso__alpha': 1e-05, 'polynomial_features__degree': 3}


0.7995097247824052

# SVR

In [177]:
svr = svm.SVR()
svr_param_grid = {
    'kernel': ['rbf'],
    'gamma': [10**k for k in range(-3,3)],
    'degree': [2,3,4],
    'C': [10**k for k in range(-2,3)],
    'epsilon': [10**k for k in range(-1,-5,-1)]   
}
svr_rbf_model = model_selection.GridSearchCV(svr, svr_param_grid, cv=model_selection.LeavePOut(2), scoring="r2")

In [178]:
svr_rbf_model.fit(scaled_X, y) # gonna take a hot minute
svr_results = pd.DataFrame(svr_model.cv_results_)
print(svr_rbf_model.best_params_)
best_svr_rbf = svr_rbf_model.best_estimator_
best_svr_rbf.score(scaled_X, y) # 0.21 lol

{'C': 100, 'degree': 2, 'epsilon': 0.1, 'gamma': 1, 'kernel': 'rbf'}


0.2177109097560339

In [159]:
svr = svm.SVR()
svr_param_grid = {
    'kernel': ['poly'],
    'gamma': [10**k for k in range(-2,2)],
    'degree': [2,3,4],
    'C': [10**k for k in range(-2,3)],
    'epsilon': [10**k for k in range(-1,-5,-1)]   
}
svr_model = model_selection.GridSearchCV(svr, svr_param_grid, cv=model_selection.LeaveOneOut(), scoring="neg_mean_absolute_error")

In [160]:
svr_model.fit(scaled_X, y) # gonna take a hot minute
svr_results = pd.DataFrame(svr_model.cv_results_)
print(svr_model.best_params_)
best_svr_poly = svr_model.best_estimator_
best_svr_poly.score(scaled_X, y) # 0.72

{'C': 10, 'degree': 2, 'epsilon': 0.001, 'gamma': 10, 'kernel': 'poly'}


0.7267242896242361

# Optimization

In [118]:
ratios_df = pd.read_csv('ratios.csv')
ratios_X = ratios_df.to_numpy()
scaled_ratios_X = preprocessing.StandardScaler().fit_transform(ratios_X)
normed_ratios_X = preprocessing.Normalizer().fit_transform(ratios_X)

In [120]:
ratios_df

Unnamed: 0,slag,gypsum,cement
0,0.70,0.20,0.10
1,0.70,0.21,0.09
2,0.70,0.22,0.08
3,0.70,0.23,0.07
4,0.70,0.24,0.06
...,...,...,...
68,0.80,0.16,0.04
69,0.80,0.17,0.03
70,0.81,0.15,0.04
71,0.81,0.16,0.03


In [121]:
best_lin = lin_model.best_estimator_
best_ridge = ridge_model.best_estimator_
best_lasso = lasso_model.best_estimator_
best_svr = svr_model.best_estimator_

In [129]:
lin_pred = best_lin.predict(ratios_X)
lin_results = ratios_df.copy()
lin_results['prediction'] = lin_pred
lin_results.sort_values(by='prediction', ascending=False)

Unnamed: 0,slag,gypsum,cement,prediction
70,0.81,0.15,0.04,12960.0
67,0.80,0.15,0.05,12932.0
72,0.82,0.15,0.03,12796.0
63,0.79,0.15,0.06,12712.0
64,0.79,0.16,0.05,12552.0
...,...,...,...,...
37,0.75,0.15,0.10,9952.0
6,0.71,0.19,0.10,9876.0
29,0.74,0.16,0.10,9840.0
13,0.72,0.18,0.10,9804.0


In [133]:
ridge_pred = best_ridge.predict(scaled_ratios_X) #very wet code
ridge_results = ratios_df.copy()
ridge_results['prediction'] = ridge_pred
ridge_results.sort_values(by='prediction', ascending=False)

Unnamed: 0,slag,gypsum,cement,prediction
67,0.80,0.15,0.05,12815.003776
70,0.81,0.15,0.04,12753.962522
63,0.79,0.15,0.06,12662.306594
72,0.82,0.15,0.03,12479.182831
64,0.79,0.16,0.05,12431.948337
...,...,...,...,...
6,0.71,0.19,0.10,9930.548554
37,0.75,0.15,0.10,9914.133503
13,0.72,0.18,0.10,9830.927553
29,0.74,0.16,0.10,9822.720027


In [136]:
lasso_pred = best_lasso.predict(normed_ratios_X) #very wet code
lasso_results = ratios_df.copy()
lasso_results['prediction'] = lasso_pred
lasso_results.sort_values(by='prediction', ascending=False)

Unnamed: 0,slag,gypsum,cement,prediction
70,0.81,0.15,0.04,12870.974278
72,0.82,0.15,0.03,12834.825238
67,0.80,0.15,0.05,12782.774237
63,0.79,0.15,0.06,12561.672126
68,0.80,0.16,0.04,12509.673606
...,...,...,...,...
0,0.70,0.20,0.10,10066.575247
29,0.74,0.16,0.10,10022.286037
6,0.71,0.19,0.10,9950.097850
21,0.73,0.17,0.10,9933.247023


In [137]:
svr_pred = best_svr.predict(scaled_ratios_X) #very wet code
svr_results = ratios_df.copy()
svr_results['prediction'] = svr_pred
svr_results.sort_values(by='prediction', ascending=False)

Unnamed: 0,slag,gypsum,cement,prediction
4,0.70,0.24,0.06,12658.748471
67,0.80,0.15,0.05,12637.802068
5,0.70,0.25,0.05,12587.404615
70,0.81,0.15,0.04,12530.852785
63,0.79,0.15,0.06,12504.859103
...,...,...,...,...
6,0.71,0.19,0.10,9834.744557
13,0.72,0.18,0.10,9648.075239
37,0.75,0.15,0.10,9574.164745
21,0.73,0.17,0.10,9542.422164


## MAE

In [141]:
from sklearn import metrics

In [169]:
print('Linear')
print('RMSE:',metrics.mean_squared_error(y,best_lin.predict(X),squared=False))
print('MAE:',metrics.mean_absolute_error(y,best_lin.predict(X)))
print('R2:',metrics.r2_score(y,best_lin.predict(X)))

Linear
RMSE: 417.81181683350496
MAE: 354.85714285714283
R2: 0.8107183108801805


In [171]:
print('Ridge')
print('RMSE:',metrics.mean_squared_error(y,best_ridge.predict(scaled_X),squared=False))
print('MAE:',metrics.mean_absolute_error(y,best_ridge.predict(scaled_X)))
print('R2:',metrics.r2_score(y,best_ridge.predict(scaled_X)))

Ridge
RMSE: 417.99602174876736
MAE: 356.28221867831115
R2: 0.8105513730287419


In [172]:
print('Lasso')
print('RMSE:',metrics.mean_squared_error(y,best_lasso.predict(normed_X),squared=False))
print('MAE:',metrics.mean_absolute_error(y,best_lasso.predict(normed_X)))
print('R2:',metrics.r2_score(y,best_lasso.predict(normed_X)))

Lasso
RMSE: 430.0045721738734
MAE: 356.07592589288004
R2: 0.7995097247824052


In [179]:
print('SVR RBF')
print('RMSE:',metrics.mean_squared_error(y,best_svr_rbf.predict(scaled_X),squared=False))
print('MAE:',metrics.mean_absolute_error(y,best_svr_rbf.predict(scaled_X)))
print('R2:',metrics.r2_score(y,best_svr_rbf.predict(scaled_X)))

SVR RBF
RMSE: 849.3956924329941
MAE: 747.4105694276384
R2: 0.2177109097560339


In [174]:
print('SVR Poly')
print('RMSE:',metrics.mean_squared_error(y,best_svr_poly.predict(scaled_X),squared=False))
print('MAE',metrics.mean_absolute_error(y,best_svr_poly.predict(scaled_X)))
print('R2',metrics.r2_score(y,best_svr_poly.predict(scaled_X)))

SVR Poly
RMSE: 502.02683627078585
MAE 362.9266730715722
R2 0.7267242896242361
