In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn import linear_model, svm, tree, model_selection, preprocessing, pipeline

In [3]:
df = pd.read_csv('compressive.csv')
X, y = np.split( df.to_numpy(), [3], axis=1)
scaled_X = preprocessing.StandardScaler().fit_transform(X,y)
normed_X = preprocessing.Normalizer().fit_transform(X,y)

In [4]:
y = y.flatten()

## Linear models

### Ordinary

In [5]:
lin_reg = linear_model.LinearRegression()
lin_pipe = pipeline.Pipeline(steps=[('polynomial_features',preprocessing.PolynomialFeatures()), ('linear_regression', lin_reg)])
lin_param_grid = { # not really any params to test for ordinary linear
    'polynomial_features__degree':[2,3,4,5]
}
lin_model = model_selection.GridSearchCV(lin_pipe, lin_param_grid, cv=model_selection.LeaveOneOut(), scoring="neg_mean_squared_error")

In [6]:
lin_model.fit(X, y)
lin_results = pd.DataFrame(lin_model.cv_results_)
lin_results # 2nd degree best

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_polynomial_features__degree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split7_test_score,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00158,0.000583,0.000684,0.000177,2,{'polynomial_features__degree': 2},-342225.0,-343396.0,-996004.0,-70756.0,...,-341056.0,-341056.0,-38809.0,-829921.0,-2193361.0,-3025.0,-63001.0,-605832.5,622903.1,1
1,0.001951,0.000825,0.000952,0.000522,3,{'polynomial_features__degree': 3},-400689.0,-2500.0,-417316.0,-3478225.0,...,-17139600.0,-4468996.0,-3404025.0,-2070721.0,-109641800.0,-211600.0,-68301960.0,-18190970.0,31845260.0,2
2,0.001143,0.00026,0.00052,0.000124,4,{'polynomial_features__degree': 4},-149067.774417,-606329.702286,-780390.88864,-6674046.0,...,-8538253.0,-1448089.0,-24095.52,-400740.7,-90651390.0,-215762.759055,-96996030.0,-25731350.0,47705100.0,4
3,0.001014,1.7e-05,0.000466,2e-06,5,{'polynomial_features__degree': 5},-139905.253641,-558524.101146,-948161.557576,-8107881.0,...,-8266350.0,-1396346.0,-84373.02,-388632.4,-64122350.0,-198537.59899,-78442250.0,-20273650.0,36978220.0,3


In [7]:
print(lin_model.best_params_)
lin_model.best_estimator_.score(X, y)

{'polynomial_features__degree': 2}


0.8121464813411543

### Ridge

In [8]:
ridge_reg = linear_model.Ridge(max_iter=5000)
ridge_pipe = pipeline.Pipeline(steps=[('polynomial_features',preprocessing.PolynomialFeatures()), ('ridge', ridge_reg)])
ridge_param_grid = {
    'polynomial_features__degree':[2,3,4,5],
    'ridge__alpha':[10**k for k in range(0,-6,-1)]
}
ridge_model = model_selection.GridSearchCV(ridge_pipe, ridge_param_grid, cv=model_selection.LeaveOneOut(), scoring="neg_mean_absolute_error")

In [9]:
ridge_model.fit(scaled_X, y)
ridge_results = pd.DataFrame(ridge_model.cv_results_)
ridge_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_polynomial_features__degree,param_ridge__alpha,params,split0_test_score,split1_test_score,split2_test_score,...,split7_test_score,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002767,0.002373,0.000864,0.00028,2,1.0,"{'polynomial_features__degree': 2, 'ridge__alp...",-503.173782,-675.765512,-908.153114,...,-483.942216,-555.243613,-201.308376,-923.578653,-1026.057235,-3.06253,-478.526801,-638.086426,337.102935,1
1,0.001566,0.000747,0.000592,0.000178,2,0.1,"{'polynomial_features__degree': 2, 'ridge__alp...",-556.217651,-635.591263,-976.738828,...,-581.359656,-567.594355,-198.994446,-892.617528,-1426.670351,-49.237731,-279.295922,-662.845489,386.080833,2
2,0.000998,3.9e-05,0.000434,7e-06,2,0.01,"{'polynomial_features__degree': 2, 'ridge__alp...",-562.120255,-630.951027,-984.59106,...,-591.618838,-569.419947,-199.014581,-888.789662,-1477.487596,-55.172666,-254.025843,-665.828707,394.684793,3
3,0.001222,0.000653,0.000535,0.000254,2,0.001,"{'polynomial_features__degree': 2, 'ridge__alp...",-562.717287,-630.479821,-985.387701,...,-592.650112,-569.60944,-199.019548,-888.398124,-1482.707159,-55.774216,-251.430904,-666.132784,395.594032,4
4,0.001118,0.000236,0.000501,0.000195,2,0.0001,"{'polynomial_features__degree': 2, 'ridge__alp...",-562.777059,-630.432627,-985.467481,...,-592.753293,-569.62846,-199.020074,-888.358881,-1483.230533,-55.834452,-251.170712,-666.163251,395.685457,5
5,0.001047,0.0001,0.000454,4e-05,2,1e-05,"{'polynomial_features__degree': 2, 'ridge__alp...",-562.783036,-630.427907,-985.47546,...,-592.763612,-569.630363,-199.020127,-888.354956,-1483.282885,-55.840477,-251.144686,-666.166298,395.694605,6
6,0.00106,6.9e-05,0.000456,2.8e-05,3,1.0,"{'polynomial_features__degree': 3, 'ridge__alp...",-652.591934,-540.019502,-798.55562,...,-1505.827878,-1256.666266,-99.38293,-1551.70579,-491.016492,-15.933458,-1239.296752,-894.528951,531.446265,7
7,0.00104,5.4e-05,0.000449,1.4e-05,3,0.1,"{'polynomial_features__degree': 3, 'ridge__alp...",-639.614926,-210.622584,-702.315567,...,-3220.278097,-1856.994203,-968.315637,-1451.877577,-3479.595788,-322.83496,-4466.869846,-1718.193165,1300.679467,15
8,0.001044,5.8e-05,0.000445,8e-06,3,0.01,"{'polynomial_features__degree': 3, 'ridge__alp...",-625.491144,-77.993459,-646.406224,...,-4014.147849,-2077.123443,-1696.992716,-1392.592624,-8862.290427,-458.006279,-7604.303013,-2659.43253,2722.396476,21
9,0.00106,8.4e-05,0.000455,3.4e-05,3,0.001,"{'polynomial_features__degree': 3, 'ridge__alp...",-623.639478,-60.049872,-638.699026,...,-4133.749293,-2109.857317,-1830.612822,-1384.347122,-10282.813701,-476.919075,-8192.855828,-2868.103662,3082.943093,22


In [10]:
print(ridge_model.best_params_)
ridge_model.best_estimator_.score(scaled_X, y) #R^2 of .81

{'polynomial_features__degree': 2, 'ridge__alpha': 1}


0.8105513730287419

### Lasso

In [11]:
lasso_reg = linear_model.Lasso(max_iter=5000)
lasso_pipe = pipeline.Pipeline(steps=[('polynomial_features',preprocessing.PolynomialFeatures()), ('lasso', ridge_reg)])
lasso_param_grid = {
    'polynomial_features__degree':[2,3,4,5],
    'lasso__alpha':[10**k for k in range(2,-6,-1)]
}
lasso_model = model_selection.GridSearchCV(lasso_pipe, lasso_param_grid, cv=model_selection.LeaveOneOut(), scoring="neg_mean_absolute_error")

In [12]:
lasso_model.fit(normed_X, y)
lasso_results = pd.DataFrame(lasso_model.cv_results_)
lasso_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lasso__alpha,param_polynomial_features__degree,params,split0_test_score,split1_test_score,split2_test_score,...,split7_test_score,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001764,0.00032,0.000813,0.000318,100.0,2,"{'lasso__alpha': 100, 'polynomial_features__de...",-202.338198,-901.426295,-408.20179,...,-1459.056234,-1295.334806,-932.574114,-917.70778,-1202.089422,-455.507724,-1447.214786,-928.319882,455.884751,20
1,0.002104,0.000504,0.000924,0.000237,100.0,3,"{'lasso__alpha': 100, 'polynomial_features__de...",-202.241744,-901.447398,-408.241034,...,-1458.945308,-1295.211547,-932.55897,-917.799275,-1202.237782,-455.483374,-1447.113628,-928.322809,455.875666,22
2,0.001481,0.000699,0.000588,0.000264,100.0,4,"{'lasso__alpha': 100, 'polynomial_features__de...",-202.1156,-901.469056,-408.292729,...,-1458.812201,-1295.068922,-932.546523,-917.896673,-1202.404677,-455.451559,-1446.99669,-928.32145,455.865291,21
3,0.00107,8.7e-05,0.000478,5.3e-05,100.0,5,"{'lasso__alpha': 100, 'polynomial_features__de...",-201.95122,-901.492173,-408.360322,...,-1458.649532,-1294.90002,-932.536976,-918.002892,-1202.597798,-455.410346,-1446.858433,-928.314955,455.853216,19
4,0.000976,2.2e-05,0.000439,2.2e-05,10.0,2,"{'lasso__alpha': 10, 'polynomial_features__deg...",-201.238412,-901.799726,-408.628318,...,-1457.490021,-1293.508775,-932.198766,-919.229114,-1204.259382,-455.233164,-1445.689154,-928.428333,455.729964,24
5,0.001027,4.7e-05,0.000445,1.2e-05,10.0,3,"{'lasso__alpha': 10, 'polynomial_features__deg...",-200.291852,-902.008275,-409.012161,...,-1456.389379,-1292.287622,-932.041132,-920.140357,-1205.713136,-454.993437,-1444.683061,-928.456323,455.643164,26
6,0.001023,1.6e-05,0.000465,5.4e-05,10.0,4,"{'lasso__alpha': 10, 'polynomial_features__deg...",-199.063063,-902.220845,-409.513711,...,-1455.075075,-1290.881924,-931.906716,-921.107981,-1207.333372,-454.682328,-1443.524837,-928.441666,455.547281,25
7,0.00104,1.2e-05,0.000462,1e-05,10.0,5,"{'lasso__alpha': 10, 'polynomial_features__deg...",-197.474503,-902.445796,-410.16388,...,-1453.478512,-1289.22781,-931.79645,-922.159437,-1209.189479,-454.282395,-1442.163165,-928.376637,455.44076,23
8,0.000961,1.5e-05,0.000434,2.2e-05,1.0,2,"{'lasso__alpha': 1, 'polynomial_features__degr...",-191.187163,-905.375992,-412.440595,...,-1442.267242,-1275.89451,-928.098939,-934.191135,-1224.162648,-452.67854,-1430.706382,-929.396148,454.386874,29
9,0.001073,0.000131,0.000448,1.5e-05,1.0,3,"{'lasso__alpha': 1, 'polynomial_features__degr...",-183.297977,-907.241237,-415.539025,...,-1432.054682,-1264.716446,-926.016658,-942.935282,-1236.08919,-450.610787,-1421.156377,-929.563368,453.857902,30


In [13]:
print(lasso_model.best_params_)
lasso_model.best_estimator_.score(normed_X, y) # .75-.8

{'lasso__alpha': 1e-05, 'polynomial_features__degree': 3}


0.7995097247824052

# SVR

In [14]:
svr = svm.SVR()
svr_param_grid = {
    'kernel': ['rbf'],
    'gamma': [10**k for k in range(-4,4)],
    'degree': [2],
    'C': [10**k for k in range(-2,3)],
    'epsilon': [10**k for k in range(-1,-5,-1)]   
}
svr_rbf_model = model_selection.GridSearchCV(svr, svr_param_grid, cv=model_selection.LeavePOut(2), scoring="r2")

In [15]:
svr_rbf_model.fit(scaled_X, y) # gonna take a hot minute
#svr_results = pd.DataFrame(svr_rbf_model.cv_results_)
print(svr_rbf_model.best_params_)
best_svr_rbf = svr_rbf_model.best_estimator_
best_svr_rbf.score(scaled_X, y) # 0.21 lol

{'C': 100, 'degree': 2, 'epsilon': 0.1, 'gamma': 1, 'kernel': 'rbf'}


0.2177109097560339

In [16]:
svr = svm.SVR()
svr_param_grid = {
    'kernel': ['poly'],
    'gamma': [10**k for k in range(-2,2)],
    'degree': [2,3,4],
    'C': [10**k for k in range(-2,3)],
    'epsilon': [10**k for k in range(-1,-5,-1)]   
}
svr_poly_model = model_selection.GridSearchCV(svr, svr_param_grid, cv=model_selection.LeaveOneOut(), scoring="neg_mean_absolute_error")

In [17]:
svr_poly_model.fit(scaled_X, y) # gonna take a hot minute
#svr_results = pd.DataFrame(svr_poly_model.cv_results_)
print(svr_poly_model.best_params_)
best_svr_poly = svr_poly_model.best_estimator_
best_svr_poly.score(scaled_X, y) # 0.72

{'C': 10, 'degree': 2, 'epsilon': 0.001, 'gamma': 10, 'kernel': 'poly'}


0.7267242896242361

# Optimization

In [18]:
ratios_df = pd.read_csv('ratios_thousandths.csv', names=['slag','gypsum','cement'])
ratios_X = ratios_df.to_numpy()
scaled_ratios_X = preprocessing.StandardScaler().fit_transform(ratios_X)
normed_ratios_X = preprocessing.Normalizer().fit_transform(ratios_X)

In [19]:
ratios_df

Unnamed: 0,slag,gypsum,cement
0,0.700,0.200,0.100
1,0.700,0.201,0.099
2,0.700,0.202,0.098
3,0.700,0.203,0.097
4,0.700,0.204,0.096
...,...,...,...
5891,0.818,0.151,0.031
5892,0.818,0.152,0.030
5893,0.819,0.150,0.031
5894,0.819,0.151,0.030


In [22]:
best_lin = lin_model.best_estimator_
best_ridge = ridge_model.best_estimator_
best_lasso = lasso_model.best_estimator_
best_svr = svr_poly_model.best_estimator_

In [23]:
lin_pred = best_lin.predict(ratios_X)
lin_results = ratios_df.copy()
lin_results['predicted psi'] = lin_pred
lin_results.sort_values(by='predicted psi', ascending=False)

Unnamed: 0,slag,gypsum,cement,predicted psi
5818,0.809,0.150,0.041,12964.0
5791,0.807,0.150,0.043,12964.0
5776,0.806,0.150,0.044,12960.0
5760,0.805,0.150,0.045,12960.0
5805,0.808,0.150,0.042,12956.0
...,...,...,...,...
1778,0.728,0.172,0.100,9784.0
1920,0.730,0.170,0.100,9784.0
1991,0.731,0.169,0.100,9784.0
1636,0.726,0.174,0.100,9784.0


In [25]:
ridge_pred = best_ridge.predict(scaled_ratios_X) #very wet code
ridge_results = ratios_df.copy()
ridge_results['predicted psi'] = ridge_pred
ridge_results.sort_values(by='predicted psi', ascending=False)

Unnamed: 0,slag,gypsum,cement,predicted psi
5686,0.801,0.150,0.049,13010.969660
5706,0.802,0.150,0.048,13010.408482
5665,0.800,0.150,0.050,13008.864670
5725,0.803,0.150,0.047,13007.181136
5643,0.799,0.150,0.051,13004.093510
...,...,...,...,...
1849,0.729,0.171,0.100,9370.846047
2133,0.733,0.167,0.100,9370.133731
1920,0.730,0.170,0.100,9369.515499
2062,0.732,0.168,0.100,9369.159341


In [26]:
lasso_pred = best_lasso.predict(normed_ratios_X) #very wet code
lasso_results = ratios_df.copy()
lasso_results['predicted psi'] = lasso_pred
lasso_results.sort_values(by='predicted psi', ascending=False)

Unnamed: 0,slag,gypsum,cement,predicted psi
5851,0.812,0.150,0.038,12873.423341
5841,0.811,0.150,0.039,12872.816238
5860,0.813,0.150,0.037,12872.803970
5830,0.810,0.150,0.040,12870.974278
5868,0.814,0.150,0.036,12870.966478
...,...,...,...,...
1140,0.719,0.181,0.100,9908.528354
1423,0.723,0.177,0.100,9908.226036
1210,0.720,0.180,0.100,9907.430912
1352,0.722,0.178,0.100,9907.285050


In [27]:
svr_pred = best_svr.predict(scaled_ratios_X) #very wet code
svr_results = ratios_df.copy()
svr_results['predicted psi'] = svr_pred
svr_results.sort_values(by='predicted psi', ascending=False)

Unnamed: 0,slag,gypsum,cement,predicted psi
5665,0.800,0.150,0.050,12824.052192
5643,0.799,0.150,0.051,12823.085906
5686,0.801,0.150,0.049,12822.036097
42,0.700,0.242,0.058,12821.997911
41,0.700,0.241,0.059,12820.334688
...,...,...,...,...
2275,0.735,0.165,0.100,9047.771701
2559,0.739,0.161,0.100,9046.183663
2346,0.736,0.164,0.100,9045.910945
2488,0.738,0.162,0.100,9045.116926


## Optimize using coefficients

In [33]:
best_lin.named_steps['polynomial_features'].powers_

array([[0, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [2, 0, 0],
       [1, 1, 0],
       [1, 0, 1],
       [0, 2, 0],
       [0, 1, 1],
       [0, 0, 2]], dtype=int64)

In [30]:
best_lin.named_steps['linear_regression'].coef_

array([ 0.00000000e+00,  5.31918641e+16,  3.20530445e+16,  1.27808151e+16,
       -2.63715650e+16, -3.16043105e+16, -1.23320810e+16, -5.23274544e+15,
        8.80673859e+15,  1.40394840e+16])

In [34]:
best_ridge.named_steps['polynomial_features'].powers_

array([[0, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [2, 0, 0],
       [1, 1, 0],
       [1, 0, 1],
       [0, 2, 0],
       [0, 1, 1],
       [0, 0, 2]], dtype=int64)

In [32]:
best_ridge.named_steps['ridge'].coef_

array([   0.        ,  118.58267342, -129.74631374,   32.18547591,
        101.44385246, -190.71459712,  135.30656095,    0.99130658,
        228.99231599, -472.98499073])

In [None]:
import scipy

In [38]:
normed_X

array([[0.94925348, 0.31208334, 0.03901042],
       [0.96241971, 0.26011343, 0.07803403],
       [0.96840725, 0.23241774, 0.09038468],
       [0.97683328, 0.20037606, 0.07514102],
       [0.93961848, 0.33557803, 0.06711561],
       [0.94394271, 0.32775789, 0.03933095],
       [0.95886293, 0.28126646, 0.03835452],
       [0.98102294, 0.1839418 , 0.06131393],
       [0.97642545, 0.21011687, 0.04943926],
       [0.9640377 , 0.23770792, 0.11885396],
       [0.96719626, 0.25121981, 0.03768297],
       [0.97230559, 0.19446112, 0.12964074],
       [0.96647559, 0.24484048, 0.07731805],
       [0.95257934, 0.27216553, 0.13608276]])

In [35]:
best_lasso.named_steps['polynomial_features'].powers_

array([[0, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [2, 0, 0],
       [1, 1, 0],
       [1, 0, 1],
       [0, 2, 0],
       [0, 1, 1],
       [0, 0, 2],
       [3, 0, 0],
       [2, 1, 0],
       [2, 0, 1],
       [1, 2, 0],
       [1, 1, 1],
       [1, 0, 2],
       [0, 3, 0],
       [0, 2, 1],
       [0, 1, 2],
       [0, 0, 3]], dtype=int64)

In [36]:
best_lasso.named_steps['lasso'].coef_

array([      0.        ,   17026.40764274,    3110.85845006,
         19536.61837338,   40740.55302584,  -21360.67822699,
          1112.85565109,   99797.39606905,  104371.63292165,
       -140537.94909489,   70336.16257551,  -42155.50772207,
        -14941.77587431,   83618.92918907,   92155.30760384,
       -136928.68412186,   70257.64567564,   65810.67197098,
        -24991.27950351,  -31332.27772329])

## MAE

In [141]:
from sklearn import metrics

In [169]:
print('Linear')
print('RMSE:',metrics.mean_squared_error(y,best_lin.predict(X),squared=False))
print('MAE:',metrics.mean_absolute_error(y,best_lin.predict(X)))
print('R2:',metrics.r2_score(y,best_lin.predict(X)))

Linear
RMSE: 417.81181683350496
MAE: 354.85714285714283
R2: 0.8107183108801805


In [171]:
print('Ridge')
print('RMSE:',metrics.mean_squared_error(y,best_ridge.predict(scaled_X),squared=False))
print('MAE:',metrics.mean_absolute_error(y,best_ridge.predict(scaled_X)))
print('R2:',metrics.r2_score(y,best_ridge.predict(scaled_X)))

Ridge
RMSE: 417.99602174876736
MAE: 356.28221867831115
R2: 0.8105513730287419


In [172]:
print('Lasso')
print('RMSE:',metrics.mean_squared_error(y,best_lasso.predict(normed_X),squared=False))
print('MAE:',metrics.mean_absolute_error(y,best_lasso.predict(normed_X)))
print('R2:',metrics.r2_score(y,best_lasso.predict(normed_X)))

Lasso
RMSE: 430.0045721738734
MAE: 356.07592589288004
R2: 0.7995097247824052


In [179]:
print('SVR RBF')
print('RMSE:',metrics.mean_squared_error(y,best_svr_rbf.predict(scaled_X),squared=False))
print('MAE:',metrics.mean_absolute_error(y,best_svr_rbf.predict(scaled_X)))
print('R2:',metrics.r2_score(y,best_svr_rbf.predict(scaled_X)))

SVR RBF
RMSE: 849.3956924329941
MAE: 747.4105694276384
R2: 0.2177109097560339


In [174]:
print('SVR Poly')
print('RMSE:',metrics.mean_squared_error(y,best_svr_poly.predict(scaled_X),squared=False))
print('MAE',metrics.mean_absolute_error(y,best_svr_poly.predict(scaled_X)))
print('R2',metrics.r2_score(y,best_svr_poly.predict(scaled_X)))

SVR Poly
RMSE: 502.02683627078585
MAE 362.9266730715722
R2 0.7267242896242361
