In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def Database(path='data/Flash Point and Cetane Number Predictions for Fuel Compounds.xls'):
    """Compile the data sets into a data library and return it as a DataFrame.

    The workbook is read twice with different header offsets: once to pick up
    the compound properties, once to pick up the descriptor columns. The two
    pieces are then joined side by side on their positional index.

    Parameters
    ----------
    path : str, optional
        Location of the Excel workbook. Defaults to the file shipped under
        ``data/`` so existing ``Database()`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns 'Name', 'Family', 'FP Exp.', 'CN Exp.' followed by every
        column from '-H' through 'aaCa'.
    """
    # First pass: header taken after skipping 3 rows; keep only the property columns.
    properties = pd.read_excel(path, skiprows=3)[['Name', 'Family', 'FP Exp.', 'CN Exp.']]
    # The first row under that header is discarded (presumably it is the second
    # header row that the next read uses as its header -- TODO confirm), and the
    # index is renumbered from 0 so it lines up with the descriptor rows.
    properties = properties.drop(index=0).reset_index(drop=True)

    # Second pass: header taken after skipping 4 rows; keep the '-H'..'aaCa' slice.
    descriptors = pd.read_excel(path, skiprows=4).loc[:, '-H':'aaCa']

    return pd.concat([properties, descriptors], axis=1)

In [2]:
# Build the compiled table, then keep only the rows whose experimental
# flash point ('FP Exp.') is a finite number.
df = Database()
df = df.loc[np.isfinite(df['FP Exp.'])]

In [3]:
# Hold out 10% of the rows as a test set. The random_state is fixed so the
# split, and therefore every downstream fit and metric, is reproducible after
# a kernel restart.
train, test = train_test_split(df, test_size=0.1, random_state=42)

In [None]:
### Ordinary Least Squares
# Fit scikit-learn's LinearRegression on the descriptor columns ('-H' .. 'aaCa')
# with the experimental flash point ('FP Exp.') as the target.
reg = linear_model.LinearRegression()
train_X = train.loc[:, '-H':'aaCa']
test_X = test.loc[:, '-H':'aaCa']
train_y = train['FP Exp.']
test_y = test['FP Exp.']
reg.fit(train_X, train_y)
print('intercept', reg.intercept_, '\n', 'coef', reg.coef_)
train_predict = reg.predict(train_X)
test_predict = reg.predict(test_X)

# Parity plot: predicted against experimental values for both splits, with the
# y = x diagonal as the reference for a perfect prediction.
fig, ax = plt.subplots()
ax.scatter(train_y, train_predict, label='train')
ax.scatter(test_y, test_predict, color='r', label='test')
ax.plot([200, 550], [200, 550], lw=4, color='black', label='y = x')
ax.set_xlabel('Experimental flash point (FP Exp.)')
ax.set_ylabel('Predicted flash point')
ax.set_title('Ordinary Least Squares: predicted vs. experimental')
ax.legend()

# Goodness of fit on each split ("error" below is the mean squared error).
print("Train r^2", r2_score(train_y, train_predict))
print("Test r^2", r2_score(test_y, test_predict))
print("Train error", mean_squared_error(train_y, train_predict))
print("Test error", mean_squared_error(test_y, test_predict))

In [4]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
%matplotlib inline

  from pandas.core import datetools


In [None]:
# Preview the first rows of the training split.
train.head()

In [17]:
# Regression inputs taken from the training split:
# x holds every column from '-H' through 'aaCa', y holds the 'FP Exp.' column.
x = train.loc[:, '-H':'aaCa']
y = train['FP Exp.']

In [18]:
# Build the linear regression model using StatsModels and fit it.
# NOTE(review): no constant column is added to x before fitting, so this model
# appears to have no intercept term, unlike the LinearRegression fit -- confirm
# this is intended.
mlr = sm.OLS(endog=y, exog=x).fit()

In [19]:
# Show the regression report (fit statistics and per-descriptor coefficient table).
mlr.summary()

  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


0,1,2,3
Dep. Variable:,FP Exp.,R-squared:,0.993
Model:,OLS,Adj. R-squared:,0.992
Method:,Least Squares,F-statistic:,3377.0
Date:,"Sat, 03 Mar 2018",Prob (F-statistic):,0.0
Time:,16:28:12,Log-Likelihood:,-2694.7
No. Observations:,567,AIC:,5433.0
Df Residuals:,545,BIC:,5529.0
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
-H,32.6174,0.346,94.363,0.000,31.938,33.296
-CH3,17.9978,1.430,12.586,0.000,15.189,20.807
-CH2-,-55.5754,0.878,-63.279,0.000,-57.301,-53.850
>CH-,-135.3812,3.447,-39.279,0.000,-142.152,-128.611
>C<,-210.7596,5.661,-37.228,0.000,-221.880,-199.639
=CH2,56.1497,2.595,21.636,0.000,51.052,61.247
=CH-,-31.6192,1.406,-22.485,0.000,-34.382,-28.857
=C<,-109.8484,5.593,-19.642,0.000,-120.834,-98.863
#CH,-4.56e-14,1.17e-14,-3.900,0.000,-6.86e-14,-2.26e-14

0,1,2,3
Omnibus:,110.138,Durbin-Watson:,1.938
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1122.367
Skew:,-0.509,Prob(JB):,1.91e-244
Kurtosis:,9.817,Cond. No.,3.31e+16


According to the result, the F-statistic is large, which indicates a significant relationship between flash point and the functional groups. According to the t-tests, #C- and -OH (phenol) are not good descriptors, so we decided to remove them.

In [None]:
# Show only the first rows; displaying the whole training frame floods the output.
train.head()

In [7]:
# Copy of the training split without the two descriptor columns being removed.
train1 = train.drop(['#C-', '-OH (phenol)'], axis=1)

In [16]:
# Refit the StatsModels OLS on the reduced descriptor set and show its report.
x1 = train1.loc[:, '-H':'aaCa']
y1 = train1['FP Exp.']
mlr2 = sm.OLS(endog=y1, exog=x1).fit()
mlr2.summary()

  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


0,1,2,3
Dep. Variable:,FP Exp.,R-squared:,0.993
Model:,OLS,Adj. R-squared:,0.992
Method:,Least Squares,F-statistic:,3712.0
Date:,"Sat, 03 Mar 2018",Prob (F-statistic):,0.0
Time:,16:26:10,Log-Likelihood:,-2695.9
No. Observations:,567,AIC:,5432.0
Df Residuals:,547,BIC:,5519.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
-H,32.6492,0.344,94.877,0.000,31.973,33.325
-CH3,18.0850,1.426,12.679,0.000,15.283,20.887
-CH2-,-55.6565,0.874,-63.657,0.000,-57.374,-53.939
>CH-,-135.5352,3.442,-39.378,0.000,-142.296,-128.774
>C<,-211.5204,5.638,-37.516,0.000,-222.595,-200.445
=CH2,56.4823,2.584,21.855,0.000,51.406,61.559
=CH-,-31.7386,1.404,-22.600,0.000,-34.497,-28.980
=C<,-109.8739,5.592,-19.649,0.000,-120.858,-98.890
#CH,-1.739e-13,4.28e-15,-40.649,0.000,-1.82e-13,-1.65e-13

0,1,2,3
Omnibus:,105.686,Durbin-Watson:,1.948
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1057.764
Skew:,-0.474,Prob(JB):,2.04e-230
Kurtosis:,9.624,Cond. No.,3.3e+16
