In [1]:
#%% IMPORTS
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import seaborn as sns
from scipy.stats import normaltest
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from IPython.display import display, Markdown, Latex
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import ExtraTreesRegressor

## Methods

In [2]:
def eliminateOutliers (dataFrame, columnName):
    """Drop the rows whose value in ``columnName`` is an outlier by the IQR rule.

    A row is kept when its value lies inside the Tukey fences
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, fences included. Rows whose value is
    missing are dropped as well, because NaN never compares as inside the
    fences. The shape of the frame before and after filtering is printed.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Data frame where the outliers will be eliminated. It is not modified;
        a filtered frame is returned instead.
    columnName : str
        Name of the column where the outliers will be identified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``dataFrame`` whose ``columnName`` value is within the fences.
    """
    Q1 = dataFrame[columnName].quantile(0.25)
    Q3 = dataFrame[columnName].quantile(0.75)
    IQR = Q3 - Q1
    lowerFence = Q1 - 1.5 * IQR
    upperFence = Q3 + 1.5 * IQR
    print('Initial dataframe size: '+str(dataFrame.shape))
    # Inclusive comparisons: a value sitting exactly on a fence is not an outlier.
    # With strict comparisons a column whose IQR is 0 (e.g. a constant column)
    # would lose every row, because both fences collapse onto the value itself.
    withinFences = (dataFrame[columnName] <= upperFence) & (dataFrame[columnName] >= lowerFence)
    dataFrame = dataFrame[withinFences]
    print('Final dataframe size: '+str(dataFrame.shape))
    return dataFrame

In [3]:
def displayBoxPlotGraphs (dataFrame, propertyOfInterest, columnName1, columnName2, columnName3, columnName4):
    """Draw a 2x2 grid of vertical box plots of one property against four categorical columns.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Data frame associated to the property of interest
        (e.g. dfAirVoids, dfMS, dfMF, dfITS, dfTSR).
    propertyOfInterest : str
        Name of the column plotted on the y axis of every panel.
    columnName1, columnName2, columnName3, columnName4 : str
        Categorical columns plotted on the x axis, one per panel, filling the
        grid row by row (top-left, top-right, bottom-left, bottom-right).

    Returns
    -------
    None
        The figure is created through pyplot and is not returned.
    """
    figure, axesGrid = plt.subplots(2, 2, figsize=(15,10))
    categoricalColumnNames = (columnName1, columnName2, columnName3, columnName4)
    # axesGrid.flat walks the 2x2 array row by row, so panel order follows argument order.
    for panelAxis, categoricalColumn in zip(axesGrid.flat, categoricalColumnNames):
        sns.boxplot(
            y=propertyOfInterest,
            x=categoricalColumn,
            data=dataFrame,
            orient='v',
            ax=panelAxis,
        )

##  Data Import 

In [5]:
#%%DATA READING AND INITIAL PREPROCESSING
# Columns that returnDf converts to float after replacing 'N/a' with 0.
numericColumns = [
    # Aggregate properties
    'Aggregate absorption [%]',
    'Apparent specific gravity',
    # Columns labelled with numbers rather than strings
    # (presumably sieve openings in mm -- TODO confirm against the workbook)
    0.075,
    0.3,
    0.6,
    2.36,
    4.75,
    9.5,
    12.5,
    19,
    # Plastic and mixing parameters
    'Plastic particle size (mm)',
    'Mixing speed (RPM)',
    'Mixing Temperature',
    'Mixing Time (hours)',
    'Plastic Addition by bitumen weight (%)',
    'Bitumen content in the sample',
]

# Columns treated as categorical.
# NOTE(review): 'Aggregate absorption [%]' is also listed in numericColumns and is
# cast to float by returnDf -- confirm whether it really belongs in this list.
categoricalColumns = [
    'Modified asphalt Mix?',
    'Agreggate Type',
    'Aggregate absorption [%]',
    'Filler used',
    'Consolidated bitumen penetration grade',
    'New Plastic Type',
    'Plastic pretreatment',
    'Plastic shape',
    'Plastic Size',
    'Mixing Process',
    'Plastic melted previous to addition?',
    'Aggregates replacement ?',
    'Bitumen replacement?',
    'Filler replacement',
    'Property',
    'Units',
]
def returnDf (propertyOfInterest, filePath='fileML.xlsx'):
    """Load and pre-clean the sheet of one property from the Excel workbook.

    Parameters
    ----------
    propertyOfInterest : str
        Name of the sheet to read: 'AirVoids', 'MS', 'MF', 'ITS' or 'TSR'.
        The column named ``propertyOfInterest + ' ID'`` becomes the index.
    filePath : str, optional
        Path of the Excel workbook. Defaults to 'fileML.xlsx', the file this
        function always read before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        The sheet with every column up to and including 'Units' turned into
        stripped strings, 'NS' replaced by NaN, and the columns listed in
        ``numericColumns`` cast to float.
    """
    df = pd.read_excel(filePath, sheet_name = propertyOfInterest, engine='openpyxl')
    df = df.set_index(propertyOfInterest + ' ID')
    # Stringify and strip in a single pass over the columns up to 'Units'.
    # Note that str() turns an empty (NaN) cell into the text 'nan'.
    df.loc[:,:'Units'] = df.loc[:,:'Units'].applymap(lambda cell: str(cell).strip())
    # 'NS' cells become missing values.
    df = df.replace('NS', np.nan)
    # 'N/a' in a numeric column is replaced by 0 before the float cast.
    df[numericColumns] = df[numericColumns].replace('N/a', 0).astype(float)
    return df

In [6]:
# Load the 'MS' sheet through returnDf; the resulting frame is indexed by 'MS ID'.
dfMS = returnDf('MS')

## 1 Data Exploration
###  1.1 Total Sample

In [7]:
# Remove the rows whose 'MS of the sample (kN)' value falls outside the IQR fences.
# dfMS is overwritten, so re-running this cell filters the already-filtered frame again.
dfMS = eliminateOutliers(dfMS, 'MS of the sample (kN)')

Initial dataframe size: (406, 35)
Final dataframe size: (402, 35)


In [8]:
# Summary statistics for every column from position 2 onward (the first two columns are skipped);
# include='all' makes describe() cover the non-numeric columns as well as the numeric ones.
dfMS.iloc[:,2:].describe(include = 'all')

Unnamed: 0,Modified asphalt Mix?,Agreggate Type,Aggregate absorption [%],Apparent specific gravity,0.075,0.3,0.6,2.36,4.75,9.5,12.5,19,Filler used,Bitumen Type Penetration Grade,Consolidated bitumen penetration grade,New Plastic Type,Plastic pretreatment,Plastic shape,Plastic Size,Plastic particle size (mm),Mixing Process,Plastic melted previous to addition?,Mixing speed (RPM),Mixing Temperature,Mixing Time (hours),Aggregates replacement ?,Bitumen replacement?,Filler replacement,Plastic Addition by bitumen weight (%),Property,Units,Bitumen content in the sample,MS of the sample (kN)
count,402,262,242.0,84.0,325.0,372.0,344.0,355.0,372.0,344.0,357.0,372.0,161,402,402,377,402,402,320,307.0,402,402,371.0,385.0,372.0,402,402,402,400.0,402,402,399.0,402.0
unique,2,7,,,,,,,,,,,5,5,3,10,2,4,3,,3,3,,,,3,3,3,,1,1,,
top,Yes,Granite,,,,,,,,,,,Stone dust,60/70,50/70,PE,Physical,Shredded,Fine,,Dry,No,,,,No,No,No,,MS,kN,,
freq,319,111,,,,,,,,,,,81,141,276,150,319,289,148,,200,240,,,,300,285,317,,402,402,,
mean,,,1.240269,2.686607,6.469785,15.079839,20.374273,37.026789,48.849785,68.082384,80.702129,92.721613,,,,,,,,3.332902,,,568.463612,42.883117,0.110914,,,,19.15397,,,5.198835,14.458749
std,,,0.946592,0.078388,5.324256,5.749322,6.530935,7.607279,8.780095,10.803423,11.58546,7.199423,,,,,,,,5.836038,,,1272.097676,72.981514,0.282826,,,,44.387176,,,0.820112,4.621909
min,,,0.13,2.59,2.19,5.3,9.0,19.77,23.95,47.0,66.63,83.11,,,,,,,,0.0,,,0.0,0.0,0.0,,,,0.0,,,2.5,1.36784
25%,,,0.47,2.6,4.62,10.66,14.77,33.47,41.54,59.85,69.38,84.46,,,,,,,,0.0,,,0.0,0.0,0.0,,,,1.0,,,4.8,10.9275
50%,,,1.02,2.685,5.28,13.78,20.56,35.86,49.35,67.84,79.86,94.94,,,,,,,,2.36,,,0.0,0.0,0.0,,,,5.0,,,5.2,14.69696
75%,,,2.15,2.725,6.24,17.67,22.82,39.0,54.77,75.97,90.0,100.0,,,,,,,,5.0,,,0.0,150.0,0.0,,,,12.0,,,5.5,17.2875
