# A short analysis of the compressive strength of concrete. 
> Dataset from https://www.kaggle.com/datasets/sinamhd9/concrete-comprehensive-strength

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style("darkgrid", {"axes.facecolor": ".95"})
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

from timeit import default_timer as timer

import xml.etree.ElementTree as ET
from xml.dom import minidom
import os
import io
from datetime import datetime

In [None]:
%pip install xlwt

### Exploratory Data Analysis

In [None]:
# Load the raw concrete dataset and print a summary of its columns.
raw_data_path = 'data/Concrete_Data.xls'
df = pd.read_excel(raw_data_path)
df.info()

##### Save df.info() for documentation purposes

In [None]:
# Capture the text df.info() would normally print so it can be stored on disk.
buffer = io.StringIO()
df.info(buf=buffer)
s = buffer.getvalue()

# open() does not create missing directories, so make sure the output folder
# exists; otherwise a fresh checkout without ./Metadata/ fails here.
os.makedirs("./Metadata/", exist_ok=True)

with open(os.path.join("./Metadata/", "df_info.txt"), "w", encoding="utf-8") as f:
    f.write(s)


In [None]:
# One histogram per column to inspect how each variable is distributed.
df.hist(figsize=(18, 12), bins=50)
plt.show()

##### Rename the columns to short names for ease of use

In [None]:
# Keep a reference to the headers as they were before renaming.
old_columns = df.columns

# Short, code-friendly names, assigned positionally to the existing columns.
short_names = [
    'cement',
    'slag',
    'ash',
    'water',
    'superplasticizer',
    'coarse_agg',
    'fine_agg',
    'age',
    'strength',
]
df.columns = short_names

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
corr_matrix = df.corr()

plt.figure(figsize=(18, 12))
plt.title('Average correlation of features')
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm_r')
plt.show()

In [None]:
# Correlation of every column with the target, largest value first.
strength_corr = df.corr()['strength']
strength_corr.sort_values(ascending=False)

In [None]:
# Every column except the target is plotted against 'strength'.
columns = df.columns.drop(['strength'])

# Build the palette once and size it from the number of features, so the loop
# neither rebuilds it on every pass nor runs out of colours when the number of
# columns changes.
palette = sns.color_palette("Paired", len(columns))

for column, color in zip(columns, palette):
    # Scatter plot with a fitted regression line plus marginal distributions.
    sns.jointplot(x=column,
                  y='strength',
                  data=df,
                  kind='reg',
                  color=color)

plt.show()

#### Feature engineering

In [None]:
def calc_mi_scores(X, y, random_state=42):
    """Return the mutual information between each feature and the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names become the index of the result.
    y : array-like
        Target values aligned with the rows of ``X``.
    random_state : int or None, default 42
        Seed forwarded to ``mutual_info_regression``. The estimator is
        randomised, so a fixed seed keeps the scores identical between runs.
        Pass ``None`` to get the unseeded behaviour.

    Returns
    -------
    pandas.Series
        Series named "MI Scores", indexed by feature name and sorted in
        descending order.
    """
    mi_scores = mutual_info_regression(X, y, random_state=random_state)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

In [None]:
# Work on a copy so the original frame stays available for the comparison
# without feature engineering.
fe_df = df.copy()

# Total amount of material in the mix: the sum of the ingredient columns.
# NOTE(review): 'age' looks like a curing time rather than an ingredient, so it
# is excluded here together with the target. The previous sum included it,
# which distorted every ratio derived from 'total'.
fe_df['total'] = fe_df.drop(['age', 'strength'], axis=1).sum(axis=1)

# NOTE(review): despite its name this is the share of slag, ash and aggregates
# in the total mix, not a root of the cement amount. The column name is kept so
# the exported files and downstream code do not change.
fe_df['cement_root'] = (fe_df['slag'] + fe_df['ash'] + fe_df['coarse_agg'] + fe_df['fine_agg']) / fe_df['total']

# Share of water in the total mix.
fe_df['water_ratio'] = fe_df['water'] / fe_df['total']

# Square root of the age column.
fe_df['age_root'] = np.sqrt(fe_df['age'])

# Binary flag: 1 when any superplasticizer was used, 0 otherwise.
fe_df['super_used'] = fe_df['superplasticizer'].gt(0).astype('int64')

fe_df.head()

#### MI scores of the feature-engineered and non-feature-engineered datasets

In [None]:
# Split both variants of the data into features and target.
fe_X, fe_y = fe_df.drop(['strength'], axis=1), fe_df['strength']
nofe_X, nofe_y = df.drop(['strength'], axis=1), df['strength']

# Score the plain data first, then the feature-engineered data.
nofe_scores = calc_mi_scores(nofe_X, nofe_y)
fe_scores = calc_mi_scores(fe_X, fe_y)

# Side-by-side table; features missing from one variant show up as NaN.
mi_scores = pd.DataFrame({'No FE': nofe_scores, 'FE': fe_scores}).sort_values(by='FE', ascending=False)
mi_scores

#### Model tester using cross-validation

In [None]:
def model_tester(df, models, cv=10):
    """Cross-validate each model on ``df`` and collect the scores.

    Every model is wrapped in a pipeline that standardises the features with
    ``StandardScaler`` before fitting, and is then scored with
    ``cross_val_score``. One summary line per model is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a 'strength' column (the target); all remaining
        columns are used as features.
    models : sequence of estimators
        Regressors to evaluate, in order.
    cv : int or cross-validation splitter, default 10
        Forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    dict
        Keys "ModelName", "MeanCVScore" and "StdCVScore", each mapping to a
        list with one entry per model, in the order the models were given.
    """
    results = {"ModelName": [], "MeanCVScore": [], "StdCVScore": []}

    # The feature/target split is identical for every model, so do it once.
    X = df.drop('strength', axis=1)
    y = df['strength']

    for model in models:
        start = timer()

        pipeline = Pipeline([('scaler', StandardScaler()), ('regressor', model)])
        cv_results = cross_val_score(pipeline, X, y, cv=cv)

        end = timer()

        model_name = model.__class__.__name__
        mean_score = cv_results.mean()
        std_score = cv_results.std()

        print(f'{model_name} CV score: {mean_score:.4f} +/- {std_score:.4f} Time: {(end - start):.2f}s')

        results["ModelName"].append(model_name)
        results["MeanCVScore"].append(mean_score)
        results["StdCVScore"].append(std_score)

    return results

#### No feature engineering Linear Regression

In [None]:
# Baseline: scaled linear regression on the data without engineered features.
X, y = df.drop(columns='strength'), df['strength']

scaler = StandardScaler()
model = LinearRegression()
steps = [('scaler', scaler), ('regressor', model)]
pipeline = Pipeline(steps=steps)

# 5-fold cross-validation; show the score of each fold, then their average.
cv_results = cross_val_score(estimator=pipeline, X=X, y=y, cv=5)
print(cv_results)
cv_results.mean()

#### Testing different models on feature engineered data

In [None]:
# Single seed shared by every randomised step in this cell.
constant_state = 42

# Candidate regressors, evaluated in this order.
models = [LinearRegression(n_jobs=-1),
          Lasso(max_iter=1000, alpha=0.1, random_state=constant_state),
          Ridge(alpha=0.05, random_state=constant_state),
          XGBRegressor(n_estimators=500, learning_rate=0.1, n_jobs=-1, random_state=constant_state),
          RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=constant_state),
          ]

# Shuffle the rows before cross-validation. The shuffle is seeded as well;
# without it the folds, and therefore every reported score, would change on
# each run even though the models themselves are seeded.
shuffled_fe_df = fe_df.sample(frac=1, random_state=constant_state)
results = model_tester(shuffled_fe_df, models)

results_df = pd.DataFrame(results)

# Bar chart of the mean cross-validation score per model.
plt.figure(figsize=(12, 8))
plt.title('Average CV score of different models')
ax = sns.barplot(x='ModelName', y='MeanCVScore', data=results_df)
ax.set(xlabel='Model', ylabel='Mean CV Score', ylim=(0, 1))
plt.show()

#### Exporting feature engineered data

In [None]:
# Write the feature-engineered frame to disk in two formats.
# NOTE(review): writing the legacy .xls format depends on the xlwt engine;
# newer pandas releases appear to have dropped it — confirm against the
# installed pandas version before relying on this export.
fe_df.to_excel(excel_writer='data/FE_Concrete_Data.xls', sheet_name='FE_Concrete_Data')
fe_df.to_csv(path_or_buf='data/FE_Concrete_Data.csv')

#### Creating metadata for our data

In [None]:
def GenerateXML(filename="export", path="./Metadata/", metadata=None):
    """Write ``metadata`` as a pretty-printed XML file.

    The document has a single ``<doc>`` root with one child element per
    metadata entry; the key is used as the tag name and ``str(value)`` as the
    element text.

    Parameters
    ----------
    filename : str, default "export"
        File name without extension; ".xml" is appended.
    path : str, default "./Metadata/"
        Target directory. It is created if it does not exist yet.
    metadata : dict or None, default None
        Mapping of tag name to value. When omitted, a small example record
        with today's date is written. The default is built at call time, so
        the date is not frozen at the moment the function was defined.

    Returns
    -------
    None
    """
    if metadata is None:
        metadata = {"id": 1, "name": "test", "date": datetime.today().strftime('%d-%m-%Y')}

    root = ET.Element("doc")
    for key, value in metadata.items():
        ET.SubElement(root, key).text = str(value)

    # Round-trip through minidom only to get indented, human-readable output.
    xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent="   ")

    # open() does not create missing directories; an empty path means the
    # current directory, which needs no creation.
    if path:
        os.makedirs(path, exist_ok=True)

    with open(os.path.join(path, filename + ".xml"), "wb") as f:
        f.write(xmlstr.encode('utf-8'))

    # Report success only after the file has actually been written.
    print(f"XML generated at path: {path} as: {filename}.xml")

In [101]:
# Date stamp for the record, formatted as day-month-year.
publication_date = datetime.today().strftime('%d-%m-%Y')

# Descriptive metadata for the exported dataset; each entry becomes one XML element.
my_metadata = {
    "id": 42,
    "name": "Ryland Grace",
    "institute": "HU Berlin",
    "date_published": publication_date,
    "PID": "123-456-789",
}

# Uses the default file name and target directory of GenerateXML.
GenerateXML(metadata=my_metadata)

XML generated at path: ./Metadata/ as: export.xml
