In [2]:
# Data manipulation and handling libraries
import numpy as np
import pandas as pd

# Data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as sci

# Data preprocessing and EDL Libraries
from collections import OrderedDict
from sklearn.preprocessing import StandardScaler

# Loading the Dataframe
df=pd.read_excel('Capstone Project.xlsx')

In [6]:
df.head()

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
0,141.3,212.0,0.0,203.5,0.0,971.8,748.5,28,29.89
1,168.9,42.2,124.3,158.3,10.8,1080.8,796.2,14,23.51
2,250.0,0.0,95.7,187.4,5.5,956.9,861.2,28,29.22
3,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28,45.85
4,154.8,183.4,0.0,193.3,9.1,1047.4,696.7,28,18.29


#Exploratory Data Analysis (EDA)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cement        1030 non-null   float64
 1   slag          1030 non-null   float64
 2   ash           1030 non-null   float64
 3   water         1030 non-null   float64
 4   superplastic  1030 non-null   float64
 5   coarseagg     1030 non-null   float64
 6   fineagg       1030 non-null   float64
 7   age           1030 non-null   int64  
 8   strength      1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


Observations from df.info()
1) There are no null values
2) We have 1030 rows and 9 columns
3) Dependent variable = Strength
4) Rest of all columns are independent variable

In [9]:
df.describe()

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


Analysis from descriptive statistics
1) There might be skewness in the data in all columns
2) There might be chances of outliers if we compare quartile of some columns
3) Since minimum and q1 values are same for slag and ash, so we do not have outliers in the lower whisker region
4) The range of the value 'age' is from 1 to 365. We can say that the values are in 'days'


In [36]:
def custom_summary(my_df):
    result = []
    for col in my_df.columns:
        if my_df[col].dtypes != 'object':
            stats = OrderedDict({
                'feature_name':col,
                'count':my_df[col].count(),
                'quartile1':my_df[col].quantile(.25),
                'quartile2':my_df[col].quantile(.50),
                'quartile3':my_df[col].quantile(.75),
                'mean':my_df[col].mean(),
                'max':my_df[col].max(),
                'variance':round(my_df[col].var()),
                'standard_deviation':my_df[col].std(),
                'skewness':my_df[col].skew(),
                'kurtosis':my_df[col].kurt() 
            })

        result.append(stats)
    result_df=pd.DataFrame(result)
    
    #skewness type
    skewness_label =[]
    for i in result_df['skewness']:
        if i <= -1:
            skewness_label.append('Highly negatively skewed')
        elif -1 < i <= -0.5:
            skewness_label.append('Moderately negatively skewed')
        elif -0.5 < i < 0:
            skewness_label.append('Fairly negatively skewed')
        elif 0 <= i <= 1:
            skewness_label.append('Fairly Positively skewed')
        elif 0.5 <= i < 1:
            skewness_label.append('Moderately Positively skewed')
        elif i >= 1:
            skewness_label.append('Highly Positively skewed')
    result_df['skewness_comment'] = skewness_label
    
    #kurtosis type:
    kurtosis_label = []
    for i in result_df['kurtosis']:
        if i >= 1:
            kurtosis_label.append('Leptokurtic Curve')
        elif i <= 1:
            kurtosis_label.append('Platykurtic Curve')
        else:
            kurtosis_label.append('Mesokurtic Curve')
    
    result_df['kurtosis_comment'] = kurtosis_label
    
    #outliers
    outliers_label = []
    for col in my_df.columns:
        if my_df[col].dtypes != 'object':
            q1 = my_df[col].quantile(.25)
            q2 = my_df[col].quantile(.50)
            q3 = my_df[col].quantile(.75)
            iqr = q3 - q1
            lower_whisker = q1-0.25*iqr
            upper_whisker = q3+0.25*iqr
            if len(my_df[(my_df[col] < lower_whisker) | (my_df[col] > upper_whisker) ]) > 0:
                outliers_label.append('Have outliers')
            else:
                outliers_label.append('No outliers')
    result_df['Outlier Comment'] = outliers_label
    return result_df

In [37]:
custom_summary(df)

Unnamed: 0,feature_name,count,quartile1,quartile2,quartile3,mean,max,variance,standard_deviation,skewness,kurtosis,skewness_comment,kurtosis_comment,Outlier Comment
0,cement,1030,192.375,272.9,350.0,281.167864,540.0,10922,104.506364,0.509481,-0.520652,Fairly Positively skewed,Platykurtic Curve,Have outliers
1,slag,1030,0.0,22.0,142.95,73.895825,359.4,7444,86.279342,0.800717,-0.508175,Fairly Positively skewed,Platykurtic Curve,Have outliers
2,ash,1030,0.0,0.0,118.3,54.18835,200.1,4096,63.997004,0.537354,-1.328746,Fairly Positively skewed,Platykurtic Curve,Have outliers
3,water,1030,164.9,185.0,192.0,181.567282,247.0,456,21.354219,0.074628,0.122082,Fairly Positively skewed,Platykurtic Curve,Have outliers
4,superplastic,1030,0.0,6.4,10.2,6.20466,32.2,36,5.973841,0.907203,1.411269,Fairly Positively skewed,Leptokurtic Curve,Have outliers
5,coarseagg,1030,932.0,968.0,1029.4,972.918932,1145.0,6046,77.753954,-0.04022,-0.599016,Fairly negatively skewed,Platykurtic Curve,Have outliers
6,fineagg,1030,730.95,779.5,824.0,773.580485,992.6,6428,80.17598,-0.25301,-0.102177,Fairly negatively skewed,Platykurtic Curve,Have outliers
7,age,1030,7.0,28.0,56.0,45.662136,365.0,3990,63.169912,3.269177,12.168989,Highly Positively skewed,Leptokurtic Curve,Have outliers
8,strength,1030,23.71,34.445,46.135,35.817961,82.6,279,16.705742,0.416977,-0.313725,Fairly Positively skewed,Platykurtic Curve,Have outliers
