In [3]:
import pandas as pd
import numpy as np
from scipy import stats

# --- Step 1: Create a sample pavement dataset ---
# Eight hand-entered pavement sections. Section, Surface and Traffic are
# categorical labels; Age, IRI, Crack, PCI and Rut are numeric measurements.
data = {
    'Section': ['S1','S2','S3','S4','S5','S6','S7','S8'],
    'Surface': ['Asphalt','Asphalt','Concrete','Asphalt','Concrete','Asphalt','Asphalt','Concrete'],
    'Traffic': ['High','Medium','Low','High','Medium','Low','Medium','High'],
    'Age': [12,8,5,15,10,7,9,20],
    'IRI': [4.2,3.1,1.8,4.8,2.5,3.0,3.3,4.5],
    'Crack': [35,25,5,40,10,18,20,32],
    'PCI': [45,60,88,40,80,70,68,50],
    'Rut': [14,10,5,16,6,8,9,12]
}

df = pd.DataFrame(data)
print("=== Pavement Dataset ===")
print(df, "\n")
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()
print(df.dtypes)
print(df.describe())

# --- Step 2: Basic descriptive statistics ---
# Summary table restricted to the five numeric measurement columns.
numeric_columns = ['Age', 'IRI', 'Crack', 'PCI', 'Rut']
numeric_summary = df[numeric_columns].describe()
print("=== Descriptive Statistics ===")
print(numeric_summary, "\n")

# --- Step 3: Mean, Median, Mode ---
# Central tendency of roughness (IRI) plus the most frequent surface type.
iri_values = df['IRI']
mean_iri, median_iri = iri_values.mean(), iri_values.median()
# mode() returns a Series of the most frequent value(s); take the first one.
mode_surface = df['Surface'].mode().iloc[0]
print(
    f"Mean IRI: {mean_iri:.2f}",
    f"Median IRI: {median_iri:.2f}",
    f"Mode Surface Type: {mode_surface}",
    "Inference: The network has moderate roughness (mean IRI ~3.4), mostly Asphalt pavements.\n",
    sep="\n",
)

# --- Step 4: Standard Deviation and Variance ---
# Spread of the PCI column, using pandas' default std()/var() settings.
pci_values = df['PCI']
std_pci, var_pci = pci_values.std(), pci_values.var()
print(
    f"Standard Deviation of PCI: {std_pci:.2f}",
    f"Variance of PCI: {var_pci:.2f}",
    "Inference: High standard deviation indicates uneven performance; some pavements are failing while others are good.\n",
    sep="\n",
)

# --- Step 5: Quartiles (Q1, Q3, IQR) ---
# Series.quantile() interpolates linearly between neighbouring order
# statistics by default, so the quartiles need not be observed PCI values.
Q1 = df['PCI'].quantile(0.25)
Q3 = df['PCI'].quantile(0.75)
IQR = Q3 - Q1
print(f"Q1 (25th percentile): {Q1:.2f}")
print(f"Q3 (75th percentile): {Q3:.2f}")
print(f"IQR (Interquartile Range): {IQR:.2f}")
# Quote the computed Q1 instead of a hand-typed figure so the inference
# cannot drift away from the value printed just above.
print(f"Inference: 25% of pavements below PCI {Q1:.2f} need urgent rehabilitation.\n")

# --- Step 6: Skewness and Kurtosis ---
# scipy defaults are used for both statistics: the biased sample skewness and
# Fisher (excess) kurtosis, for which a normal distribution scores 0.
skew_pci = stats.skew(df['PCI'])
kurt_pci = stats.kurtosis(df['PCI'])
print(f"Skewness of PCI: {skew_pci:.2f}")
print(f"Kurtosis of PCI: {kurt_pci:.2f}")

# Rule-of-thumb band: skewness this close to zero is reported as symmetric
# rather than being forced into a left/right verdict it does not support.
SKEW_TOLERANCE = 0.5
if skew_pci < -SKEW_TOLERANCE:
    print("Inference: Left-skewed → majority of pavements are in good condition, few are very poor.")
elif skew_pci > SKEW_TOLERANCE:
    print("Inference: Right-skewed → most pavements are poor, few are excellent.")
else:
    print("Inference: Approximately symmetric → PCI values are spread fairly evenly on both sides of the mean.")

# The sign of excess kurtosis decides the wording: positive means more peaked
# than a normal distribution, negative means flatter.
if kurt_pci > 0:
    kurt_shape = "peaked distribution, indicating most PCI values cluster near the mean"
elif kurt_pci < 0:
    kurt_shape = "flat distribution, indicating PCI values are spread out rather than clustered near the mean"
else:
    kurt_shape = "peakedness comparable to a normal distribution"
print(f"Kurtosis ({kurt_pci:.2f}) → {kurt_shape}.\n")

# --- Step 7: Covariance and Correlation ---
# Sample covariance and Pearson correlation (the pandas defaults) between
# pavement age and PCI, plus the full correlation matrix of numeric columns.
cov_age_pci = df['Age'].cov(df['PCI'])
corr_age_pci = df['Age'].corr(df['PCI'])
corr_matrix = df[['Age','IRI','Crack','PCI','Rut']].corr()

print(f"Covariance (Age vs PCI): {cov_age_pci:.2f}")
print(f"Correlation (Age vs PCI): {corr_age_pci:.2f}")
# The inference quotes the computed coefficient and follows its sign, so it
# can never contradict the figure printed on the previous line.
if corr_age_pci < 0:
    print(f"Inference: Negative correlation (r ≈ {corr_age_pci:.2f}) shows that as age increases, PCI tends to drop.\n")
elif corr_age_pci > 0:
    print(f"Inference: Positive correlation (r ≈ {corr_age_pci:.2f}) shows that PCI does not drop with age in this sample.\n")
else:
    print("Inference: No linear relationship between age and PCI is visible in this sample.\n")

print("=== Correlation Matrix ===")
print(corr_matrix, "\n")

# --- Step 8: Outlier detection using IQR (e.g., IRI) ---
# A section is flagged when its IRI lies strictly outside the fences placed
# 1.5 × IQR below Q1 and above Q3.
Q1_IRI = df['IRI'].quantile(0.25)
Q3_IRI = df['IRI'].quantile(0.75)
IQR_IRI = Q3_IRI - Q1_IRI
upper_threshold = Q3_IRI + 1.5 * IQR_IRI
lower_threshold = Q1_IRI - 1.5 * IQR_IRI
outliers = df[(df['IRI'] > upper_threshold) | (df['IRI'] < lower_threshold)]
print(f"IQR for IRI: {IQR_IRI:.2f}")
print("Outlier Sections (IRI):")
# The defect-related inference only makes sense when something was flagged;
# otherwise say explicitly that every section is within the fences.
if outliers.empty:
    print("No significant outliers detected.")
    print("Inference: All sections fall within the IQR fences; no section is singled out for investigation.\n")
else:
    print(outliers)
    print("Inference: Outlier sections may indicate drainage or construction defects.\n")

# --- Step 9: Mode and Frequency of Pavement Types ---
# Number of sections per surface type, most frequent first.
surface_counts = df['Surface'].value_counts()
print("Surface Type Frequency:", surface_counts, sep="\n")
print(
    "Inference: The dominant pavement type is Asphalt; "
    "resource planning can focus on asphalt maintenance materials.\n"
)

# --- Step 10: Applied summary of all inferences ---
# Every number and directional verdict below is taken from the variables
# computed in the earlier steps, so the summary always agrees with the
# statistics printed above. The qualitative wording of items 1 and 3 is the
# author's engineering judgement and is kept as written.

# Item 2: compare the median with the mean to describe the typical section.
if median_iri < mean_iri:
    median_note = "Typical pavement smoother than the average (some rough sections exist)."
elif median_iri > mean_iri:
    median_note = "Typical pavement rougher than the average (some very smooth sections exist)."
else:
    median_note = "Typical pavement matches the average roughness."

# Item 5: skewness within this rule-of-thumb band is reported as symmetric.
SUMMARY_SKEW_TOLERANCE = 0.5
if skew_pci < -SUMMARY_SKEW_TOLERANCE:
    skew_note = "Most pavements are in good condition; few in poor."
elif skew_pci > SUMMARY_SKEW_TOLERANCE:
    skew_note = "Most pavements are in poor condition; few in good."
else:
    skew_note = "PCI is roughly symmetric about its mean; neither good nor poor sections dominate."

# Item 6: the verdict follows the sign of the computed correlation.
if corr_age_pci < 0:
    corr_note = "PCI decreases with age → supports deterioration trend."
else:
    corr_note = "PCI does not decrease with age in this sample → deterioration trend not supported."

# Item 7: report what the IQR fences actually flagged.
if outliers.empty:
    outlier_note = (
        "Outlier detection found no IRI outside "
        f"{lower_threshold:.2f}–{upper_threshold:.2f} → no outlier-driven field inspection required."
    )
else:
    outlier_note = (
        f"Outlier detection shows {len(outliers)} section(s) with IRI outside "
        f"{lower_threshold:.2f}–{upper_threshold:.2f} → field inspection required."
    )

summary_lines = [
    f"1. Mean IRI = {mean_iri:.1f} → Overall ride quality is fair; moderate discomfort expected.",
    f"2. Median IRI = {median_iri:.1f} → {median_note}",
    "3. SD(PCI) = High → Network performance is inconsistent; prioritize critical sections.",
    f"4. Q1(PCI)={Q1:.2f} → 25% of roads below this value need rehabilitation.",
    f"5. Skew(PCI)={skew_pci:.2f} → {skew_note}",
    f"6. Corr(Age, PCI)={corr_age_pci:.2f} → {corr_note}",
    f"7. {outlier_note}",
    f"8. Mode(Surface)={mode_surface} → standardization of {mode_surface.lower()} maintenance procedures recommended.",
]

print("=== Summary of Applied Inferences ===")
# Leading and trailing newlines reproduce the blank lines that framed the
# original triple-quoted summary.
print("\n" + "\n".join(summary_lines) + "\n")


=== Pavement Dataset ===
  Section   Surface Traffic  Age  IRI  Crack  PCI  Rut
0      S1   Asphalt    High   12  4.2     35   45   14
1      S2   Asphalt  Medium    8  3.1     25   60   10
2      S3  Concrete     Low    5  1.8      5   88    5
3      S4   Asphalt    High   15  4.8     40   40   16
4      S5  Concrete  Medium   10  2.5     10   80    6
5      S6   Asphalt     Low    7  3.0     18   70    8
6      S7   Asphalt  Medium    9  3.3     20   68    9
7      S8  Concrete    High   20  4.5     32   50   12 

(8, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Section  8 non-null      object 
 1   Surface  8 non-null      object 
 2   Traffic  8 non-null      object 
 3   Age      8 non-null      int64  
 4   IRI      8 non-null      float64
 5   Crack    8 non-null      int64  
 6   PCI      8 non-null      int64  
 7   Rut      8 non-null     