### Loading the required packages

In [1]:
import warnings
import sys
import os
import csv
import time
import pandas as pd
import numpy as np
from numpy.random import randn

from sklearn import preprocessing
from scipy import stats
from scipy.stats import anderson
from scipy.stats import normaltest
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.graphics.factorplots import interaction_plot
from statsmodels.graphics.gofplots import qqplot

import seaborn as sns
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import matplotlib.lines as mlines

import plotly.express as px
warnings.filterwarnings('ignore')

### Preview of the dataset

In [2]:
# Read the dataset from the CSV file and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='nebraska_deep.csv')
df.head(n=5)

Unnamed: 0,year,structureNumber,latitude,longitude,toll,owner,yearBuilt,averageDailyTraffic,designLoad,skew,...,baseDifferenceScore,precipitation,snowfall,freezethaw,deckDeteriorationScore,subDeteriorationScore,supDeteriorationScore,deckNumberIntervention,subNumberIntervention,supNumberIntervention
0,1992,C000100305,0,0,3,2,1935,30,0,0,...,0.230199,2.02,,,-0.25,-0.25,0.0,0.0,0.0,0.0
1,1992,C000100305P,0,0,3,2,1935,20,0,0,...,,,,,0.0,0.0,0.0,0.0,0.0,0.0
2,1992,C000100405,0,0,3,2,1925,65,0,40,...,,,,,0.0,0.0,0.0,0.0,0.0,0.0
3,1992,C000100505P,0,0,3,2,1974,60,4,30,...,,,,,0.0,-0.333333,0.0,1.0,0.0,0.0
4,1992,C000100905,0,0,3,2,1962,170,2,0,...,,,,,0.0,-1.125,0.0,1.0,1.0,1.0


### Understanding the categories of scourCriticalBridges (SCB)

In [3]:
# Keep only bridges with a numeric scour rating. pd.to_numeric(errors='coerce')
# turns every non-numeric code (such as 'N' or 'U') into NaN, so a single dropna
# removes those rows together with ratings that were missing in the file.
df['scourCriticalBridges'] = pd.to_numeric(df['scourCriticalBridges'], errors='coerce')
df = df.dropna(subset=['scourCriticalBridges'])
df = df.sort_values(by='scourCriticalBridges')
df['scourCriticalBridges'] = df['scourCriticalBridges'].astype('int')

# Bin the rating into three categories: poor = 0-4, average = 5-7, good = 8-9.
# include_lowest=True closes the first interval on the left; without it a
# rating of 0 falls outside (0, 4] and receives a NaN category, which makes the
# bridge vanish from every table grouped by SCB_category.
df['SCB_category'] = pd.cut(df['scourCriticalBridges'],
                            [0, 4, 7, 9],
                            labels=['poor', 'average', 'good'],
                            include_lowest=True)

# Keep one row per bridge. The frame is sorted by rating at this point, so
# keep='last' retains a row carrying the bridge's highest rating.
# NOTE(review): if the most recent inspection is wanted instead, sort by 'year'
# before dropping duplicates - confirm the intent.
df = df.drop_duplicates(subset=['structureNumber'], keep='last')

# Return total count of bridges wrt SCB categories
df.groupby(['SCB_category'])['SCB_category'].count().reset_index(name='counts')

Unnamed: 0,SCB_category,counts
0,poor,8
1,average,6223
2,good,9840


### Number of bridges and mean scour rating for each ScourCriticalBridges category

In [11]:
# Mean scour rating per SCB category, derived from the per-category bridge
# count and the per-category sum of ratings.
by_category = df.groupby(['SCB_category'])
SCB_count = by_category['SCB_category'].count().reset_index(name='counts')
SCB_sum = by_category['scourCriticalBridges'].sum().reset_index(name='sums')
merged_inner = SCB_count.merge(SCB_sum, on='SCB_category')
merged_inner['avg'] = merged_inner['sums'].div(merged_inner['counts'])
print(merged_inner)

# Bar chart of the mean rating, each bar labelled with its value
fig = px.bar(
    merged_inner,
    x='SCB_category',
    y='avg',
    title="Means of SCB Category",
    text=merged_inner['avg'],
)
fig.show()

  SCB_category  counts   sums       avg
0         poor       8     28  3.500000
1      average    6223  37330  5.998714
2         good    9840  78738  8.001829


### Understanding the relationship between scourCriticalBridges (SCB) and intervention

In [7]:
# Count bridges for every (SCB category, number of substructure interventions) pair
df_grouped = (
    df.groupby(['SCB_category', 'subNumberIntervention'])['SCB_category']
    .count()
    .reset_index(name='count')
)

# Distinct intervention counts present in the grouped table
unique_interventions = df_grouped['subNumberIntervention'].unique()

# Per-category lists of counts, filled in the row order of df_grouped
poor, average, good = [], [], []
targets = {'poor': poor, 'average': average}
for cat, total in zip(df_grouped['SCB_category'], df_grouped['count'].to_numpy()):
    # Anything that is neither 'poor' nor 'average' lands in `good`
    targets.get(cat, good).append(total)

### Percentage of bridges requiring substructure repair based on the ScourCriticalBridges category

In [12]:
# Replace the numeric intervention counts with words for readable axis labels
df_grouped['subNumberIntervention'] = df_grouped['subNumberIntervention'].replace(
    {0.0: 'Zero', 1.0: 'One', 2.0: 'Two', 3.0: 'Three'})

# Share (in %) of each category's bridges that fall in each intervention count
category_totals = df_grouped.groupby('SCB_category')['count'].transform('sum')
df_grouped['Percentage'] = (100 * df_grouped['count'] / category_totals).round(2)

# Printed only after 'Percentage' exists, so a single run of the cell shows the
# complete table
print(df_grouped)

# Grouped bar chart: one bar per SCB category within each intervention count
fig = px.bar(df_grouped, x='subNumberIntervention', y='Percentage',
             color='SCB_category', barmode="group",
             title="Comparing SCB Category with Substructure Interventions",
             text=df_grouped['Percentage'])
fig.show()

   SCB_category subNumberIntervention  count  Percentage
0          poor                  Zero      5       71.43
1          poor                   One      0        0.00
2          poor                   Two      1       14.29
3          poor                 Three      1       14.29
4       average                  Zero   4072       77.98
5       average                   One    985       18.86
6       average                   Two    159        3.04
7       average                 Three      6        0.11
8          good                  Zero   8958       94.94
9          good                   One    459        4.86
10         good                   Two     18        0.19
11         good                 Three      0        0.00


Bridges with good scour ratings mostly required no substructure intervention (94.94%), whereas bridges with average ratings more often required one (18.86%) or two (3.04%) substructure interventions.

### Kernel Density Distribution using Seaborn

In [None]:
# Scour ratings of the bridges with three substructure interventions, selected
# directly from df so that this cell also runs on a fresh kernel.
df.loc[df['subNumberIntervention'] == 3.0, 'scourCriticalBridges']

In [None]:
# Stacked kernel-density estimate of the scour rating, one layer per
# substructure-intervention count
res = sns.displot(
    data=df,
    x='scourCriticalBridges',
    hue='subNumberIntervention',
    kind="kde",
    multiple="stack",
)
plt.title("Kernel Density Plot")

### Normal distribution using Plotly

In [None]:
import plotly.figure_factory as ff

# Scour ratings split by number of substructure interventions. The dict order
# fixes the order of both the data series and their legend labels.
intervention_labels = {0.0: 'Zero', 1.0: 'One', 2.0: 'Two', 3.0: 'Three'}

hist_data = [
    df.loc[df['subNumberIntervention'] == n_interventions, 'scourCriticalBridges']
    for n_interventions in intervention_labels
]
group_labels = list(intervention_labels.values())

# Keep the individual series available under their original names
x1, x2, x3, x4 = hist_data

# Density curves only; the histogram bars are hidden
fig = ff.create_distplot(hist_data, group_labels, show_hist=False, bin_size=1)
fig.show()

In [None]:
import plotly.figure_factory as ff

# Scour ratings grouped by the SCB_category column, so the curves use the same
# poor / average / good bins as the tables and bar charts in this notebook.
# The ranges were previously hard-coded here as 0-3, 4-6 and 7-9, which does
# not match the [0, 4, 7, 9] bin edges that define SCB_category.
category_labels = {'poor': 'Poor', 'average': 'Average', 'good': 'Good'}

hist_data = [
    df.loc[df['SCB_category'] == category, 'scourCriticalBridges']
    for category in category_labels
]
group_labels = list(category_labels.values())

# Keep the individual series available under their original names
x1, x2, x3 = hist_data

# Fitted normal curves only; the histogram bars are hidden
fig = ff.create_distplot(hist_data, group_labels, show_hist=False,
                         curve_type='normal', bin_size=0.1)
fig.show()