In [1]:
import pandas as pd
import plotly.express as px
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express 
import matplotlib.pyplot as plt
import csv
import os

#number of decimals to keep
NUM_DECIMAL = 2

#header for year
HEADER_YEAR = ['year', 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']

#header for state
HEADER_STATE = ['state', 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']


def _stats_row(label, scores):
    """Return one CSV row: `label` followed by the rounded `describe()` values.

    The values come in the order produced by `Series.describe()` for numeric
    data (count, mean, std, min, 25%, 50%, 75%, max), which is the column
    order declared in HEADER_YEAR / HEADER_STATE.
    """
    return [label, *scores.describe().round(NUM_DECIMAL).to_list()]


def _save_trend_plot(scores_by_year, title, out_path):
    """Save a line plot of the yearly mean and median inspection score.

    Parameters
    ----------
    scores_by_year : dict
        Maps each year to the Series of inspection scores for that year.
        Iteration order is the left-to-right order on the x axis.
    title : str
        Title of the figure.
    out_path : str
        Image path handed to `Figure.savefig`.
    """
    # years are plotted as string labels, i.e. as evenly spaced categories
    x_labels = [f'{yr}' for yr in scores_by_year]

    # Each plot gets its own explicit Figure that is closed after saving, so
    # plots can never be drawn on top of one another and figures do not
    # accumulate in memory.
    fig, ax = plt.subplots()
    ax.plot(x_labels, [s.mean() for s in scores_by_year.values()], label="mean")
    ax.plot(x_labels, [s.median() for s in scores_by_year.values()], label="median")
    ax.set_xlabel('time in years')
    ax.set_ylabel('inspection score')
    ax.set_title(title)
    ax.legend()
    fig.savefig(out_path)
    plt.close(fig)


if __name__ == '__main__':
    # Builds the per-year and per-state summary CSV tables plus the histogram
    # and trend figures of the inspection scores.

    # NOTE(review): all paths are relative to the working directory, while
    # other cells of this notebook read the CSV tables from '../figures/...'.
    # Confirm which directory this cell is meant to run from.
    data_path = 'data/locations_inspectionscores_forMeri_Nov.csv'
    stats_dir = 'figures'
    hist_dir = 'figures/dist_by_year/histagram'
    line_dir = 'figures/dist_by_year/line_plot'

    #read the dataset
    df = pd.read_csv(data_path)

    #make sure every output directory exists before anything is written
    for out_dir in (stats_dir, hist_dir, line_dir):
        os.makedirs(out_dir, exist_ok=True)

    #inspection score data
    inspection_score = df['INSPECTION_SCORE']

    #get all available years and get them sorted
    all_years = sorted(set(df['inspection_year'].astype(np.int32)))

    #FIXME: remove "2005" outlier value
    #(guarded so a dataset without any 2005 rows does not raise ValueError)
    if 2005 in all_years:
        all_years.remove(2005)

    #get all available states and get them sorted
    all_states = sorted(set(df['STATE_NAME.x']))

    #filter once per year / per state and reuse the result everywhere below
    scores_by_year = {
        yr: df.loc[df['inspection_year'] == yr, 'INSPECTION_SCORE']
        for yr in all_years
    }
    frames_by_state = {
        st: df.loc[df['STATE_NAME.x'] == st]
        for st in all_states
    }

    #statistic CSV table -- by year (overall row first, then one row per year)
    #newline='' is what the csv module requires; without it Windows gets blank rows
    with open(f'{stats_dir}/stats_inspection_score_year.csv', 'w', newline='') as stat_file:
        stat_writer = csv.writer(stat_file)
        stat_writer.writerow(HEADER_YEAR)
        stat_writer.writerow(_stats_row('overall', inspection_score))
        for yr, scores in scores_by_year.items():
            stat_writer.writerow(_stats_row(yr, scores))

    #statistic CSV table -- by state
    with open(f'{stats_dir}/stats_inspection_score_state.csv', 'w', newline='') as stat_file:
        stat_writer = csv.writer(stat_file)
        stat_writer.writerow(HEADER_STATE)
        for st, state_df in frames_by_state.items():
            stat_writer.writerow(_stats_row(st, state_df['INSPECTION_SCORE']))

    #histograms of inspection scores with each individual year
    for yr, scores in scores_by_year.items():
        fig, ax = plt.subplots()
        sns.histplot(scores, ax=ax)
        ax.set_title(f'inspection score in {yr}')
        fig.savefig(f'{hist_dir}/hist_{yr}.png')
        plt.close(fig)

    #inspection scores changed over years -- mean, with median as reference
    _save_trend_plot(
        scores_by_year,
        'inspection score over time in U.S.',
        f'{line_dir}/US.png',
    )

    #inspection scores changed over years -- for each state
    for st, state_df in frames_by_state.items():
        #only plot the years for which this state actually has data
        # NOTE(review): unlike the national table and plot, 2005 is NOT
        # excluded here -- confirm whether the outlier year should be dropped
        # for the per-state plots as well.
        all_years_state = sorted(set(state_df['inspection_year'].astype(np.int32)))

        state_scores_by_year = {
            yr: state_df.loc[state_df['inspection_year'] == yr, 'INSPECTION_SCORE']
            for yr in all_years_state
        }

        _save_trend_plot(
            state_scores_by_year,
            f'inspection score over time in {st}',
            f'{line_dir}/{st}.png',
        )

FileNotFoundError: [Errno 2] No such file or directory: 'data/locations_inspectionscores_forMeri_Nov.csv'

In [None]:
# Lookup table from two-letter postal abbreviation to full name, covering the
# 50 U.S. states plus the District of Columbia (51 entries).
state_mapping = dict([
    ('AK', 'Alaska'),
    ('AL', 'Alabama'),
    ('AR', 'Arkansas'),
    ('AZ', 'Arizona'),
    ('CA', 'California'),
    ('CO', 'Colorado'),
    ('CT', 'Connecticut'),
    ('DC', 'District of Columbia'),
    ('DE', 'Delaware'),
    ('FL', 'Florida'),
    ('GA', 'Georgia'),
    ('HI', 'Hawaii'),
    ('IA', 'Iowa'),
    ('ID', 'Idaho'),
    ('IL', 'Illinois'),
    ('IN', 'Indiana'),
    ('KS', 'Kansas'),
    ('KY', 'Kentucky'),
    ('LA', 'Louisiana'),
    ('MA', 'Massachusetts'),
    ('MD', 'Maryland'),
    ('ME', 'Maine'),
    ('MI', 'Michigan'),
    ('MN', 'Minnesota'),
    ('MO', 'Missouri'),
    ('MS', 'Mississippi'),
    ('MT', 'Montana'),
    ('NC', 'North Carolina'),
    ('ND', 'North Dakota'),
    ('NE', 'Nebraska'),
    ('NH', 'New Hampshire'),
    ('NJ', 'New Jersey'),
    ('NM', 'New Mexico'),
    ('NV', 'Nevada'),
    ('NY', 'New York'),
    ('OH', 'Ohio'),
    ('OK', 'Oklahoma'),
    ('OR', 'Oregon'),
    ('PA', 'Pennsylvania'),
    ('RI', 'Rhode Island'),
    ('SC', 'South Carolina'),
    ('SD', 'South Dakota'),
    ('TN', 'Tennessee'),
    ('TX', 'Texas'),
    ('UT', 'Utah'),
    ('VT', 'Vermont'),
    ('VA', 'Virginia'),
    ('WA', 'Washington'),
    ('WI', 'Wisconsin'),
    ('WV', 'West Virginia'),
    ('WY', 'Wyoming'),
])

## Sanity check: map the raw facility locations by state
## to make sure that no states are missing from the dataset

In [None]:
# Load the per-state and per-year summary tables (one row of describe()
# statistics per state / per year) that the plotting cells below rely on.

# Per-state table, plus a `state_state` column holding the full state name
# looked up from the two-letter code in `state`.
state = pd.read_csv('../figures/stats_inspection_score_state.csv')
state = state.assign(state_state=state['state'].map(state_mapping))

# Per-year table. The file also carries an 'overall' summary row; drop it by
# label instead of by position so a reordered file cannot lose a real year.
year = pd.read_csv('../figures/stats_inspection_score_year.csv')
year = year.loc[year['year'].astype(str) != 'overall']
# The 'overall' label makes pandas read the column as text; convert the
# remaining values to numbers so plots get a true numeric year axis.
year = year.assign(year=pd.to_numeric(year['year']))

state

Unnamed: 0,state,count,mean,std,min,25%,50%,75%,max,state_state
0,AK,59.0,82.75,12.43,58.0,73.5,86.0,93.0,99.0,Alaska
1,AL,1328.0,80.83,14.61,27.0,72.0,84.0,93.0,100.0,Alabama
2,AR,530.0,82.15,13.4,30.0,76.0,86.0,92.0,100.0,Arkansas
3,AZ,213.0,83.32,12.57,42.0,75.0,86.0,94.0,100.0,Arizona
4,CA,1064.0,80.51,14.39,25.0,71.0,85.0,92.0,100.0,California
5,CO,290.0,84.84,10.54,39.0,78.0,86.0,93.0,100.0,Colorado
6,CT,611.0,78.75,15.61,18.0,69.0,82.0,92.0,100.0,Connecticut
7,DC,313.0,67.94,18.26,22.0,56.0,68.0,83.0,100.0,District of Columbia
8,DE,123.0,81.02,15.26,42.0,71.0,86.0,93.0,100.0,Delaware
9,FL,915.0,82.36,13.64,33.0,75.0,86.0,93.0,100.0,Florida


In [None]:
# U.S. choropleth coloured by the `mean` column of the per-state table;
# hovering a state shows its full name from `state_state`.
# NOTE(review): the original note here read "color blind friendly color
# palette" -- confirm that the 'blues' scale meets that requirement.
fig = (
    px.choropleth(
        state,
        title='Mean Inspection Score by State',
        locations='state',
        locationmode='USA-states',
        color='mean',
        color_continuous_scale='blues',
        hover_name='state_state',
    )
    .update_coloraxes(colorbar_title='Mean')
    .update_geos(scope='usa')
)
fig.show()

In [None]:
# Line chart (with point markers) of the '50%' column -- the median
# inspection score -- against the year, from the per-year table.
fig = px.line(year, x='year', y='50%', markers=True).update_layout(
    legend_title_text='Legend',
    xaxis_title='Year',
    yaxis_title='Median Inspection Score',
    title='Median Inspection Score  Over Years',
)

# Render the figure
fig.show()

In [None]:
# Bar chart of the `count` column of the per-state table.
# `count` is the number of inspection-score observations per state, not a
# number of displaced people, so the title is worded accordingly.
fig = px.bar(state, x='state', y='count', width=1200, height=800)
fig.update_layout(
    title='Number of Inspection Score Observations in Different States',
    yaxis_title='Count',
    xaxis_title='State'
)
fig.show()

In [None]:
## `count` is the number of inspection-score observations per state,
## not the number of displaced people -- the title below reflects that.

fig = px.choropleth(
    state,
    locations='state',
    color='count',  # number of observations recorded for the state
    hover_name='state_state',  # full state name, as in the mean-score map
    locationmode='USA-states',  # `locations` holds two-letter state codes
    color_continuous_scale='greens')
# A geo map has no cartesian x/y axes, so only the title is set here.
fig.update_layout(
    title='Number of Inspection Score Observations in Different States'
)

fig.update_coloraxes(colorbar_title='Count')
fig.update_geos(scope='usa')  # Set the map scope to USA

fig.show()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express 
import matplotlib.pyplot as plt
import csv
import os

#number of decimals to keep
NUM_DECIMAL = 2

#header for year
HEADER_YEAR = ['year', 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']

#header for state
HEADER_STATE = ['state', 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']


# NOTE(review): this cell repeats the analysis cell at the top of the
# notebook; consider keeping only one copy.
def _stats_row(label, scores):
    """Return one CSV row: `label` followed by the rounded `describe()` values.

    The values come in the order produced by `Series.describe()` for numeric
    data (count, mean, std, min, 25%, 50%, 75%, max), which is the column
    order declared in HEADER_YEAR / HEADER_STATE.
    """
    return [label, *scores.describe().round(NUM_DECIMAL).to_list()]


def _save_trend_plot(scores_by_year, title, out_path):
    """Save a line plot of the yearly mean and median inspection score.

    Parameters
    ----------
    scores_by_year : dict
        Maps each year to the Series of inspection scores for that year.
        Iteration order is the left-to-right order on the x axis.
    title : str
        Title of the figure.
    out_path : str
        Image path handed to `Figure.savefig`.
    """
    # years are plotted as string labels, i.e. as evenly spaced categories
    x_labels = [f'{yr}' for yr in scores_by_year]

    # Each plot gets its own explicit Figure that is closed after saving, so
    # plots can never be drawn on top of one another and figures do not
    # accumulate in memory.
    fig, ax = plt.subplots()
    ax.plot(x_labels, [s.mean() for s in scores_by_year.values()], label="mean")
    ax.plot(x_labels, [s.median() for s in scores_by_year.values()], label="median")
    ax.set_xlabel('time in years')
    ax.set_ylabel('inspection score')
    ax.set_title(title)
    ax.legend()
    fig.savefig(out_path)
    plt.close(fig)


if __name__ == '__main__':
    # Builds the per-year and per-state summary CSV tables plus the histogram
    # and trend figures of the inspection scores.

    # NOTE(review): all paths are relative to the working directory, while
    # other cells of this notebook read the CSV tables from '../figures/...'.
    # Confirm which directory this cell is meant to run from.
    data_path = 'data/locations_inspectionscores_forMeri_Nov.csv'
    stats_dir = 'figures'
    hist_dir = 'figures/dist_by_year/histagram'
    line_dir = 'figures/dist_by_year/line_plot'

    #read the dataset
    df = pd.read_csv(data_path)

    #make sure every output directory exists before anything is written
    for out_dir in (stats_dir, hist_dir, line_dir):
        os.makedirs(out_dir, exist_ok=True)

    #inspection score data
    inspection_score = df['INSPECTION_SCORE']

    #get all available years and get them sorted
    all_years = sorted(set(df['inspection_year'].astype(np.int32)))

    #FIXME: remove "2005" outlier value
    #(guarded so a dataset without any 2005 rows does not raise ValueError)
    if 2005 in all_years:
        all_years.remove(2005)

    #get all available states and get them sorted
    all_states = sorted(set(df['STATE_NAME.x']))

    #filter once per year / per state and reuse the result everywhere below
    scores_by_year = {
        yr: df.loc[df['inspection_year'] == yr, 'INSPECTION_SCORE']
        for yr in all_years
    }
    frames_by_state = {
        st: df.loc[df['STATE_NAME.x'] == st]
        for st in all_states
    }

    #statistic CSV table -- by year (overall row first, then one row per year)
    #newline='' is what the csv module requires; without it Windows gets blank rows
    with open(f'{stats_dir}/stats_inspection_score_year.csv', 'w', newline='') as stat_file:
        stat_writer = csv.writer(stat_file)
        stat_writer.writerow(HEADER_YEAR)
        stat_writer.writerow(_stats_row('overall', inspection_score))
        for yr, scores in scores_by_year.items():
            stat_writer.writerow(_stats_row(yr, scores))

    #statistic CSV table -- by state
    with open(f'{stats_dir}/stats_inspection_score_state.csv', 'w', newline='') as stat_file:
        stat_writer = csv.writer(stat_file)
        stat_writer.writerow(HEADER_STATE)
        for st, state_df in frames_by_state.items():
            stat_writer.writerow(_stats_row(st, state_df['INSPECTION_SCORE']))

    #histograms of inspection scores with each individual year
    for yr, scores in scores_by_year.items():
        fig, ax = plt.subplots()
        sns.histplot(scores, ax=ax)
        ax.set_title(f'inspection score in {yr}')
        fig.savefig(f'{hist_dir}/hist_{yr}.png')
        plt.close(fig)

    #inspection scores changed over years -- mean, with median as reference
    _save_trend_plot(
        scores_by_year,
        'inspection score over time in U.S.',
        f'{line_dir}/US.png',
    )

    #inspection scores changed over years -- for each state
    for st, state_df in frames_by_state.items():
        #only plot the years for which this state actually has data
        # NOTE(review): unlike the national table and plot, 2005 is NOT
        # excluded here -- confirm whether the outlier year should be dropped
        # for the per-state plots as well.
        all_years_state = sorted(set(state_df['inspection_year'].astype(np.int32)))

        state_scores_by_year = {
            yr: state_df.loc[state_df['inspection_year'] == yr, 'INSPECTION_SCORE']
            for yr in all_years_state
        }

        _save_trend_plot(
            state_scores_by_year,
            f'inspection score over time in {st}',
            f'{line_dir}/{st}.png',
        )

Custom TB Handler failed, unregistering


[1;31m---------------------------------------------------------------------------[0m
[1;31mFileNotFoundError[0m                         Traceback (most recent call last)
    [1;31m[... skipping hidden 1 frame][0m

Cell [1;32mIn[10], line 21[0m
[0;32m     19[0m [38;5;28;01mif[39;00m [38;5;18m__name__[39m [38;5;241m==[39m [38;5;124m'[39m[38;5;124m__main__[39m[38;5;124m'[39m:
[0;32m     20[0m     [38;5;66;03m#read the dataset[39;00m
[1;32m---> 21[0m     df [38;5;241m=[39m [43mpd[49m[38;5;241;43m.[39;49m[43mread_csv[49m[43m([49m[38;5;124;43m'[39;49m[38;5;124;43mdata/locations_inspectionscores_forMeri_Nov.csv[39;49m[38;5;124;43m'[39;49m[43m)[49m
[0;32m     23[0m     [38;5;66;03m#inspection score data[39;00m

File [1;32mc:\Users\Farhan\AppData\Local\Programs\Python\Python311\Lib\site-packages\pandas\util\_decorators.py:211[0m, in [0;36mdeprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper[1;34m(*args, **kwargs)[0m
[0;32m    210[0m

TypeError: AutoFormattedTB.structured_traceback() missing 1 required positional argument: 'evalue'