In [1]:
import pandas as pd

### Table 1 - Top 10 US states by coverage i.e. 10 US states with the highest total articles per capita

In [23]:
cities = pd.read_csv('us_cities_by_state_SEPT.2023.csv')
cities = cities.groupby('state', as_index = False).size()

In [24]:
cities.head()

Unnamed: 0,state,size
0,Alabama,922
1,Alaska,149
2,Arizona,91
3,Arkansas,500
4,California,482


In [25]:
population = pd.read_csv('population_by_state.csv')

In [26]:
# Merging the two dataframes on state to obtain both number of articles and population for each state to calculate total articles per capita
statewise_total_articles = pd.merge(cities, population, on = "state")
statewise_total_articles['total_articles_per_capita'] = statewise_total_articles['size'] / statewise_total_articles['population']

Displaying Table 1

In [28]:
# Displaying top 10 states in terms of articles_per_capita in descending order
statewise_total_articles.sort_values(['total_articles_per_capita'], ascending=[0]).head(10)[['state', 'total_articles_per_capita']]

Unnamed: 0,state,total_articles_per_capita
32,Vermont,0.000508
16,Maine,0.000349
12,Iowa,0.000326
1,Alaska,0.000203
28,Pennsylvania,0.000197
0,Alabama,0.000182
19,Michigan,0.000177
36,Wyoming,0.00017
3,Arkansas,0.000164
22,Missouri,0.000154


### Table 2 - Bottom 10 US states by coverage i.e. 10 US states with the lowest total articles per capita

Displaying Table 2

In [18]:
statewise_total_articles.sort_values(['total_articles_per_capita'], ascending=[0]).tail(10).iloc[::-1][['state', 'total_articles_per_capita']]

Unnamed: 0,state,total_articles_per_capita
24,Nevada,6e-06
4,California,1.2e-05
2,Arizona,1.2e-05
7,Florida,1.9e-05
26,Oklahoma,1.9e-05
13,Kansas,2.1e-05
17,Maryland,2.5e-05
33,Virginia,3.1e-05
35,Wisconsin,3.3e-05
34,Washington,3.6e-05


### Table 3 - Top 10 US states by high quality i.e. 10 US states with the highest high quality articles per capita

In [33]:
scored_articles = pd.read_csv("wp_scored_city_articles_by_state.csv")
cities = pd.read_csv('us_cities_by_state_SEPT.2023.csv')

In [36]:
combined = pd.concat([scored_articles, cities], axis=1)
# removing duplicate columns
combined = combined.loc[:,~combined.columns.duplicated()].copy()

In [44]:
population = pd.read_csv('population_by_state.csv')

In [47]:
# Filter for good quality articles (article_quality values FA and GA)
good_quality_df = combined[combined['article_quality'].isin(['FA', 'GA'])]

In [54]:
good_quality_df.head()

Unnamed: 0,state,regional_division,population,article_title,revision_id,article_quality,page_title,url
3,Alabama,South - East South Central,5074296.0,"Akron, Alabama",1165910000.0,GA,"Akron, Alabama","https://en.wikipedia.org/wiki/Akron,_Alabama"
6,Alabama,South - East South Central,5074296.0,"Alexander City, Alabama",1179140000.0,GA,"Alexander City, Alabama","https://en.wikipedia.org/wiki/Alexander_City,_..."
7,Alabama,South - East South Central,5074296.0,"Aliceville, Alabama",1167792000.0,GA,"Aliceville, Alabama","https://en.wikipedia.org/wiki/Aliceville,_Alabama"
14,Alabama,South - East South Central,5074296.0,"Ardmore, Alabama",1176903000.0,GA,"Ardmore, Alabama","https://en.wikipedia.org/wiki/Ardmore,_Alabama"
33,Alabama,South - East South Central,5074296.0,"Bear Creek, Alabama",1166015000.0,GA,"Bear Creek, Alabama","https://en.wikipedia.org/wiki/Bear_Creek,_Alabama"


In [48]:
# Group by 'state' and count the number of good quality articles for each state
state_counts = good_quality_df.groupby('state')['article_quality'].count().reset_index()
state_counts.rename(columns={'article_quality': 'high_quality_count'}, inplace=True)

In [55]:
state_counts.head()

Unnamed: 0,state,high_quality_count
0,Alabama,106
1,Alaska,31
2,Arizona,24
3,Arkansas,72
4,California,171


In [49]:
# Merge the state counts with the population DataFrame
state_population = pd.merge(state_counts, population, on='state')

In [50]:
# Calculate high-quality articles per capita
state_population['high_quality_per_capita'] = state_population['high_quality_count'] / state_population['population']

In [56]:
state_population.head()

Unnamed: 0,state,high_quality_count,population,high_quality_per_capita
0,Alabama,106,5074296.0,2.1e-05
1,Alaska,31,733583.0,4.2e-05
2,Arizona,24,7359197.0,3e-06
3,Arkansas,72,3045637.0,2.4e-05
4,California,171,39029342.0,4e-06


In [51]:
# Sort in descending order and select the top 10 states
top_10_states = state_population.sort_values(by='high_quality_per_capita', ascending=False).head(10)

Displaying Table 3

In [53]:
top_10_states[['state', 'high_quality_per_capita']]

Unnamed: 0,state,high_quality_per_capita
42,Vermont,7e-05
47,Wyoming,6.7e-05
38,South Dakota,6.2e-05
45,West Virginia,6e-05
24,Montana,4.9e-05
26,New Hampshire,4.5e-05
35,Pennsylvania,4.4e-05
23,Missouri,4.2e-05
1,Alaska,4.2e-05
27,New Jersey,4.1e-05


### Table 4 - Bottom 10 US states by high quality i.e. 10 US states with the lowest high quality articles per capita 

In [58]:
# Sort in ascending order and select the bottom 10 states
bottom_10_states = state_population.sort_values(by='high_quality_per_capita').head(10)

Displaying Table 4

In [85]:
bottom_10_states

Unnamed: 0,state,high_quality_count,population,high_quality_per_capita
10,Illinois,21,12582032.0,2e-06
2,Arizona,24,7359197.0,3e-06
4,California,172,39029342.0,4e-06
7,Florida,120,22244823.0,5e-06
5,Colorado,83,5839926.0,1.4e-05
8,Hawaii,30,1440196.0,2.1e-05
0,Alabama,106,5074296.0,2.1e-05
9,Idaho,41,1939033.0,2.1e-05
3,Arkansas,72,3045637.0,2.4e-05
6,Delaware,25,1018396.0,2.5e-05


### Table 5 - Census divisions by total coverage i.e. a rank ordered list of US census divisions by total articles per capita

In [60]:
states_by_region = pd.read_csv('states_by_region.csv')

In [62]:
divisional = pd.merge(statewise_total_articles, states_by_region, on = "state")

In [66]:
df_div = divisional.groupby(by=['division'])['size'].sum().reset_index()

In [68]:
df_pop = divisional.groupby(by=['division'])['population'].sum().reset_index()

In [69]:
division_total_articles = pd.merge(df_div, df_pop, on='division')

In [76]:
division_total_articles.head()

Unnamed: 0,division,size,population,division_articles_per_capita
0,East North Central,4755,47097779.0,0.000101
1,East South Central,1992,19578002.0,0.000102
2,Middle Atlantic,2556,12972008.0,0.000197
3,Mountain,1105,23400976.0,4.7e-05
4,New England,1164,9014378.0,0.000129


In [73]:
division_total_articles['division_articles_per_capita'] = division_total_articles['size'] / division_total_articles['population']

Displaying Table 5

In [75]:
division_total_articles.sort_values('division_articles_per_capita', ascending=[0])[['division', 'division_articles_per_capita']]

Unnamed: 0,division,division_articles_per_capita
2,Middle Atlantic,0.000197
7,West North Central,0.000161
4,New England,0.000129
1,East South Central,0.000102
0,East North Central,0.000101
8,West South Central,5.1e-05
3,Mountain,4.7e-05
5,Pacific,2.4e-05
6,South Atlantic,2.3e-05


### Table 6 - Census divisions by high quality coverage i.e. a rank ordered list of US census divisions by high quality articles per capita

In [88]:
# Calculate the number of high-quality articles per division
division_counts = good_quality_df.groupby('state')['article_quality'].count().reset_index()
division_counts.rename(columns={'article_quality': 'high_quality_count'}, inplace=True)

census_divisions_df = pd.read_csv('states_by_region.csv')
census_divisions_df.rename(columns={'STATE': 'state'}, inplace=True)

# Merge division_counts with census_divisions_df to link divisions to states
census_divisions_with_counts = pd.merge(census_divisions_df, division_counts, on='state')

# Merge the census_divisions_with_counts DataFrame with the population data
census_divisions_with_population = pd.merge(census_divisions_with_counts, df_pop, on='state')

# Calculate high-quality articles per capita for each division
census_divisions_with_population['high_quality_per_capita'] = census_divisions_with_population['high_quality_count'] / census_divisions_with_population['population']

# Rank the census divisions by high-quality articles per capita in descending order
ranked_census_divisions = census_divisions_with_population.sort_values(by='high_quality_per_capita', ascending=False)

Displaying Table 6

In [89]:
ranked_census_divisions

Unnamed: 0,REGION,DIVISION,state,high_quality_count,population,high_quality_per_capita
8,West,Pacific,Alaska,31,733583.0,4.2e-05
1,South,South Atlantic,Delaware,25,1018396.0,2.5e-05
4,South,West South Central,Arkansas,72,3045637.0,2.4e-05
7,West,Mountain,Idaho,41,1939033.0,2.1e-05
3,South,East South Central,Alabama,106,5074296.0,2.1e-05
10,West,Pacific,Hawaii,30,1440196.0,2.1e-05
6,West,Mountain,Colorado,83,5839926.0,1.4e-05
2,South,South Atlantic,Florida,120,22244823.0,5e-06
9,West,Pacific,California,172,39029342.0,4e-06
5,West,Mountain,Arizona,24,7359197.0,3e-06
