In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


First, we load in the CSV we created previously with our required fields.

In [2]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/AUT 2023/DATA 512/wp_scored_city_articles_by_state.csv')
df.head()

Unnamed: 0,state,regional_division,population,article_title,revision_id,article_quality
0,Alabama,South_East South Central,5074296,"Abbeville, Alabama",1171163550,C
1,Alabama,South_East South Central,5074296,"Adamsville, Alabama",1177621427,C
2,Alabama,South_East South Central,5074296,"Addison, Alabama",1168359898,C
3,Alabama,South_East South Central,5074296,"Akron, Alabama",1165909508,GA
4,Alabama,South_East South Central,5074296,"Alabaster, Alabama",1179139816,C


# ANALYSIS 1

#Top 10 US states by coverage: The 10 US states with the highest total articles per capita (in descending order)

First, we group the dataframe by state, and do a simple count to get the total number of articles per state.

Thenw e group it by state and calculate the mean, to get the exact population of each state. We could also have done max or min, does not really matter as each state entry has same population data.

We then divide the count of articles by population to get the per capita article coverage per state.

In [93]:
statewise_count = df.groupby('state')['article_title'].count()
statewise_pop = df.groupby('state')['population'].mean()
statewise_coverage_per_capita = statewise_count/statewise_pop

We sort this by descending order

In [22]:
statewise_coverage_per_capita.sort_values(ascending=False,inplace=True)

This is the list of top 10 states with highest coverage of total article titles per capita.

In [23]:
top_state_coverage = statewise_coverage_per_capita.to_frame()
top_state_coverage = top_state_coverage.rename(columns={0:'Total Articles per Capita'})
top_state_coverage.head(10)

Unnamed: 0_level_0,Total Articles per Capita
state,Unnamed: 1_level_1
Vermont,0.000507
North Dakota,0.000457
Maine,0.000349
South Dakota,0.000342
Iowa,0.000326
Alaska,0.000203
Pennsylvania,0.000197
Michigan,0.000177
Wyoming,0.00017
New Hampshire,0.000168


# ANALYSIS 2

#Bottom 10 US states by coverage: The 10 US states with the lowest total articles per capita (in ascending order) .

We use the per capita statewise articles generated earlier. This time, we just sort in the ascending order.

In [24]:
statewise_coverage_per_capita.sort_values(inplace=True)

These are the bottom 10 states with lowest article coverage per capita.

In [25]:
bottom_state_coverage = statewise_coverage_per_capita.to_frame()
bottom_state_coverage = bottom_state_coverage.rename(columns={0:'Total Articles per Capita'})
bottom_state_coverage.head(10)

Unnamed: 0_level_0,Total Articles per Capita
state,Unnamed: 1_level_1
North Carolina,5e-06
Nevada,6e-06
California,1.2e-05
Arizona,1.2e-05
Virginia,1.5e-05
Florida,1.8e-05
Oklahoma,1.9e-05
Kansas,2.1e-05
Maryland,2.5e-05
Wisconsin,3.2e-05


# ANALYSIS 3

# Top 10 US states by high quality: The 10 US states with the highest high quality articles per capita (in descending order) .

For this, we first filter the data to include only 'high quality' articles, i.e.e articles with predicted quality as 'FA' (Featured) or 'GA' (Good).

Then we basically run the same process as earlier. Grouping by state and taking count to get total number of articles, grouping by state and taking mean to get population counts. Then we divide this to get per capita articles.

In [27]:
df_hq = df[(df['article_quality']=="FA") | (df['article_quality']=="GA")]
df_hq.head()

Unnamed: 0,state,regional_division,population,article_title,revision_id,article_quality
3,Alabama,South_East South Central,5074296,"Akron, Alabama",1165909508,GA
6,Alabama,South_East South Central,5074296,"Alexander City, Alabama",1179140073,GA
7,Alabama,South_East South Central,5074296,"Aliceville, Alabama",1167792390,GA
14,Alabama,South_East South Central,5074296,"Ardmore, Alabama",1176903479,GA
33,Alabama,South_East South Central,5074296,"Bear Creek, Alabama",1166015184,GA


In [30]:
hq_statewise_count = df_hq.groupby('state')['article_title'].count()
hq_statewise_pop = df_hq.groupby('state')['population'].mean()
hq_statewise_coverage_per_capita = hq_statewise_count/hq_statewise_pop

Sort in descending order

In [31]:
hq_statewise_coverage_per_capita.sort_values(ascending=False,inplace=True)

This is the list of top 10 states with highest number of high quality articles per capita.

In [32]:
hq_top_state_coverage = hq_statewise_coverage_per_capita.to_frame()
hq_top_state_coverage = hq_top_state_coverage.rename(columns={0:'Total Articles per Capita'})
hq_top_state_coverage.head(10)

Unnamed: 0_level_0,Total Articles per Capita
state,Unnamed: 1_level_1
Vermont,7e-05
Wyoming,6.7e-05
South Dakota,6.2e-05
West Virginia,6e-05
Montana,4.9e-05
New Hampshire,4.5e-05
Pennsylvania,4.4e-05
Missouri,4.3e-05
Alaska,4.2e-05
New Jersey,4.1e-05


# ANALYSIS 4

# Bottom 10 US states by high quality: The 10 US states with the lowest high quality articles per capita (in ascending order).

We use the dataframe generated earlier and just sort in an ascending order.

In [33]:
hq_statewise_coverage_per_capita.sort_values(inplace=True)

This is the list of 10 states with least number of high quality articles per capita.

In [34]:
hq_bottom_state_coverage = hq_statewise_coverage_per_capita.to_frame()
hq_bottom_state_coverage = hq_bottom_state_coverage.rename(columns={0:'Total Articles per Capita'})
hq_bottom_state_coverage.head(10)

Unnamed: 0_level_0,Total Articles per Capita
state,Unnamed: 1_level_1
North Carolina,2e-06
Virginia,2e-06
Nevada,3e-06
Arizona,3e-06
California,4e-06
Florida,5e-06
New York,6e-06
Maryland,7e-06
Kansas,7e-06
Oklahoma,8e-06


# ANALYSIS 5

# Census divisions by total coverage: A rank ordered list of US census divisions (in descending order) by total articles per capita.

The process to do this is slightly more complex as we have population data per state, not per regional_division, so we need to manipulate the data a bit.

First, we group by state, take the mean to get the population per state, and append the corresponding regional division to it.

In [75]:
df_state = df.groupby('state').mean()
df_state.reset_index(inplace=True)
regions = []
for i in range(len(df_state)):
  st = df_state.iloc[i]['state']
  regions.append(df[df['state']==st].iloc[0]['regional_division'])
df_state['regional_division'] = regions

  df_state = df.groupby('state').mean()


We then group by state again and get a count of the total articles per state.

In [76]:
df_state_count = df.groupby('state').count()
df_state_count.reset_index(inplace=True)
df_state_count.drop(['regional_division','population','revision_id','article_quality'],axis=1,inplace=True)

We then merge these to get the statewise population, regional divisions and article counts.

In [77]:
reg_divs = pd.merge(df_state,df_state_count,left_on='state',right_on='state',how='inner')
reg_divs

Unnamed: 0,state,population,revision_id,regional_division,article_title
0,Alabama,5074296.0,1165822000.0,South_East South Central,461
1,Alaska,733583.0,1162692000.0,West_Pacific,149
2,Arizona,7359197.0,1167098000.0,West_Mountain,91
3,Arkansas,3045637.0,1166407000.0,South_West South Central,500
4,California,39029342.0,1175018000.0,West_Pacific,482
5,Colorado,5839926.0,1170903000.0,West_Mountain,288
6,Delaware,1018396.0,1168784000.0,South_South Atlantic,57
7,Florida,22244823.0,1172121000.0,South_South Atlantic,411
8,Georgia,10912876.0,1166134000.0,South_South Atlantic,538
9,Hawaii,1440196.0,1044715000.0,West_Pacific,150


Now we can get region per capita data. First we group by region and do a sum to get total population in that region. Then we group by region and do a sum to gte total number of articles of that region. We can divide these to get per capita counts.

In [87]:
region_pop = reg_divs.groupby('regional_division')['population'].sum()
region_count = reg_divs.groupby('regional_division')['article_title'].sum()
regionwise_coverage_per_capita = region_count/region_pop

Sort in descending order.

In [88]:
regionwise_coverage_per_capita.sort_values(ascending=False,inplace=True)

This is the ranked list of regions in descending order or per capita article coverages.

In [89]:
regionwise_coverage = regionwise_coverage_per_capita.to_frame()
regionwise_coverage = regionwise_coverage.rename(columns={0:'Total Articles per Capita'})
regionwise_coverage['Rank'] = [1,2,3,4,5,6,7,8,9]
regionwise_coverage

Unnamed: 0_level_0,Total Articles per Capita,Rank
regional_division,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest_West North Central,0.000181,1
Northeast_New England,0.000125,2
Midwest_East North Central,0.000101,3
Northeast_Middle Atlantic,9e-05,4
South_East South Central,7.8e-05,5
South_West South Central,5e-05,6
West_Mountain,4.7e-05,7
South_South Atlantic,2.8e-05,8
West_Pacific,2.4e-05,9


# ANALYSIS 6

# Census divisions by high quality coverage: Rank ordered list of US census divisions (in descending order) by high quality articles per capita.

We follow the same process as earlier, but use the datafram created earlier which is filtered to include only 'FA' (featured) and 'GA' (Good) articles.

Then groupby state and then by region to get regionwise per capita values.

In [81]:
df_hq_state = df_hq.groupby('state').mean()
df_hq_state.reset_index(inplace=True)
regions_hq = []
for i in range(len(df_hq_state)):
  st_hq = df_hq_state.iloc[i]['state']
  regions_hq.append(df_hq[df_hq['state']==st_hq].iloc[0]['regional_division'])
df_hq_state['regional_division'] = regions_hq

  df_hq_state = df_hq.groupby('state').mean()


In [82]:
df_hq_state_count = df_hq.groupby('state').count()
df_hq_state_count.reset_index(inplace=True)
df_hq_state_count.drop(['regional_division','population','revision_id','article_quality'],axis=1,inplace=True)

In [83]:
reg_divs_hq = pd.merge(df_hq_state,df_hq_state_count,left_on='state',right_on='state',how='inner')
reg_divs_hq

Unnamed: 0,state,population,revision_id,regional_division,article_title
0,Alabama,5074296.0,1167203000.0,South_East South Central,53
1,Alaska,733583.0,1172693000.0,West_Pacific,31
2,Arizona,7359197.0,1171230000.0,West_Mountain,24
3,Arkansas,3045637.0,1167011000.0,South_West South Central,72
4,California,39029342.0,1173776000.0,West_Pacific,173
5,Colorado,5839926.0,1171268000.0,West_Mountain,76
6,Delaware,1018396.0,1168671000.0,South_South Atlantic,25
7,Florida,22244823.0,1173740000.0,South_South Atlantic,118
8,Georgia,10912876.0,1168248000.0,South_South Atlantic,93
9,Hawaii,1440196.0,1154876000.0,West_Pacific,30


In [90]:
hq_regionwise_count = reg_divs_hq.groupby('regional_division')['article_title'].sum()
hq_regionwise_pop = reg_divs_hq.groupby('regional_division')['population'].sum()
hq_regionwise_coverage_per_capita = hq_regionwise_count/hq_regionwise_pop

Sort in descending order

In [91]:
hq_regionwise_coverage_per_capita.sort_values(ascending=False,inplace=True)

This is the ranked list of regional divisions in descending order of per capita high quality article coverage.

In [92]:
hq_regionwise_coverage = hq_regionwise_coverage_per_capita.to_frame()
hq_regionwise_coverage = hq_regionwise_coverage.rename(columns={0:'Total Articles per Capita'})
hq_regionwise_coverage['Rank'] = [1,2,3,4,5,6,7,8,9]
hq_regionwise_coverage

Unnamed: 0_level_0,Total Articles per Capita,Rank
regional_division,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest_West North Central,3.2e-05,1
Northeast_Middle Atlantic,2.5e-05,2
Northeast_New England,2e-05,3
South_East South Central,1.6e-05,4
Midwest_East North Central,1.5e-05,5
South_West South Central,1.5e-05,6
West_Mountain,1.3e-05,7
West_Pacific,9e-06,8
South_South Atlantic,8e-06,9
