# Loading Libraries and Dataset

In [126]:
# Importing Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

import json, time, urllib.parse
import requests

In [127]:
df = pd.read_csv('wp_politicians_by_country.csv')
df.shape

(7514, 6)

In [128]:
df.head()

Unnamed: 0,article_title,country,lastrevid,article_quality,population,region
0,Shahjahan Noori,Afghanistan,1099689000.0,GA,41.1,SOUTH ASIA
1,Abdul Ghafar Lakanwal,Afghanistan,943562300.0,Start,41.1,SOUTH ASIA
2,Majah Ha Adrif,Afghanistan,852404100.0,Start,41.1,SOUTH ASIA
3,Haroon al-Afghani,Afghanistan,1095102000.0,B,41.1,SOUTH ASIA
4,Tayyab Agha,Afghanistan,1104998000.0,Start,41.1,SOUTH ASIA


# 1. Data Analysis

### 1.1 Finding the total-articles-per-population at region level

In [129]:
# Dropping duplicate country rows to calculate population of region
df_region_subset = df.drop_duplicates(subset=['population', 'region', 'country'])

# groupby at region level to get sum of population
groupby_region_population = df_region_subset[['population', 'region', 'country']].groupby(['region']).sum()['population']

# merging the above two dataframes
df_region_subset = df_region_subset.merge(groupby_region_population, on = 'region', how = 'inner')
df_region_subset = df_region_subset.rename(columns = {'population_y': 'region_level_population'})

# groupby at region level to get total articles in region
grouby_region_article = df_region_subset[['region', 'article_title']].groupby(['region']).nunique()

# merging dataframes
df_region_subset = df_region_subset.merge(grouby_region_article, on = 'region', how = 'inner')
df_region_subset = df_region_subset.rename(columns = {'article_title_y': 'article_title_count'})

In [130]:
# calculating total per capita value
df_region_subset.drop(['article_title_x', 'lastrevid', 'article_quality', 'population_x'], axis=1, inplace=True)
df_region_subset['articles_per_capita_region'] = df_region_subset['article_title_count'] / ((df_region_subset['region_level_population'] * 1000000))
df_region_subset = df_region_subset.sort_values(by = ['articles_per_capita_region'], ascending = False)

### 1.2 Finding the total-articles-per-population at country level

In [131]:
# groupby at country level to get total articles in country
grouby_country_article = df[['country', 'article_title']].groupby(['country']).nunique()

# merging dataframes
df_country_subset = df.merge(grouby_country_article, on = 'country', how = 'inner')
df_country_subset = df_country_subset.rename(columns = {'article_title_y': 'article_title_count'})

In [132]:
df_country_subset.head()

Unnamed: 0,article_title_x,country,lastrevid,article_quality,population,region,article_title_count
0,Shahjahan Noori,Afghanistan,1099689000.0,GA,41.1,SOUTH ASIA,118
1,Abdul Ghafar Lakanwal,Afghanistan,943562300.0,Start,41.1,SOUTH ASIA,118
2,Majah Ha Adrif,Afghanistan,852404100.0,Start,41.1,SOUTH ASIA,118
3,Haroon al-Afghani,Afghanistan,1095102000.0,B,41.1,SOUTH ASIA,118
4,Tayyab Agha,Afghanistan,1104998000.0,Start,41.1,SOUTH ASIA,118


In [133]:
# calculating total per capita value
df_country_subset.drop(['article_title_x', 'lastrevid', 'article_quality'], axis=1, inplace=True)
df_country_subset['articles_per_capita_country'] = df_country_subset['article_title_count'] / (df_country_subset['population'] * 1000000)
df_country_subset = df_country_subset.sort_values(by = ['articles_per_capita_country'])

### 1.3 Storing final results of 1.1 and 1.2

In [134]:
article_percapita_country = df_country_subset[['country', 'articles_per_capita_country']].drop_duplicates().reset_index(drop=True )
article_percapita_region = df_region_subset[['region', 'articles_per_capita_region']].drop_duplicates().reset_index(drop=True)

### 1.4 Finding the total-articles-per-population of top articles at region level

In [167]:
df_toparticles = df.loc[(df['article_quality'] == 'FA') | (df['article_quality'] == 'GA')]

In [168]:
df_toparticles.head()

Unnamed: 0,article_title,country,lastrevid,article_quality,population,region
0,Shahjahan Noori,Afghanistan,1099689000.0,GA,41.1,SOUTH ASIA
55,Ahmed Wali Karzai,Afghanistan,1090246000.0,GA,41.1,SOUTH ASIA
59,Masoud Khalili,Afghanistan,1103105000.0,GA,41.1,SOUTH ASIA
93,Amrullah Saleh,Afghanistan,1115023000.0,FA,41.1,SOUTH ASIA
107,Nur ul-Haq Ulumi,Afghanistan,1107429000.0,GA,41.1,SOUTH ASIA


In [163]:
# Dropping duplicate country rows to calculate population of region
dftop_region_subset = df_toparticles.drop_duplicates(subset=['region', 'country'])

# groupby at region level to get sum of population
groupby_region_population_top = dftop_region_subset[['population', 'region']].groupby(['region']).sum()['population']

# merging the above two dataframes
dftop_region_subset = dftop_region_subset.merge(groupby_region_population_top, on = 'region', how = 'inner')
dftop_region_subset = dftop_region_subset.rename(columns = {'population_y': 'region_level_population'})

# groupby at region level to get total articles in region
grouby_region_article_top = dftop_region_subset[['region', 'article_title']].groupby(['region']).nunique()

# merging dataframes
dftop_region_subset = dftop_region_subset.merge(grouby_region_article_top, on = 'region', how = 'inner')
dftop_region_subset = dftop_region_subset.rename(columns = {'article_title_y': 'article_title_count'})

In [137]:
# calculating total per capita value
dftop_region_subset.drop(['article_title_x', 'lastrevid', 'article_quality', 'population_x'], axis=1, inplace=True)
dftop_region_subset['articles_per_capita_region'] = dftop_region_subset['article_title_count'] / (dftop_region_subset['region_level_population'] * 1000000)
dftop_region_subset = dftop_region_subset.sort_values(by = ['articles_per_capita_region'], ascending = False)

In [138]:
toparticle_percapita_region = dftop_region_subset[['region', 'articles_per_capita_region']].drop_duplicates().reset_index(drop=True)

### 1.5 Finding the total-articles-per-population of top articles at country level

In [139]:
# groupby at country level to get total articles in country
grouby_country_article_top = df_toparticles[['country', 'article_title']].groupby(['country']).nunique()

# merging dataframes
dftop_country_subset = df_toparticles.merge(grouby_country_article_top, on = 'country', how = 'inner')
dftop_country_subset = dftop_country_subset.rename(columns = {'article_title_y': 'article_title_count'})

In [140]:
# calculating total per capita value
dftop_country_subset.drop(['article_title_x', 'lastrevid', 'article_quality'], axis=1, inplace=True)
dftop_country_subset['articles_per_capita_country'] = dftop_country_subset['article_title_count'] /(dftop_country_subset['population'] * 1000000)
dftop_country_subset = dftop_country_subset.sort_values(by = ['articles_per_capita_country'])

In [141]:
toparticle_percapita_country = dftop_country_subset[['country', 'articles_per_capita_country']].drop_duplicates().reset_index(drop=True)

# 5. Performing Data Analysis

#### Plot 1: Top 10 countries by coverage
The 10 countries with the highest total articles per capita (in descending order)

In [151]:
article_percapita_country.loc[article_percapita_country['articles_per_capita_country']!= np.inf].sort_values(by = 'articles_per_capita_country', ascending=False).head(10)

Unnamed: 0,country,articles_per_capita_country
177,Antigua and Barbuda,0.00017
176,Federated States of Micronesia,0.00013
175,Andorra,0.0001
174,Barbados,9.3e-05
173,Marshall Islands,9e-05
172,Seychelles,6e-05
171,Montenegro,6e-05
170,Luxembourg,5.3e-05
169,Bhutan,5.1e-05
168,Grenada,5e-05


#### Plot 2: Bottom 10 countries by coverage
The 10 countries with the lowest total articles per capita (in ascending order)

In [152]:
article_percapita_country.loc[article_percapita_country['articles_per_capita_country']!= np.inf].sort_values(by = 'articles_per_capita_country', ascending=True).head(10)

Unnamed: 0,country,articles_per_capita_country
0,China,1.392176e-09
1,Mexico,7.843137e-09
2,Saudi Arabia,8.174387e-08
3,Romania,1.052632e-07
4,India,1.263054e-07
5,Sri Lanka,1.339286e-07
6,Egypt,1.352657e-07
7,Ethiopia,2.025932e-07
8,Taiwan,2.155172e-07
9,Vietnam,2.716298e-07


#### Plot 3: Top 10 countries by high quality
The 10 countries with the highest high quality articles per capita (in descending order)

In [155]:
toparticle_percapita_country.loc[toparticle_percapita_country['articles_per_capita_country']!=np.inf].sort_values(by = 'articles_per_capita_country', ascending=False).head(10)

Unnamed: 0,country,articles_per_capita_country
91,Andorra,2e-05
90,Montenegro,5e-06
89,Albania,2.142857e-06
88,Suriname,1.666667e-06
87,Bosnia-Herzegovina,1.470588e-06
86,Lithuania,1.071429e-06
85,Croatia,1.052632e-06
84,Slovenia,9.52381e-07
83,Palestinian Territory,9.259259e-07
82,Gabon,8.333333e-07


#### Plot 4: Bottom 10 countries by high quality
The 10 countries with the lowest high quality articles per capita (in ascending order)

In [164]:
toparticle_percapita_country.sort_values(by = 'articles_per_capita_country', ascending=True).head(10)

Unnamed: 0,country,articles_per_capita_country
0,India,4.2337e-09
1,Thailand,1.497006e-08
2,Japan,1.601281e-08
3,Nigeria,1.830664e-08
4,Vietnam,2.012072e-08
5,Colombia,2.03666e-08
6,Uganda,2.118644e-08
7,Pakistan,2.120441e-08
8,Sudan,2.132196e-08
9,Iran,2.257336e-08


#### Plot 5: Geographic regions by total coverage
A rank ordered list of geographic regions (in descending order) by total articles per capita

In [166]:
article_percapita_region.sort_values(by = 'articles_per_capita_region', ascending=False)

Unnamed: 0,region,articles_per_capita_region
17,EAST ASIA,3.601873e-09
16,SOUTH ASIA,4.480733e-09
15,SOUTHEAST ASIA,1.608004e-08
14,NORTHERN AFRICA,2.394254e-08
13,SOUTH AMERICA,2.767528e-08
12,EASTERN EUROPE,3.479471e-08
11,EASTERN AFRICA,3.614714e-08
10,WESTERN AFRICA,3.722662e-08
9,MIDDLE AFRICA,4.083716e-08
8,CENTRAL AMERICA,4.496908e-08


#### Plot 6: Geographic regions by high quality coverage
Rank ordered list of geographic regions (in descending order) by high quality articles per capita.

In [160]:
toparticle_percapita_country[toparticle_percapita_country['articles_per_capita_country']!=np.inf].sort_values(by = 'articles_per_capita_country', ascending=False)

Unnamed: 0,country,articles_per_capita_country
91,Andorra,2.000000e-05
90,Montenegro,5.000000e-06
89,Albania,2.142857e-06
88,Suriname,1.666667e-06
87,Bosnia-Herzegovina,1.470588e-06
...,...,...
4,Vietnam,2.012072e-08
3,Nigeria,1.830664e-08
2,Japan,1.601281e-08
1,Thailand,1.497006e-08
