## ICLab Data Files Exploratory Analysis

In [1]:
import pandas as pd
import tldextract
import numpy as np
import json
import math
import pycountry
import itertools

In [2]:
iclab = pd.read_csv("cleaned_data/iclab_2018-09.csv")
iclab.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
0,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://kinox.to/,,,,,403.0,False,3094.0,{},,,False
1,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://movie4k.to/,,,,,403.0,False,3098.0,{},,,False
2,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,4shared.com,False,,False,,,,,,,,False
3,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,news.bbc.co.uk,False,,False,,,,,,,,False
4,baseline-2018-10-02T011527.678225.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,country-sensitive-kr,ngt.jinbo.net,False,,False,,,,,,,,False


For now, we only care about the 2-digit country code *country*, *url*, and *censored_updated*. Let's filter a cleaner table with just these columns.

In [5]:
iclab_clean = iclab[["country", "url", "censored_updated"]]
iclab_clean.head()

Unnamed: 0,country,url,censored_updated
0,US,http://kinox.to/,False
1,US,http://movie4k.to/,False
2,KR,4shared.com,False
3,KR,news.bbc.co.uk,False
4,KR,ngt.jinbo.net,False


We need to redo the combined_similarities.json and individual similarities files. First, let's find what countries are in the dataset.

In [6]:
iclab_clean.country.unique()

array(['US', 'KR', 'ES', 'ZA', 'CZ', 'PL', 'MY', 'RU', 'CN', 'TW', 'BG',
       'HK', 'RO', 'PE', 'HU', 'NO', 'MX', 'UA', 'NL', 'VN', 'JP', 'LT',
       'RS', 'AU', 'KE', 'SK', 'IN', 'CL', 'CA', 'LI', 'SG', 'ID', 'NZ',
       'LU', 'BZ', 'CO', 'TR', 'BR', 'SE', 'IS', 'FI', 'DZ', 'PT', 'DK',
       'IL', 'MD', 'AT', 'SC'], dtype=object)

In [7]:
cc_unique = iclab_clean.country.unique()

In [45]:
country_codes = pd.read_csv("https://raw.githubusercontent.com/daylight-lab/III/master/shared/data/country-codes/countries_codes_and_coordinates.csv").replace('"','', regex=True)
country_codes.head()

Unnamed: 0,Country,Alpha-2 code,Alpha-3 code,Numeric code,Latitude (average),Longitude (average)
0,Afghanistan,AF,AFG,4,33.0,65.0
1,Albania,AL,ALB,8,41.0,20.0
2,Algeria,DZ,DZA,12,28.0,3.0
3,American Samoa,AS,ASM,16,-14.3333,-170.0
4,Andorra,AD,AND,20,42.5,1.6


In [46]:
country_codes.iloc[0]

Country                Afghanistan
Alpha-2 code                    AF
Alpha-3 code                   AFG
Numeric code                     4
Latitude (average)              33
Longitude (average)             65
Name: 0, dtype: object

In [47]:
country_codes.columns

Index(['Country', 'Alpha-2 code', 'Alpha-3 code', 'Numeric code',
       'Latitude (average)', 'Longitude (average)'],
      dtype='object')

In [48]:
country_codes.dtypes

Country                object
Alpha-2 code           object
Alpha-3 code           object
Numeric code           object
Latitude (average)     object
Longitude (average)    object
dtype: object

In [50]:
country_codes['Alpha-2 code'] = country_codes['Alpha-2 code'].str.strip()

In [51]:
country_codes.loc[country_codes["Alpha-2 code"] == "AF"]

Unnamed: 0,Country,Alpha-2 code,Alpha-3 code,Numeric code,Latitude (average),Longitude (average)
0,Afghanistan,AF,AFG,4,33,65


In [55]:
all_countries = [country_codes.loc[country_codes["Alpha-2 code"] == u].iloc[0].Country for u in iclab_clean.country.unique()]
all_countries

['United States',
 'Korea, Republic of',
 'Spain',
 'South Africa',
 'Czech Republic',
 'Poland',
 'Malaysia',
 'Russian Federation',
 'China',
 'Taiwan, Province of China',
 'Bulgaria',
 'Hong Kong',
 'Romania',
 'Peru',
 'Hungary',
 'Norway',
 'Mexico',
 'Ukraine',
 'Netherlands',
 'Viet Nam',
 'Japan',
 'Lithuania',
 'Serbia',
 'Australia',
 'Kenya',
 'Slovakia',
 'India',
 'Chile',
 'Canada',
 'Liechtenstein',
 'Singapore',
 'Indonesia',
 'New Zealand',
 'Luxembourg',
 'Belize',
 'Colombia',
 'Turkey',
 'Brazil',
 'Sweden',
 'Iceland',
 'Finland',
 'Algeria',
 'Portugal',
 'Denmark',
 'Israel',
 'Moldova, Republic of',
 'Austria',
 'Seychelles']

In [65]:
iclab_clean["domain"] = [tldextract.extract(i).domain for i in iclab_clean["url"]]
iclab_clean["suffix"] = [tldextract.extract(i).suffix for i in iclab_clean["url"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [66]:
iclab_clean.head()

Unnamed: 0,country,url,censored_updated,domain,suffix
0,US,http://kinox.to/,False,kinox,to
1,US,http://movie4k.to/,False,movie4k,to
2,KR,4shared.com,False,4shared,com
3,KR,news.bbc.co.uk,False,bbc,co.uk
4,KR,ngt.jinbo.net,False,jinbo,net


In [77]:
# recalculate combined_similarities.json
combined_similarities = {}

def correct_country(c):
    if c == "United States":
        c = "United States of America"
    if c == 'Korea, Republic of':
        c = "South Korea"
    if c == 'Russian Federation':
        c = "Russia"
    if c == 'Taiwan, Province of China':
        c = "Taiwan"
    if c == 'Viet Nam':
        c = "Vietnam"
    if c == 'Moldova, Republic of':
        c = "Moldova"
    return c

In [None]:
# format: "('Austria', 'BEL')": [{"similarity": 0.0496579382}]
for c1 in cc_unique:
    new_common_domains = pd.DataFrame(columns = ["Country 1", "Country 2", "Domain", "Suffix"])
    for c2 in cc_unique:
        if c1 != c2:
            country_1 = correct_country(country_codes.loc[country_codes["Alpha-2 code"] == c1].iloc[0].Country.strip())
            country_2 = correct_country(country_codes.loc[country_codes["Alpha-2 code"] == c2].iloc[0].Country.strip())
            
            c1_rows = iclab_clean.loc[iclab_clean["country"] == c1]
            c2_rows = iclab_clean.loc[iclab_clean["country"] == c2]
            
            joined_c1_c2 = pd.concat([c1_rows, c2_rows], axis = 0)
            joined_c1_c2["combined_site"] = joined_c1_c2["domain"] + "." + joined_c1_c2["suffix"]
            
            unique_sites = np.unique(joined_c1_c2["combined_site"])
            
            blocked_c1 = joined_c1_c2.loc[(joined_c1_c2["country"] == c1) & (joined_c1_c2["censored_updated"])]
            blocked_c2 = joined_c1_c2.loc[(joined_c1_c2["country"] == c2) & (joined_c1_c2["censored_updated"])]
            commonly_blocked = blocked_c1.merge(blocked_c2, how = 'inner', on = 'combined_site')
            blocked_both = commonly_blocked['combined_site'].unique()
            num_blocked_both = len(blocked_both)
            for b in blocked_both:
                suffix = b.split(".")[-1]
                domain = b[:(b.index(suffix) - 1)]
                new_common_domains = new_common_domains.append({"Country 1": country_1, "Country 2": country_2, "Domain": domain, "Suffix": suffix}, ignore_index = True)
            print(c1, c2, num_blocked_both)
            similarity = num_blocked_both / (len(unique_sites))
            
            country_2_alpha_2 = country_codes.loc[country_codes["Alpha-2 code"] == c2].iloc[0]["Alpha-3 code"].strip()
            if not country_2_alpha_2:
                country_2_alpha_2 = "MDA"
            k = "('" + country_1 + "', '" + country_2_alpha_2 + "')"
            combined_similarities[k] = [{"similarity": similarity}]
            print(k, similarity)
    new_common_domains.to_csv("new_common_domains/" + country_1 + "-common-domains.csv")

In [109]:
with open('../new-combined-similarities.json', 'w') as json_file:
    json.dump(combined_similarities, json_file)

In [111]:
# what are the unique urls being tested?
len(iclab_clean['url'].unique())

22717

In [31]:
iclab.loc[(iclab['http_status'] != 200)].dropna().head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
7748,baseline-2018-09-01T012507.257001.json.bz2,2018-09-01T01:25:07.161Z,US,198605.0,citizenlab-global,http://masrawy.com/,-2,no_control_resp,-2,no_control_resp,403.0,False,3104.0,{},False,ICMP unreachable,False
8391,baseline-2018-09-01T012507.257001.json.bz2,2018-09-01T01:25:07.161Z,US,198605.0,citizenlab-global,http://www.amateurpages.com/,false,sameip,false,sameip,-1.0,False,0.0,{},False,Handshake conflict,False
9566,baseline-2018-09-01T012507.257001.json.bz2,2018-09-01T01:25:07.161Z,US,198605.0,citizenlab-global,http://advocacy.globalvoicesonline.org/,false,sameip,false,sameip,-1.0,False,0.0,{},True,ICMP admin prohibition,True
18619,baseline-2018-09-01T031910.129586.json.bz2,2018-09-01T03:19:10.096Z,US,54455.0,citizenlab-global,http://www.pc2call.com/,false,sameip,false,sameip,403.0,False,162.0,{},False,Handshake conflict,False
19753,baseline-2018-09-01T031910.129586.json.bz2,2018-09-01T03:19:10.096Z,US,54455.0,citizenlab-global,http://www.womeninblack.org/,false,sameip,false,sameip,403.0,False,410.0,{},False,ICMP unreachable,False


In [30]:
iclab.loc[iclab['http_status'] == 200].head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
657,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://www.acquisitionx.com/,false,sameip,False,,200.0,False,8419.0,{},,,False
659,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://www.lingerieatlarge.com/,false,sameip,False,,200.0,False,29282.0,{},False,Handshake conflict,False
660,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://www.pokerstars.net/,false,sameip,False,,200.0,False,23591.0,{},,,False
661,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://slickdeals.net/,-2,no_control_resp,False,,200.0,False,679791.0,{},,,False
665,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://www.gamingday.com/,false,sameip,False,,200.0,False,26428.0,{},False,Handshake conflict,False


In [29]:
iclab.loc[iclab['censored_updated'] == True].head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
823,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://redtube.com/,false,sameip,False,,200.0,True,437.0,HTTP body contains [http://warning.or.kr],,,True
870,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://warc.jalb.de/,false,sameip,False,,-1.0,False,0.0,{},True,ICMP admin prohibition,True
985,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://backpage.com/,-2,no_control_resp,False,,200.0,True,437.0,HTTP body contains [http://warning.or.kr],,,True
997,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://pridetube.com/,false,sameip,False,,200.0,True,437.0,HTTP body contains [http://warning.or.kr],True,Payload collision,True
1030,baseline-2018-09-01T002206.909253.json.bz2,2018-09-01T00:02:11.575Z,KR,4766.0,citizenlab-global,http://xvideos.com/,false,sameip,False,,200.0,True,437.0,HTTP body contains [http://warning.or.kr],True,Payload collision,True


In [9]:
# packeted_updated : packet_reason, block : http_reason, dns : dns_reason
print("packet reasons", iclab.packet_reason.unique())
print()
print("http reasons", iclab.http_reason.unique())
print()
print("dns reasons", iclab.dns_reason.unique())

packet reasons [nan 'Handshake conflict' 'ICMP unreachable' 'ICMP admin prohibition'
 'Payload collision' 'Data after FIN' 'Data after RST']

 "HTTP body contains [\\\\u003cFRAME SRC='http://www.ifydnun.com/?dn=]"
 'HTTP body contains [\\u003ctitle\\u003eWeb Site Blocked\\u003c/title\\u003e]'
 'HTTP body contains [\\\\u003cTITLE\\\\u003eAccess Denied\\\\u003c/TITLE\\\\u003e]'
 'HTTP body contains [\\u003ctitle\\u003eAccess Denied\\u003c/title\\u003e]'
 'HTTP body contains [\\\\u003ch1\\\\u003eAccess To Website Blocked\\\\u003c/h1\\\\u003e]'
 'HTTP body contains [\\\\u003cimg src=\\""//cacheimg.casino.com/images/www/forbidden/forbidden-logo.png]'
 'HTTP body contains [\\\\u003ctitle\\\\u003eSucuri WebSite Firewall - Access Denied\\\\u003c/title\\\\u003e]'
 'HTTP body contains [The page is blocked due to the decision of the authorities in your area.]'
 'HTTP body contains [http://zapret-info.gov.ru/]' '[]'
 'HTTP body contains [http://eais.rkn.gov.ru/]'
 'HTTP body contains [The page you

In [28]:
iclab.loc[(iclab['censored_updated'] == True) & (iclab['country'] == 'US')].head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
4401,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,https://www.netflix.com/,False,,True,reserved,200.0,False,76249.0,{},,,True
4564,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://warc.jalb.de/,,,False,sameip,-1.0,False,0.0,{},True,ICMP admin prohibition,True
4884,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://bittornado.com/,,,False,sameip,200.0,False,13930.0,{},True,ICMP admin prohibition,True
5718,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://netflix.com/,False,,True,reserved,200.0,False,71719.0,{},,,True
5804,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://anonymouse.org/,,,False,sameip,200.0,False,3560.0,{},True,ICMP admin prohibition,True


In [6]:
censored_non_200_us = iclab.loc[(iclab['censored_updated'] == True) &
          (iclab['country'] == 'US') &
          (iclab['http_status'] != 200)
          & (iclab['http_status'] != -1)
         ].dropna(subset = ['http_status'])
censored_non_200_us.head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
6080,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://milanuncios.com/,,,-2,no_control_resp,456.0,True,755.0,HTTP body contains [\\u003ch1\\u003eAccess To ...,,,True
29273,baseline-2018-09-01T045208.307428.json.bz2,2018-09-01T04:52:08.236Z,US,12989.0,citizenlab-global,http://milanuncios.com/,,,-2,no_control_resp,456.0,True,755.0,HTTP body contains [\\u003ch1\\u003eAccess To ...,,,True
71991,baseline-2018-09-01T170123.077746.json.bz2,2018-09-01T17:01:23.023Z,US,12989.0,citizenlab-global,http://milanuncios.com/,,,-2,no_control_resp,456.0,True,755.0,HTTP body contains [\\u003ch1\\u003eAccess To ...,,,True
104315,baseline-2018-09-02T022742.763006.json.bz2,2018-09-02T02:27:42.683Z,US,12989.0,citizenlab-global,http://milanuncios.com/,,,-2,no_control_resp,456.0,True,755.0,HTTP body contains [\\u003ch1\\u003eAccess To ...,,,True
108607,baseline-2018-09-02T032809.990945.json.bz2,2018-09-02T03:28:09.926Z,US,198605.0,citizenlab-global,http://www.schwarzreport.org/,False,sameip,false,sameip,403.0,True,6863.0,HTTP body contains [\u003ctitle\u003eWeb Site ...,,,True


In [7]:
len(censored_non_200_us)

36

In [3]:
non_200_http = iclab.loc[(iclab['http_status'] != 200)].dropna(subset = ['http_status'])

In [4]:
non_200_us = non_200_http.loc[non_200_http['country'] == 'US'].url.unique()
non_200_us[:20]

array(['http://kinox.to/', 'http://movie4k.to/',
       'http://www.911truth.org/', 'http://adnetworkperformance.com/',
       'http://torah.org/', 'http://www.socom.mil/', 'http://ouo.io/',
       'http://www.suicidepreventionlifeline.org/',
       'http://www.democracycaucus.net/',
       'http://fatosdesconhecidos.com.br/', 'http://www.iccwomen.org/',
       'http://www.avert.org/', 'http://www.islamicity.com/',
       'http://www.cites.org/', 'http://amphetamines.com/',
       'http://www.serials.ws/', 'https://www.serials.ws/',
       'https://medpot.net/', 'http://www.hitler.org/',
       'http://www.bahai.org/'], dtype=object)

In [5]:
len(non_200_us)

2747

HTTP Status Code Info
- Informational responses (100–199)
- Successful responses (200–299)
- Redirects (300–399)
- Client errors (400–499)
- Server errors (500–599)

In [8]:
# types of error codes
iclab['http_status'].unique()

array([403.,  nan, 200.,  -1., 406., 401., 404., 500., 503., 400., 405.,
       521., 479., 429., 418., 409., 410., 504., 456., 544., 204., 451.,
       301., 523., 502., 412., 424., 530., 520., 999., 203., 408., 206.,
         0., 526., 508., 525., 477.])

In [10]:
gov_blocked = iclab.loc[(iclab['http_status'] == 451)]
gov_blocked.head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
15128,baseline-2018-09-01T012755.940830.json.bz2,2018-09-01T01:27:55.773Z,CZ,60068.0,citizenlab-global,http://lifebuzz.com/,False,,False,,451.0,False,339.0,{},,,False
26772,baseline-2018-09-01T042033.228432.json.bz2,2018-09-01T04:20:32.939Z,RU,43317.0,country-sensitive-ru,http://putinbog.wordpress.com/,False,,False,,451.0,True,1833.0,HTTP body contains [http://zapret-info.gov.ru/],,,True
26979,baseline-2018-09-01T042033.228432.json.bz2,2018-09-01T04:20:32.939Z,RU,43317.0,country-sensitive-ru,http://haamash.wordpress.com/,False,,False,,451.0,True,1833.0,HTTP body contains [http://zapret-info.gov.ru/],,,True
37206,baseline-2018-09-01T084135.726037.json.bz2,2018-09-01T08:41:35.582Z,BG,59564.0,citizenlab-global,http://lifebuzz.com/,False,,False,,451.0,False,339.0,{},,,False
56755,baseline-2018-09-01T135130.512703.json.bz2,2018-09-01T13:51:30.306Z,UA,59564.0,country-sensitive-ua,https://lotoru.com/,False,,False,,451.0,False,52559.0,{},,,False


In [14]:
unique_alpha2 = gov_blocked['country'].unique()
print([pycountry.countries.get(alpha_2=i).name for i in unique_alpha2])

['Czechia', 'Russian Federation', 'Bulgaria', 'Ukraine', 'Lithuania', 'Netherlands', 'Luxembourg', 'Spain', 'Hungary', 'Turkey', 'Sweden', 'Finland', 'Portugal', 'Denmark', 'Slovakia', 'Romania']


In [21]:
# find rows with status codes that are NOT 200, -1, nan, 0, 999
anomaly_status_codes_us = iclab.loc[#(iclab['censored_updated'] == True)
          (iclab['country'] == 'US')
          & (iclab['http_status'] != 200)
          & (iclab['http_status'] != -1)
          & (iclab['http_status'] != 0)
          & (iclab['http_status'] != 999)                          
         ].dropna(subset = ['http_status'])
anomaly_status_codes_us.head()

Unnamed: 0,filename,server_t,country,as_number,schedule_name,url,dns,dns_reason,dns_all,dns_reason_all,http_status,block,body_len,http_reason,packet_updated,packet_reason,censored_updated
0,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://kinox.to/,,,,,403.0,False,3094.0,{},,,False
1,baseline-2018-09-01T000154.371069.json.bz2,2018-09-01T00:01:54.354Z,US,1249.0,country-sensitive-at,http://movie4k.to/,,,,,403.0,False,3098.0,{},,,False
4358,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://www.911truth.org/,,,false,sameip,406.0,False,300.0,{},,,False
4361,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://adnetworkperformance.com/,,,-2,no_control_resp,403.0,False,568.0,{},,,False
4370,baseline-2018-09-01T000348.154035.json.bz2,2018-09-01T00:03:48.075Z,US,12989.0,citizenlab-global,http://torah.org/,,,false,sameip,403.0,False,3095.0,{},,,False


In [22]:
len(anomaly_status_codes_us)

37868

## Websites Blocked in All Countries

In [8]:
# check on websites blocked in every country
# use country, url, censored_updated columns
filtered_iclab = iclab[['country', 'url', 'censored_updated']]
filtered_iclab.head()

Unnamed: 0,country,url,censored_updated
0,US,http://kinox.to/,False
1,US,http://movie4k.to/,False
2,KR,4shared.com,False
3,KR,news.bbc.co.uk,False
4,KR,ngt.jinbo.net,False


In [9]:
filtered_iclab['domain'] = [tldextract.extract(x).domain for x in filtered_iclab['url']]
filtered_iclab['suffix'] = [tldextract.extract(x).suffix for x in filtered_iclab['url']]

filtered_iclab.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,country,url,censored_updated,domain,suffix
0,US,http://kinox.to/,False,kinox,to
1,US,http://movie4k.to/,False,movie4k,to
2,KR,4shared.com,False,4shared,com
3,KR,news.bbc.co.uk,False,bbc,co.uk
4,KR,ngt.jinbo.net,False,jinbo,net


In [6]:
filtered_iclab_censored = filtered_iclab.loc[filtered_iclab['censored_updated'] == True]

filtered_iclab_censored.head()

Unnamed: 0,country,url,censored_updated,domain,suffix
823,KR,http://redtube.com/,True,redtube,com
870,KR,http://warc.jalb.de/,True,jalb,de
985,KR,http://backpage.com/,True,backpage,com
997,KR,http://pridetube.com/,True,pridetube,com
1030,KR,http://xvideos.com/,True,xvideos,com


In [7]:
filtered_iclab_censored['combined_site'] = filtered_iclab_censored['domain'] + '.' + filtered_iclab_censored['suffix']
filtered_iclab_censored.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,country,url,censored_updated,domain,suffix,combined_site
823,KR,http://redtube.com/,True,redtube,com,redtube.com
870,KR,http://warc.jalb.de/,True,jalb,de,jalb.de
985,KR,http://backpage.com/,True,backpage,com,backpage.com
997,KR,http://pridetube.com/,True,pridetube,com,pridetube.com
1030,KR,http://xvideos.com/,True,xvideos,com,xvideos.com


In [16]:
censored_counted = filtered_iclab_censored.groupby(by = ['combined_site', 'country']).count()

censored_counted.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,url,censored_updated,domain,suffix
combined_site,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1004wing.com,KR,2,2,2,2
100india.com,IN,2,2,2,2
123music.mobi,IN,4,4,4,4
1337x.to,IN,1,1,1,1
17ok.com,AU,6,6,6,6


In [24]:
censored_counted_idx = censored_counted.reset_index()

In [25]:
censored_counted_idx.head()

Unnamed: 0,combined_site,country,url,censored_updated,domain,suffix
0,1004wing.com,KR,2,2,2,2
1,100india.com,IN,2,2,2,2
2,123music.mobi,IN,4,4,4,4
3,1337x.to,IN,1,1,1,1
4,17ok.com,AU,6,6,6,6


In [40]:
# Note: we can edit the condition, but NO sites are blocked by all countries

unique_combined_sites = censored_counted_idx['combined_site'].unique()
cc_blocking_country_margin = 10 # the margin via which a site is considered 'almost' blocked

count = 0
for site in unique_combined_sites:
    blocking_countries = censored_counted_idx.loc[censored_counted_idx['combined_site'] == site]['country'].unique()
    if abs(len(blocking_countries) - len(cc_unique)) < cc_blocking_country_margin:
        print(len(blocking_countries), len(cc_unique), set(cc_unique).difference(set(blocking_countries)))
        print(site)
        count += 1
        
print("Number of sites blocked by nearly all countries: " + str(count))

43 48 {'DZ', 'BZ', 'RO', 'ID', 'AT'}
17ok.com
42 48 {'DZ', 'CL', 'SC', 'RO', 'AT', 'CA'}
1905.com
42 48 {'DZ', 'CL', 'SC', 'RO', 'AT', 'CA'}
anonymouse.org
41 48 {'DZ', 'CL', 'SC', 'RO', 'ID', 'AT', 'CA'}
bittornado.com
40 48 {'DZ', 'RS', 'CL', 'BZ', 'SC', 'SG', 'RO', 'AT'}
globalvoicesonline.org
41 48 {'DZ', 'CL', 'BZ', 'SC', 'RO', 'AT', 'CA'}
hrcr.org
44 48 {'RO', 'DZ', 'SC', 'AT'}
jalb.de
41 48 {'CZ', 'DZ', 'BZ', 'SC', 'RO', 'PL', 'AT'}
schwarzreport.org
Number of sites blocked by nearly all countries: 8


## Filtering Sites Blocked in the US

In [10]:
# remove sites said to be blocked in the us, check how this affects the dataset

blocked_by_us = filtered_iclab.loc[(filtered_iclab['country'] == 'US') & (filtered_iclab['censored_updated'] == True)]

blocked_by_us.head()

Unnamed: 0,country,url,censored_updated,domain,suffix
4401,US,https://www.netflix.com/,True,netflix,com
4564,US,http://warc.jalb.de/,True,jalb,de
4884,US,http://bittornado.com/,True,bittornado,com
5718,US,http://netflix.com/,True,netflix,com
5804,US,http://anonymouse.org/,True,anonymouse,org


In [11]:
blocked_by_us['combined_site'] = blocked_by_us['domain'] + '.' + blocked_by_us['suffix']

blocked_by_us.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,country,url,censored_updated,domain,suffix,combined_site
4401,US,https://www.netflix.com/,True,netflix,com,netflix.com
4564,US,http://warc.jalb.de/,True,jalb,de,jalb.de
4884,US,http://bittornado.com/,True,bittornado,com,bittornado.com
5718,US,http://netflix.com/,True,netflix,com,netflix.com
5804,US,http://anonymouse.org/,True,anonymouse,org,anonymouse.org


In [12]:
combined_site_us = blocked_by_us['combined_site'].unique()

combined_site_us

array(['netflix.com', 'jalb.de', 'bittornado.com', 'anonymouse.org',
       'milanuncios.com', 'hrcr.org', 'globalvoicesonline.org',
       '17ok.com', '1905.com', 'schwarzreport.org', 'nato.int',
       'livedoor.com', 'yandex.ru', 'realstreaming.net'], dtype=object)

In [13]:
# remove the above blocked sites from the dataset -- how will this affect censored counts?

filtered_iclab['combined_site'] = filtered_iclab['domain'] + '.' + filtered_iclab['suffix']

filtered_iclab.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,country,url,censored_updated,domain,suffix,combined_site
0,US,http://kinox.to/,False,kinox,to,kinox.to
1,US,http://movie4k.to/,False,movie4k,to,movie4k.to
2,KR,4shared.com,False,4shared,com,4shared.com
3,KR,news.bbc.co.uk,False,bbc,co.uk,bbc.co.uk
4,KR,ngt.jinbo.net,False,jinbo,net,jinbo.net


In [14]:
drop_us_blocked = filtered_iclab.loc[np.logical_not(filtered_iclab['combined_site'].isin(combined_site_us))]

drop_us_blocked.head()

Unnamed: 0,country,url,censored_updated,domain,suffix,combined_site
0,US,http://kinox.to/,False,kinox,to,kinox.to
1,US,http://movie4k.to/,False,movie4k,to,movie4k.to
2,KR,4shared.com,False,4shared,com,4shared.com
3,KR,news.bbc.co.uk,False,bbc,co.uk,bbc.co.uk
4,KR,ngt.jinbo.net,False,jinbo,net,jinbo.net


In [35]:
num_sites_tested = len(drop_us_blocked['combined_site'].unique())

num_sites_tested

9706

In [16]:
filtered_drop_us_censored = drop_us_blocked.loc[drop_us_blocked['censored_updated'] == True]

filtered_drop_us_censored.head()

Unnamed: 0,country,url,censored_updated,domain,suffix,combined_site
823,KR,http://redtube.com/,True,redtube,com,redtube.com
985,KR,http://backpage.com/,True,backpage,com,backpage.com
997,KR,http://pridetube.com/,True,pridetube,com,pridetube.com
1030,KR,http://xvideos.com/,True,xvideos,com,xvideos.com
1089,KR,http://bongacams.com/,True,bongacams,com,bongacams.com


In [17]:
site_country_grouped = filtered_drop_us_censored.groupby(by = ['combined_site', 'country']).count()

site_country_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,url,censored_updated,domain,suffix
combined_site,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1004wing.com,KR,2,2,2,2
100india.com,IN,2,2,2,2
123music.mobi,IN,4,4,4,4
1337x.to,IN,1,1,1,1
22beats.com,IN,4,4,4,4


In [27]:
# example entry -- KR: {CN: 3, IN: 2}

unique_sites = site_country_grouped.index.get_level_values('combined_site')
country_blocked_map = {}

dropped_us_common_blocked = pd.DataFrame(columns = ['Country 1', 'Country 2', 'Count'])
for u in unique_sites:
    curr_xs = site_country_grouped.xs(u)
    countries_in_xs = curr_xs.index.unique()
    curr_country_pairs = itertools.permutations(countries_in_xs, r = 2)
    for c in curr_country_pairs:
        if c in country_blocked_map:
            country_blocked_map[c] += 1
        else:
            country_blocked_map[c] = 1

print(country_blocked_map)

for k, v in country_blocked_map.items():
    c1, c2 = [pycountry.countries.get(alpha_2 = k[0]).name, pycountry.countries.get(alpha_2 = k[1]).name]
    dropped_us_common_blocked = dropped_us_common_blocked.append({'Country 1': c1, 'Country 2': c2, 'Count': v}, ignore_index = True)
    
dropped_us_common_blocked.head()

{('CN', 'HK'): 2, ('HK', 'CN'): 2, ('KR', 'TR'): 32, ('TR', 'KR'): 32, ('CN', 'IN'): 10, ('CN', 'MY'): 10, ('CN', 'VN'): 10, ('CN', 'ZA'): 10, ('IN', 'CN'): 10, ('IN', 'MY'): 10, ('IN', 'VN'): 10, ('IN', 'ZA'): 10, ('MY', 'CN'): 10, ('MY', 'IN'): 10, ('MY', 'VN'): 10, ('MY', 'ZA'): 10, ('VN', 'CN'): 10, ('VN', 'IN'): 10, ('VN', 'MY'): 10, ('VN', 'ZA'): 10, ('ZA', 'CN'): 10, ('ZA', 'IN'): 10, ('ZA', 'MY'): 10, ('ZA', 'VN'): 10, ('JP', 'KR'): 2, ('KR', 'JP'): 2, ('BG', 'IN'): 4, ('BG', 'NL'): 4, ('BG', 'NZ'): 4, ('IN', 'BG'): 4, ('IN', 'NL'): 4, ('IN', 'NZ'): 4, ('NL', 'BG'): 4, ('NL', 'IN'): 4, ('NL', 'NZ'): 4, ('NZ', 'BG'): 4, ('NZ', 'IN'): 4, ('NZ', 'NL'): 4, ('HU', 'NZ'): 2, ('NZ', 'HU'): 2, ('CN', 'NL'): 2, ('NL', 'CN'): 2, ('IN', 'RU'): 3, ('IN', 'TR'): 3, ('RU', 'IN'): 3, ('RU', 'TR'): 3, ('TR', 'IN'): 3, ('TR', 'RU'): 3}


Unnamed: 0,Country 1,Country 2,Count
0,China,Hong Kong,2
1,Hong Kong,China,2
2,"Korea, Republic of",Turkey,32
3,Turkey,"Korea, Republic of",32
4,China,India,10


In [28]:
dropped_us_common_blocked.to_csv('dropped_us_blocked_intermediate_calculations.csv')

In [20]:
censored_count_drop_us = filtered_drop_us_censored.groupby(by = 'country').count()[['url']].reset_index()

censored_count_drop_us.head()

Unnamed: 0,country,url
0,AU,1
1,BG,9
2,CN,44
3,HK,1
4,HU,2


In [64]:
censored_count_drop_us['url'] = censored_count_drop_us['url'] / num_sites_tested

censored_count_drop_us

Unnamed: 0,country,url
0,AU,0.000103
1,BG,0.000927
2,CN,0.004533
3,HK,0.000103
4,HU,0.000206
5,IN,0.076345
6,JP,0.000309
7,KR,0.028745
8,MY,0.001236
9,NL,0.000721


In [29]:
dropped_us_common_blocked = pd.read_csv('dropped_us_blocked_intermediate_calculations.csv').drop(columns = ['Unnamed: 0'])

dropped_us_common_blocked.head()

Unnamed: 0,Country 1,Country 2,Count
0,China,Hong Kong,2
1,Hong Kong,China,2
2,"Korea, Republic of",Turkey,32
3,Turkey,"Korea, Republic of",32
4,China,India,10


In [41]:
cc_to_country = {}
for country in pycountry.countries:
    cc_to_country[country.alpha_2] = country.name

In [50]:
country_to_cc = {}
for country in pycountry.countries:
    country_to_cc[country.name] = country.alpha_3

In [31]:
for c1 in cc_unique:
    for c2 in cc_unique:
        if c1 == c2:
            continue
        
        c1_country = cc_to_country[c1]
        c2_country = cc_to_country[c2]
        
        in_dropped_us = dropped_us_common_blocked.loc[(dropped_us_common_blocked['Country 1'] == c1_country) & (dropped_us_common_blocked['Country 2'] == c2_country)]
        
        if not len(in_dropped_us):
            dropped_us_common_blocked = dropped_us_common_blocked.append({'Country 1': c1_country, 'Country 2': c2_country, 'Count': 0}, ignore_index = True)

dropped_us_common_blocked.head()

Unnamed: 0,Country 1,Country 2,Count
0,China,Hong Kong,2
1,Hong Kong,China,2
2,"Korea, Republic of",Turkey,32
3,Turkey,"Korea, Republic of",32
4,China,India,10


In [32]:
dropped_us_common_blocked.tail()

Unnamed: 0,Country 1,Country 2,Count
2251,Seychelles,Portugal,0
2252,Seychelles,Denmark,0
2253,Seychelles,Israel,0
2254,Seychelles,"Moldova, Republic of",0
2255,Seychelles,Austria,0


In [33]:
dropped_us_common_blocked.to_csv('dropped_us_blocked_intermediate_calculations.csv')

In [51]:
dropped_us_combined_similarities = {}

for _, r in dropped_us_common_blocked.iterrows():
    country1 = r['Country 1']
    country2 = r['Country 2']
    count = r['Count']
    
    dropped_us_combined_similarities["('" + country1 + "', '" + country_to_cc[country2] + "')"] = [{"similarity": count / num_sites_tested}]
    
dropped_us_combined_similarities

{"('China', 'HKG')": [{'similarity': 0.000206058108386565}],
 "('Hong Kong', 'CHN')": [{'similarity': 0.000206058108386565}],
 "('Korea, Republic of', 'TUR')": [{'similarity': 0.00329692973418504}],
 "('Turkey', 'KOR')": [{'similarity': 0.00329692973418504}],
 "('China', 'IND')": [{'similarity': 0.0010302905419328251}],
 "('China', 'MYS')": [{'similarity': 0.0010302905419328251}],
 "('China', 'VNM')": [{'similarity': 0.0010302905419328251}],
 "('China', 'ZAF')": [{'similarity': 0.0010302905419328251}],
 "('India', 'CHN')": [{'similarity': 0.0010302905419328251}],
 "('India', 'MYS')": [{'similarity': 0.0010302905419328251}],
 "('India', 'VNM')": [{'similarity': 0.0010302905419328251}],
 "('India', 'ZAF')": [{'similarity': 0.0010302905419328251}],
 "('Malaysia', 'CHN')": [{'similarity': 0.0010302905419328251}],
 "('Malaysia', 'IND')": [{'similarity': 0.0010302905419328251}],
 "('Malaysia', 'VNM')": [{'similarity': 0.0010302905419328251}],
 "('Malaysia', 'ZAF')": [{'similarity': 0.0010302

In [52]:
with open('dropped_us_combined_similarities.json', 'w', encoding='utf-8') as f:
    json.dump(dropped_us_combined_similarities, f, ensure_ascii=False, indent=4)