## Fetch GDELT Data

In [1]:
import requests
import lxml.html as lh
import os
import zipfile
import glob
import operator
import pandas as pd

### 1. Identify 2017 file links from the data source website

In [2]:
# Year prefix used to select files from the GDELT events index.
year_to_analyze = '2017'
# Base URL of the GDELT events listing; file names are appended to it.
gdelt_data_links = 'http://data.gdeltproject.org/events/'

# Fetch the index page. Fail loudly on an HTTP error instead of parsing an
# error page, and use a timeout so a stalled connection cannot hang the kernel.
response = requests.get(gdelt_data_links + 'index.html', timeout=60)
response.raise_for_status()
html_content = lh.fromstring(response.content)
# Collect the href of every anchor that sits inside a list item.
all_links = html_content.xpath("//*/ul/li/a/@href")

# Keep only the links whose name starts with the requested year.
file_list = [x for x in all_links if x.startswith(year_to_analyze)]
print('Total', year_to_analyze, 'files identified:', len(file_list))
print('Sample of file names:', file_list[:3])

Total 2017 files identified: 107
Sample of file names: ['20170417.export.CSV.zip', '20170416.export.CSV.zip', '20170415.export.CSV.zip']


### 2. Fetch data and filter it for specific countries

In [3]:
# Directory the notebook is running from; all data paths are built from it.
pwd = os.getcwd()
# Data directory, located one level above the working directory.
data_dir = '{}/../data/'.format(pwd)
# Code that must be present in a row for it to be kept.
us_code = 'US'
# Partner-country codes of interest; a kept row must contain at least one.
countries_interested_codes = {'UK', 'CA', 'CH', 'MX', 'IN'}


#### 2.a Download the 2017 zip files

In [4]:
# Directory that receives the downloaded archives; create it on first run.
compressed_dir = data_dir + 'compressed/'
os.makedirs(compressed_dir, exist_ok=True)

for file_name in file_list:
    file_to_save = compressed_dir + file_name
    # Avoid downloading if already exists
    if os.path.isfile(file_to_save):
        continue
    response = requests.get(gdelt_data_links + file_name, timeout=60)
    # Do not store an HTTP error page under a .zip name: the existence check
    # above would then skip the file on every later run.
    response.raise_for_status()
    # Write to a temporary name and rename afterwards, so an interrupted write
    # never leaves a truncated archive that looks complete.
    partial_file = file_to_save + '.part'
    with open(partial_file, 'wb') as f:
        f.write(response.content)
    os.replace(partial_file, file_to_save)
print('Compressed zip files are saved.')

#### 2.b Extract zip files and filter data for specific countries

In [5]:
# Directory that receives both the temporary extracts and the filtered files.
extracted_dir = data_dir + 'extracted/'
# 0-based positions, in each tab-separated row, of the three country-code
# fields that are inspected.
# NOTE(review): presumably the Actor1Geo / Actor2Geo / ActionGeo country-code
# columns of the GDELT event format - confirm against CSV.header.fieldids.xlsx.
country_code_columns = (37, 44, 51)
get_country_codes = operator.itemgetter(*country_code_columns)

for zip_name in glob.glob(data_dir + 'compressed/*.zip'):
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(file=zip_name, mode='r') as z:
        z.extractall(path=extracted_dir)

    # Assumes the archive holds a single member named like the archive without
    # its '.zip' suffix. The path is built from the base name so that a parent
    # directory containing the word 'compressed' is not rewritten.
    csv_file_name = extracted_dir + os.path.basename(zip_name)[:-4]
    # NOTE(review): for '<date>.export.CSV' this slice yields
    # '<date>.expfiltered.csv'. The name is kept unchanged so files written by
    # earlier runs are overwritten rather than duplicated.
    filtered_file_name = csv_file_name[:-7] + 'filtered.csv'
    try:
        # Explicit encoding so reading and writing do not depend on the
        # platform default.
        with open(csv_file_name, mode='r', encoding='utf-8') as extracted_file, \
                open(filtered_file_name, mode='w', encoding='utf-8') as filtered_file:
            for line in extracted_file:
                fields = line.split('\t')
                # Skip short or malformed rows instead of raising IndexError.
                if len(fields) <= max(country_code_columns):
                    continue
                countries_involved = get_country_codes(fields)
                # Keep rows that contain the US code and at least one of the
                # countries of interest.
                if (us_code in countries_involved
                        and not countries_interested_codes.isdisjoint(countries_involved)):
                    filtered_file.write(line)
    finally:
        # delete the temporary extracted file, even when filtering failed
        if os.path.exists(csv_file_name):
            os.remove(csv_file_name)

print('Filtered files are saved in dir:', data_dir + 'extracted/')

Filtered files are saved in dir: /Users/tanya/Documents/PARITOSH/pythonStuff/workingCopy/PythonDataAnalysis/final/analysis/../data/extracted/


#### 2.c Convert the filtered file contents to a DataFrame to allow flexible operations

In [6]:
# Get column headers from the helper file (under /extra dir).
# `sheet_name` and `usecols` replace the removed `sheetname` / `parse_cols`
# keywords; usecols=[0, 1] reads the first two columns, as parse_cols=1 did.
col_headers = pd.read_excel(data_dir + '../extra/CSV.header.fieldids.xlsx', sheet_name='Sheet1',
                            index_col='Column ID', usecols=[0, 1])['Field Name']

# Only pick up the filtered outputs, so a raw extract left behind by an
# interrupted run is not loaded. Sorting makes the row order of the combined
# frame independent of the file system's listing order.
filtered_csv_files = sorted(glob.glob(data_dir + 'extracted/*filtered.csv'))
if not filtered_csv_files:
    raise ValueError('No filtered files found in dir: ' + data_dir + 'extracted/')

# Every column is read as a string; GLOBALEVENTID becomes the index.
dfs = [
    pd.read_csv(current_file, sep='\t', header=None, dtype=str,
                names=col_headers.tolist(), index_col=['GLOBALEVENTID'])
    for current_file in filtered_csv_files
]

combined_df = pd.concat(dfs)
# Create the output directory on first run.
os.makedirs(data_dir + 'pickled/', exist_ok=True)
combined_df.to_pickle(data_dir + 'pickled/gdelt.pickle')
print('Combined pickled dataframe is saved at:', data_dir + 'pickled/gdelt.pickle')

Combined pickled dataframe is saved at: /Users/tanya/Documents/PARITOSH/pythonStuff/workingCopy/PythonDataAnalysis/final/analysis/../data/pickled/gdelt.pickle
