In [31]:
import pandas as pd
import numpy as np
import os
import pickle
import csv

## The data folder has the following structure

1. gold-standard
2. metadata
    - matching
        - 01.pkl
        - 02.pkl
        .
        .
    - variables
3. policy
4. reference
5. texts
    - 01
        - file.pkl
        - file2.pkl
        .
        .
    - 02
    - 03
    ..
6. urls

### gold-standard

In [2]:
# Load the gold-standard CSV into a DataFrame and preview its first rows.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a DataFrame
# index that was written out with the CSV - confirm before relying on it.
filepath = 'data/gold-standard/gold_standard.csv'
gs = pd.read_csv(filepath)
gs.head()

Unnamed: 0,ids,month,class,title
0,2055,8,positive,News Alert! Maratha Reservation: Bicycles set ...
1,5909,8,positive,"Post-rains, plantation sector tots up losses"
2,2525,8,positive,Can't free Rajiv Gandhi's killers: Indian Govt...
3,3175,8,positive,Organic farming policy on the anvil
4,1164,8,positive,MP: 3 detained after 10 dead cows found in aba...


### metadata

#### matching

In [15]:
filepath = 'data/metadata/matching/01.pkl'
# Open the pickle inside a context manager so the file handle is closed as
# soon as loading finishes (a bare open() call leaves it open until it is
# garbage-collected).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(filepath, "rb") as f:
    matching = pickle.load(f)

In [16]:
print(type(matching))

<class 'dict'>


In [19]:
matching[153]

[566, 567, 568, 570, 571]

#### variables

In [22]:
# Load one of the variables tables (02.csv) and show only its first row.
# NOTE(review): the column names (GLOBALEVENTID, SQLDATE, Actor1Code, ...)
# suggest a GDELT event export - confirm the provenance.
filepath = 'data/metadata/variables/02.csv'
variables = pd.read_csv(filepath)
variables.head(1)

Unnamed: 0.2,Unnamed: 0,index,Unnamed: 0.1,GLOBALEVENTID,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,...,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_ADM2Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL,to_scrape,title
0,0,4,517,727140561,20180201,201802,2018,2018.0849,AGR,FARMER,...,IN,IN02,17568.0,18.0,79.5,-2112824,20180201003000,http://www.tribuneindia.com/news/comment/will-...,http://www.tribuneindia.com/news/comment/will-...,will govt ensure fair prices to farmers


In [24]:
variables.columns

Index(['Unnamed: 0', 'index', 'Unnamed: 0.1', 'GLOBALEVENTID', 'SQLDATE',
       'MonthYear', 'Year', 'FractionDate', 'Actor1Code', 'Actor1Name',
       'Actor1CountryCode', 'Actor1KnownGroupCode', 'Actor1EthnicCode',
       'Actor1Religion1Code', 'Actor1Religion2Code', 'Actor1Type1Code',
       'Actor1Type2Code', 'Actor1Type3Code', 'Actor2Code', 'Actor2Name',
       'Actor2CountryCode', 'Actor2KnownGroupCode', 'Actor2EthnicCode',
       'Actor2Religion1Code', 'Actor2Religion2Code', 'Actor2Type1Code',
       'Actor2Type2Code', 'Actor2Type3Code', 'IsRootEvent', 'EventCode',
       'CAMEOCodeDescription', 'EventBaseCode', 'EventRootCode', 'QuadClass',
       'GoldsteinScale', 'NumMentions', 'NumSources', 'NumArticles', 'AvgTone',
       'Actor1Geo_Type', 'Actor1Geo_FullName', 'Actor1Geo_CountryCode',
       'Actor1Geo_ADM1Code', 'Actor1Geo_ADM2Code', 'Actor1Geo_Lat',
       'Actor1Geo_Long', 'Actor1Geo_FeatureID', 'Actor2Geo_Type',
       'Actor2Geo_FullName', 'Actor2Geo_CountryCode', 'A

In [28]:
variables['Actor1Code'].value_counts()

IND       3862
GOV       3132
COP       1522
CVL        958
EDU        852
          ... 
PAKOPP       1
EURGOV       1
HLHLAB       1
FINBUS       1
MLT          1
Name: Actor1Code, Length: 472, dtype: int64

### Policy 

In [32]:
# Left for now

### reference

In [34]:
# Load the reference CSV (acled_india_2017.csv) and preview its first two rows.
filepath = 'data/reference/acled_india_2017.csv'
acled = pd.read_csv(filepath)
acled.head(2)

Unnamed: 0,data_id,iso,event_id_cnty,event_id_no_cnty,event_date,year,time_precision,event_type,sub_event_type,actor1,...,location,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,timestamp,iso3
0,5478505,356,IND25478,25478,31 December 2017,2017,1,Protests,Peaceful protest,Protesters (India),...,Agartala,23.833,91.2864,1,Assam Tribune,Subnational,CPI(M) staged an election rally in Agartala on...,0,1561474143,IND
1,5478506,356,IND25479,25479,31 December 2017,2017,1,Protests,Peaceful protest,Protesters (India),...,Bhubaneswar,20.2724,85.8338,1,Times of India,National,"On December 31, in Bhubaneswar, the youth and ...",0,1561474143,IND


In [35]:
acled.columns

Index(['data_id', 'iso', 'event_id_cnty', 'event_id_no_cnty', 'event_date',
       'year', 'time_precision', 'event_type', 'sub_event_type', 'actor1',
       'assoc_actor_1', 'inter1', 'actor2', 'assoc_actor_2', 'inter2',
       'interaction', 'region', 'country', 'admin1', 'admin2', 'admin3',
       'location', 'latitude', 'longitude', 'geo_precision', 'source',
       'source_scale', 'notes', 'fatalities', 'timestamp', 'iso3'],
      dtype='object')

In [42]:
acled['notes'][3]

'25 villagers of Gohri Mafi situated on the outskirts of Rajaji Tiger Reserve in Dehradun district, Uttarakhand launched an ongoing sit-in protest assumed on Dec 31, demanding a flood control wall to protect their village from the Song river.'

### texts 

In [66]:
# The texts folder contains all the news articles, one pickled object per file

In [47]:
from newsplease import NewsPlease

In [48]:
filepath = 'data/texts/08/02733.pkl'
# Open the pickle inside a context manager so the file handle is closed as
# soon as loading finishes (a bare open() call leaves it open until it is
# garbage-collected).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(filepath, 'rb') as f:
    text = pickle.load(f)

In [65]:
text.get_dict

<bound method NewsArticle.get_dict of <NewsArticle.NewsArticle object at 0x1288f5f60>>

In [63]:
# Print every inspected attribute of the loaded article as "<name> -  <value>".
# The attributes end, file, flush and sep are deliberately not printed.
# get_dict and get_serializable_dict are methods and are printed uncalled, so
# their bound-method repr is shown rather than the value they would return.
attr_names = [
    'authors',
    'date_download',
    'date_modify',
    'date_publish',
    'description',
    'filename',
    'get_dict',
    'get_serializable_dict',
    'image_url',
    'language',
    'localpath',
    'source_domain',
    'text',
    'title',
    'title_page',
    'title_rss',
    'url',
]
for attr_name in attr_names:
    # Attributes are looked up one at a time, in the listed order.
    print(attr_name + ' - ', getattr(text, attr_name))

authors -  []
date_download -  2019-08-30 11:35:29
date_modify -  None
date_publish -  2018-08-11 18:24:06
description -  Read more about Ericsson opens technical training skill centre for youth in UP on Business Standard. Aiming to train the youth in mobile hardware repair and technical training in computers and peripherals, Swedish telecommunications company Ericsson on Saturday opened a skill centre in partnership with Electronics Sector Skills Council of India
filename -  https%3A%2F%2Fwww.business-standard.com%2Farticle%2Fnews-ians%2Fericsson-opens-technical-training-skill-centre-for-youth-in-up-118081100728_1.html.json
get_dict -  <bound method NewsArticle.get_dict of <NewsArticle.NewsArticle object at 0x1288f5f60>>
get_serializable_dict -  <bound method NewsArticle.get_serializable_dict of <NewsArticle.NewsArticle object at 0x1288f5f60>>
image_url -  https://bsmedia.business-standard.com/_media/bs/img/article/default/1180811/full-118081100728.jpg
language -  en
localpath -  None

In [56]:
# One way to get the articles corresponding to the gold-standard entries:

def load_obj(month, idx, data_dir="data"):
    """Load one pickled object from ``<data_dir>/texts/<MM>/<NNNNN>.pkl``.

    Parameters
    ----------
    month : int or str
        Month number; converted to a string and zero-padded to two digits to
        select the sub-folder.
    idx : int or str
        Article id; converted to a string and zero-padded to five digits to
        build the file name.
    data_dir : str or path-like, optional
        Root of the data folder. Defaults to ``"data"`` (relative to the
        current working directory), which is the location used when the
        argument is omitted.

    Returns
    -------
    object
        Whatever object is stored in the pickle file.

    Raises
    ------
    FileNotFoundError
        If no file exists for the given month and id.
    """
    month = str(month).zfill(2)
    idx = str(idx).zfill(5)
    path = "{}/texts/{}/{}.pkl".format(data_dir, month, idx)
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # NOTE(review): the stored objects appear to be news-please articles, so
    # that package presumably has to be importable when unpickling - confirm.
    with open(path, "rb") as f:
        return pickle.load(f)
    
def load_dict(month, data_dir="data"):
    """Load the pickled object at ``<data_dir>/metadata/matching/<MM>.pkl``.

    Parameters
    ----------
    month : int or str
        Month number; converted to a string and zero-padded to two digits to
        build the file name.
    data_dir : str or path-like, optional
        Root of the data folder. Defaults to ``"data"`` (relative to the
        current working directory), which is the location used when the
        argument is omitted.

    Returns
    -------
    object
        Whatever object is stored in the pickle file.

    Raises
    ------
    FileNotFoundError
        If no file exists for the given month.
    """
    month = str(month).zfill(2)
    path = "{}/metadata/matching/{}.pkl".format(data_dir, month)
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, "rb") as f:
        return pickle.load(f)
    
# Read the gold-standard table and load the pickled article for every row.
gs = pd.read_csv('data/gold-standard/gold_standard.csv')

# Maps the row position in `gs` (0, 1, 2, ...) to the object returned by
# load_obj for that row's month and id.
gs_articles = {}

# Walk the 'month' and 'ids' columns in parallel; `i` is the row position.
for i, (gs_month, gs_id) in enumerate(zip(gs['month'], gs['ids'])):
    article = load_obj(gs_month, gs_id)
    gs_articles[i] = article

In [57]:
# key   - the row position in `gs` (0 .. len(gs) - 1) used when the dict was
#         filled; it is NOT the article id from gs['ids'].
# value - the object load_obj returned for that row (unused in this loop).
# Prints one key per line.
for key, value in gs_articles.items():
    print(key)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
