In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Load the three source tables, using the first CSV column as the index on read.
issues = pd.read_csv('../data/issues.csv', index_col=0)
# For roll_calls, 'NA' and empty strings are explicitly listed as missing-value markers.
roll_calls = pd.read_csv('../data/roll_calls.csv', na_values=['NA', ''], index_col=0)
unvotes = pd.read_csv('../data/unvotes.csv', index_col=0)


def _index_to_column(frame):
    """Print the frame's index, then return the frame with that index moved back into a column."""
    print(frame.index)
    return frame.reset_index()


# Each printed index is named 'rcid'; afterwards it becomes an ordinary column again.
issues = _index_to_column(issues)
roll_calls = _index_to_column(roll_calls)
unvotes = _index_to_column(unvotes)

Index([  77, 9001, 9002, 9003, 9004, 9005, 9006,  128,  129,  130,
       ...
       9127, 9128, 9135, 9142, 9136, 9145, 9069, 9074, 9095, 9092],
      dtype='int64', name='rcid', length=5745)
Index([   3,    4,    5,    6,    7,    8,    9,   10,   11,   12,
       ...
       9138, 9139, 9140, 9141, 9142, 9143, 9144, 9145, 9146, 9147],
      dtype='int64', name='rcid', length=6202)
Index([   3,    3,    3,    3,    3,    3,    3,    3,    3,    3,
       ...
       9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101, 9101],
      dtype='int64', name='rcid', length=869937)


In [2]:
#data checks: issues

"""
rcid is a specific vote session id. 
this table maps the specific topic discussed to a specific vote session id.

note: 
- me: palestinian conflict
- nu: nuclear weapons & nuclear material
- di: arms control & disarmament
- hr: human rights
- co: colonialism
- ec: economic development 

"""

# Checks recorded from earlier runs (the calls themselves are left commented out):
#   issues.shape                                            -> (5745, 3)
#   issues.isna().sum()                                     -> no na values
#   issues.duplicated(subset=['rcid', 'short_name']).sum()  -> no duplicates
#   issues['short_name'].unique()                           -> ['me' 'nu' 'di' 'hr' 'co' 'ec']

# Short issue code -> readable label used in the rest of the notebook.
map_issues = {
    'me': 'Palestine',
    'nu': 'Nuclear',
    'di': 'Arms',
    'hr': 'Human Rights',
    'co': 'Colonialism',
    'ec': 'Economic Development',
}

# replace() leaves any value that is not a key of map_issues untouched, so running
# this line again on already-relabelled data changes nothing.
issues['short_name'] = issues['short_name'].replace(map_issues)

# errors='ignore' keeps the cell re-runnable: `issues` is overwritten in place, so on a
# second execution the 'issue' column is already gone and a plain drop() raises KeyError.
issues = issues.drop(columns=['issue'], errors='ignore')
print(issues.head())

   rcid short_name
0    77  Palestine
1  9001  Palestine
2  9002  Palestine
3  9003  Palestine
4  9004  Palestine


In [3]:
#data checks: unvotes

"""
this table describes how each country voted in each voting session. 
"""

# Checks recorded from earlier runs (the calls themselves are left commented out):
#   unvotes.shape         -> (869937, 4)
#   unvotes.isna().sum()  -> country code has 7898 missing values

# Rows without a country code, captured before any of the repairs below are applied.
missing_country = unvotes.loc[unvotes['country_code'].isna()]

# Country names whose code is overwritten unconditionally (whatever value was there before).
for legacy_country, legacy_code in (
    ('Czechoslovakia', 'CS'),
    ('Yugoslavia', 'YU'),
    ('German Democratic Republic', 'DD'),
    ("Yemen People's Republic", 'YD'),
):
    unvotes['country_code'] = np.where(
        unvotes['country'] == legacy_country,
        legacy_code,
        unvotes['country_code'],
    )

# Rename "German Federal Republic" rows; every other country value is kept as-is.
is_german_federal = (unvotes['country'] == "German Federal Republic") & unvotes['country'].notna()
unvotes['country'] = np.where(is_german_federal, "Federal Republic of Germany", unvotes['country'])

# Recorded earlier: missing_country['country'].unique()
#   -> ['Yemen Arab Republic' 'Zanzibar' 'Federal Republic of Germany' 'Namibia']
mapping = {
    'Yemen Arab Republic': 'YAR',
    'Zanzibar': 'ZAN',
    'Federal Republic of Germany': 'GER',
    'Namibia': 'NAM',
}

# Only codes that are still missing are filled; existing codes are never overwritten here.
fallback_codes = unvotes['country'].map(mapping)
unvotes['country_code'] = unvotes['country_code'].fillna(fallback_codes)
print(unvotes.head(1))

# Checks recorded after the repairs (calls left commented out):
#   unvotes.isna().sum()                                       -> now country code has no missing values
#   unvotes.duplicated(subset=['rcid', 'country_code']).sum()  -> no duplicates

   rcid        country country_code  vote
0     3  United States           US     1


  unvotes['vote'] = unvotes['vote'].replace({'yes': 1, 'no': 2, 'abstain': 3})


In [4]:
#data checks: roll_calls 

"""
this table describes the details of each vote: date, resolution number, descriptions of what the vote is about.
"""

# Checks recorded from earlier runs (the calls themselves are left commented out):
#   roll_calls.shape                              -> (6202, 9)
#   roll_calls.describe()                         -> 5598/6164 important votes (of those assessed);
#                                                    2868 amendment votes, 3208 paragraph votes
#   roll_calls.duplicated(subset=['rcid']).sum()  -> no duplicates

"""
missing values: 
importantvote     604
unres             159
amend            3334
para             2994
short             573
descr               1
"""

# Parse the '%Y-%m-%d' date strings into datetimes; importantvote, amend and para
# are left as int/float.
roll_calls['date'] = pd.to_datetime(roll_calls['date'], format='%Y-%m-%d')
vote_date = roll_calls['date']

# Votes dated 1946-01-01 through 1984-12-31 (both ends inclusive), oldest first.
in_pre_window = (vote_date >= '1946-01-01') & (vote_date <= '1984-12-31')
pre_1985 = roll_calls.loc[in_pre_window].sort_values(by='date')
# Recorded earlier: pre_1985.shape[0] -> 2844; all cleaned except for 151 missing entries for unres

""" 
missing values:
rcid               0
session            0
importantvote      0
date               0
unres            151
amend              0
para               0
short              0
descr              0
"""

# Votes dated 1985-01-01 through 2020-01-01 (both ends inclusive), oldest first,
# with the amend/para columns removed.
in_post_window = (vote_date >= '1985-01-01') & (vote_date <= '2020-01-01')
post_1985 = (
    roll_calls.loc[in_post_window]
    .sort_values(by='date')
    .drop(columns=['amend', 'para'])
)
# Recorded earlier: missing entries -> 604 importantvote, 8 unres, 573 short, 1 descr

"""
rcid               0
session            0
importantvote    604
date               0
unres              8
short            573
descr              1
"""

# Stack both periods on the columns they have in common (everything except amend/para).
shared_columns = ['rcid', 'session', 'importantvote', 'date', 'unres', 'short', 'descr']
pre_no_amend_para = pre_1985[shared_columns]
roll_calls_no_amend_para = pd.concat([pre_no_amend_para, post_1985])
# Recorded earlier: roll_calls_no_amend_para.shape -> (6202,7)
# Recorded earlier: missing entries -> 604 importantvote, 159 unres, 573 short, 1 descr

At present, we have the following tables: 

issues: --> what kinds of issues are present
- rcid(int): voting number
- short_name(str): type of issue 

unvotes: --> how each country voted on each vote
- rcid(int): voting number
- country(str): country name
- country_code(str): short name of country (2-3 characters)
- vote(str): [yes, no, abstain]

from the roll calls table we have 3 tables: pre_1985, post_1985, roll_calls_no_amend_para
- amend and para were deprecated from 1985. pre_1985 contains them, post_1985 does not. 
- roll_calls_no_amend_para is the roll_calls table without those 2 columns 

- rcid(int): voting number
- session(int): session number; one session can contain many votes
- importantvote(int): takes values 0, 1 or NA
- date(DateTime): date
- unres(str): voting code. has missing values. 
- amend(int/float): whether vote is on an amendment
- para(int/float): whether vote is on a paragraph
- short(str): short description
- descr(str): long description