In [1]:
import pandas as pd
from utils import *
from mappings import *

In [2]:
# Load the raw expedition table from the DBF export under ../data/raw.
# read_dbf is a project helper brought in by the wildcard imports above.
df = read_dbf('../data/raw/exped.DBF')

In [3]:
# Preview the first rows of the table exactly as loaded.
df.head()

Unnamed: 0,expid,peakid,year,season,host,route1,route2,route3,route4,nation,...,accidents,achievment,agency,comrte,stdrte,primrte,primmem,primref,primid,chksum
0,ANN260101,ANN2,1960,1,1,NW Ridge-W Ridge,,,,UK,...,,,,,,False,False,,,2442047
1,ANN269301,ANN2,1969,3,1,NW Ridge-W Ridge,,,,Yugoslavia,...,Draslar frostbitten hands and feet,,,,,False,False,,,2445501
2,ANN273101,ANN2,1973,1,1,W Ridge-N Face,,,,Japan,...,,,,,,False,False,,,2446797
3,ANN278301,ANN2,1978,3,1,N Face-W Ridge,,,,UK,...,,,,,,False,False,,,2448822
4,ANN279301,ANN2,1979,3,1,N Face-W Ridge,NW Ridge of A-IV,,,UK,...,,,,,,False,False,,,2449204


In [4]:
# Build the primary key by appending the year to expid,
# e.g. ANN260101 + 1960 -> ANN260101_1960.
# str.cat concatenates strings element-wise, so `year` must hold strings here.
df['expid'] = df['expid'].str.cat(df['year'], sep='_')

In [5]:
# Primary-key requirements: every row has a key and no key occurs twice.
assert not df['expid'].isna().any()
assert len(df) == df['expid'].nunique()

In [6]:
# Preview again: expid now carries the "_<year>" suffix.
df.head()

Unnamed: 0,expid,peakid,year,season,host,route1,route2,route3,route4,nation,...,accidents,achievment,agency,comrte,stdrte,primrte,primmem,primref,primid,chksum
0,ANN260101_1960,ANN2,1960,1,1,NW Ridge-W Ridge,,,,UK,...,,,,,,False,False,,,2442047
1,ANN269301_1969,ANN2,1969,3,1,NW Ridge-W Ridge,,,,Yugoslavia,...,Draslar frostbitten hands and feet,,,,,False,False,,,2445501
2,ANN273101_1973,ANN2,1973,1,1,W Ridge-N Face,,,,Japan,...,,,,,,False,False,,,2446797
3,ANN278301_1978,ANN2,1978,3,1,N Face-W Ridge,,,,UK,...,,,,,,False,False,,,2448822
4,ANN279301_1979,ANN2,1979,3,1,N Face-W Ridge,NW Ridge of A-IV,,,UK,...,,,,,,False,False,,,2449204


### Expedition Termination

In [7]:
# Lookup table built from exped_termination_map:
# the dict keys end up in the `id` column, the values in the `name` column.
df_exped_termination = (
    pd.DataFrame.from_dict(exped_termination_map, orient='index', columns=['name'])
    .rename_axis(index='id')
    .reset_index()
)

In [8]:
# Preview the termination lookup (columns: id, name).
df_exped_termination.head()

Unnamed: 0,id,name
0,0,Unknown
1,1,Success (main peak)
2,2,"Success (subpeak, foresummit)"
3,3,Success (claimed)
4,4,"Bad weather (storms, high winds)"


### Expedition Nations

In [9]:
# One row per (expedition, nation): split the '/'-separated `nation` string
# into a list and explode it into scalar rows.
#
# Written as a single chain so that nothing is assigned through .loc on a
# column selection of df -- that pattern is what triggers pandas'
# SettingWithCopyWarning and leaves it unclear which object gets modified.
# Rows with a missing nation stay as a single row with a missing value.
df_exped_nations = (
    df[['expid', 'nation']]
    .assign(nation=lambda frame: frame['nation'].str.split('/'))
    .explode('nation')
    # Trim stray whitespace around each name, matching the clean-up applied
    # to the countries and leaders columns further down.
    .assign(nation=lambda frame: frame['nation'].str.strip())
)

In [10]:
# Replace nation names with country ids.
# update_country_list is a project helper (wildcard-imported above); judging by
# the preview that follows, the returned frame has a `nation_id` column in
# place of `nation`.
# NOTE(review): the helper's name suggests it may also maintain a shared
# country list as a side effect -- confirm against its definition.
df_exped_nations = update_country_list(df_exped_nations, 'nation')

In [11]:
# Preview the expedition/nation link rows (columns: expid, nation_id).
df_exped_nations.head()

Unnamed: 0,expid,nation_id
0,ANN260101_1960,3
1,ANN269301_1969,4
2,ANN273101_1973,5
3,ANN278301_1978,3
4,ANN279301_1979,3


In [12]:
# The nation values were copied into df_exped_nations above, so the column is
# removed from the main frame. Rebinding the result (instead of inplace=True)
# avoids mutating the frame object in place and keeps the step chainable.
df = df.drop(columns='nation')

### Expedition Hosts

In [13]:
# Step 1: run the raw host codes through host_map (apply_map and host_map are
# project names from the wildcard imports).
df['host'] = apply_map(df['host'], host_map)

# Step 2: pass the mapped column through update_country_list. In the preview
# that follows, `host` no longer appears and a `host_id` column sits at the
# end of the frame.
df = update_country_list(df, 'host')

In [15]:
# Preview: `nation` and `host` are gone; `host_id` is now the last column.
df.head()

Unnamed: 0,expid,peakid,year,season,route1,route2,route3,route4,leaders,sponsor,...,achievment,agency,comrte,stdrte,primrte,primmem,primref,primid,chksum,host_id
0,ANN260101_1960,ANN2,1960,1,NW Ridge-W Ridge,,,,J. O. M. Roberts,,...,,,,,False,False,,,2442047,0
1,ANN269301_1969,ANN2,1969,3,NW Ridge-W Ridge,,,,Ales Kunaver,Mountaineering Club of Slovenia,...,,,,,False,False,,,2445501,0
2,ANN273101_1973,ANN2,1973,1,W Ridge-N Face,,,,Yukio Shimamura,Sangaku Doshikai Annapurna II Expedition 1973,...,,,,,False,False,,,2446797,0
3,ANN278301_1978,ANN2,1978,3,N Face-W Ridge,,,,Richard J. Isherwood,British Annapurna II Expedition,...,,,,,False,False,,,2448822,0
4,ANN279301_1979,ANN2,1979,3,N Face-W Ridge,NW Ridge of A-IV,,,Paul Moores,,...,,,,,False,False,,,2449204,0


### Expedition Countries

In [16]:
# One row per (expedition, country). The raw `countries` field mixes two
# separators, so it is split and exploded twice: first on ',' and then on '/'.
#
# Written as a single chain of assign/explode steps so that nothing is
# assigned through .loc on a selection of df (the SettingWithCopyWarning
# pattern), and so that the second split does not depend on .loc assignment
# into a frame whose index was already duplicated by the first explode.
df_exped_countries = (
    df[['expid', 'countries']]
    # keep only rows where both the key and the countries string are present
    .dropna()
    # split comma-separated values into scalar values
    .assign(countries=lambda frame: frame['countries'].str.split(','))
    .explode('countries')
    # split slash-separated values into scalar values
    .assign(countries=lambda frame: frame['countries'].str.split('/'))
    .explode('countries')
    # clean up individual country names
    .assign(countries=lambda frame: frame['countries'].str.strip())
)

In [17]:
# Pass the exploded country names through the project helper
# update_country_list, the same call used for the `nation` and `host` columns.
# NOTE(review): presumably this swaps `countries` for an id column as it does
# for `nation` -- no output is shown for this cell, so confirm.
df_exped_countries = update_country_list(df_exped_countries, 'countries')

In [18]:
# The countries values were copied into df_exped_countries above, so the
# column is removed from the main frame. Rebinding the result (instead of
# inplace=True) avoids mutating the frame object in place.
df = df.drop(columns='countries')

### Leaders

In [19]:
# One row per (expedition, leader): split the comma-separated `leaders`
# string, explode it into scalar rows and trim whitespace around each name.
#
# Written as a single chain so that nothing is assigned through .loc on a
# column selection of df (the pattern that triggers SettingWithCopyWarning).
# Rows with no leaders are kept as a single row with a missing value; unlike
# the countries table, no dropna() is applied here.
df_exped_leaders = (
    df[['expid', 'leaders']]
    .assign(leaders=lambda frame: frame['leaders'].str.split(','))
    .explode('leaders')
    .assign(leaders=lambda frame: frame['leaders'].str.strip())
)

In [20]:
# The leader names were copied into df_exped_leaders above, so the column is
# removed from the main frame. Rebinding the result (instead of inplace=True)
# avoids mutating the frame object in place.
df = df.drop(columns='leaders')

In [21]:
# Preview the expedition/leader link rows (columns: expid, leaders).
df_exped_leaders.head()

Unnamed: 0,expid,leaders
0,ANN260101_1960,J. O. M. Roberts
1,ANN269301_1969,Ales Kunaver
2,ANN273101_1973,Yukio Shimamura
3,ANN278301_1978,Richard J. Isherwood
4,ANN279301_1979,Paul Moores


### Routes

In [22]:
# Route slot 1: keep rows where at least one of route1 / success1 / ascent1 is
# present, strip the slot suffix from the column names and tag the rows with
# number = 1.
# NOTE(review): if success1 is a plain boolean column it is never missing, in
# which case this filter keeps every expedition -- confirm the dtype.
df_route_1 = (
    df.loc[
        df[['route1', 'success1', 'ascent1']].notna().any(axis=1),
        ['expid', 'route1', 'success1', 'ascent1'],
    ]
    .rename(columns={'route1': 'route', 'success1': 'success', 'ascent1': 'ascent'})
    .assign(number=1)
)

In [23]:
# Route slot 2: keep rows where at least one of route2 / success2 / ascent2 is
# present, strip the slot suffix from the column names and tag the rows with
# number = 2.
# NOTE(review): if success2 is a plain boolean column it is never missing, in
# which case this filter keeps every expedition, including those without a
# second route -- confirm the dtype.
df_route_2 = (
    df.loc[
        df[['route2', 'success2', 'ascent2']].notna().any(axis=1),
        ['expid', 'route2', 'success2', 'ascent2'],
    ]
    .rename(columns={'route2': 'route', 'success2': 'success', 'ascent2': 'ascent'})
    .assign(number=2)
)

In [24]:
# Route slot 3: keep rows where at least one of route3 / success3 / ascent3 is
# present, strip the slot suffix from the column names and tag the rows with
# number = 3.
# NOTE(review): if success3 is a plain boolean column it is never missing, in
# which case this filter keeps every expedition, including those without a
# third route -- confirm the dtype.
df_route_3 = (
    df.loc[
        df[['route3', 'success3', 'ascent3']].notna().any(axis=1),
        ['expid', 'route3', 'success3', 'ascent3'],
    ]
    .rename(columns={'route3': 'route', 'success3': 'success', 'ascent3': 'ascent'})
    .assign(number=3)
)

In [25]:
# Route slot 4: keep rows where at least one of route4 / success4 / ascent4 is
# present, strip the slot suffix from the column names and tag the rows with
# number = 4.
# NOTE(review): if success4 is a plain boolean column it is never missing, in
# which case this filter keeps every expedition, including those without a
# fourth route -- confirm the dtype.
df_route_4 = (
    df.loc[
        df[['route4', 'success4', 'ascent4']].notna().any(axis=1),
        ['expid', 'route4', 'success4', 'ascent4'],
    ]
    .rename(columns={'route4': 'route', 'success4': 'success', 'ascent4': 'ascent'})
    .assign(number=4)
)

In [26]:
# Stack the four per-slot frames into one long routes table; the original row
# labels are discarded in favour of a fresh 0..n-1 index.
df_routes = pd.concat(
    (df_route_1, df_route_2, df_route_3, df_route_4),
    axis=0,
    ignore_index=True,
)

In [27]:
# Preview the long routes table (columns: expid, route, success, ascent, number).
df_routes.head()

Unnamed: 0,expid,route,success,ascent,number
0,ANN260101_1960,NW Ridge-W Ridge,True,1st,1
1,ANN269301_1969,NW Ridge-W Ridge,True,2nd,1
2,ANN273101_1973,W Ridge-N Face,True,3rd,1
3,ANN278301_1978,N Face-W Ridge,False,,1
4,ANN279301_1979,N Face-W Ridge,False,,1


In [28]:
# The per-slot route / success / ascent columns were copied into df_routes
# above, so all twelve (slots 1-4 x three fields) are removed from the main
# frame. The names are generated rather than typed out to rule out a typo in
# any one of them, and the result is rebound instead of using inplace=True.
df = df.drop(columns=[
    f'{field}{slot}'
    for slot in range(1, 5)
    for field in ('route', 'success', 'ascent')
])

In [29]:
# Preview: the route / success / ascent columns no longer appear in df.
df.head()

Unnamed: 0,expid,peakid,year,season,sponsor,claimed,disputed,approach,bcdate,smtdate,...,achievment,agency,comrte,stdrte,primrte,primmem,primref,primid,chksum,host_id
0,ANN260101_1960,ANN2,1960,1,,False,False,Marshyangdi->Hongde->Sabje Khola,1960-03-15,1960-05-17,...,,,,,False,False,,,2442047,0
1,ANN269301_1969,ANN2,1969,3,Mountaineering Club of Slovenia,False,False,Marshyangdi->Hongde->Sabje Khola,1969-09-25,1969-10-22,...,,,,,False,False,,,2445501,0
2,ANN273101_1973,ANN2,1973,1,Sangaku Doshikai Annapurna II Expedition 1973,False,False,Marshyangdi->Pisang->Salatang Khola,1973-03-16,1973-05-06,...,,,,,False,False,,,2446797,0
3,ANN278301_1978,ANN2,1978,3,British Annapurna II Expedition,False,False,Marshyangdi->Pisang->Salatang Khola,1978-09-08,1978-10-02,...,,,,,False,False,,,2448822,0
4,ANN279301_1979,ANN2,1979,3,,False,False,Pokhara->Marshyangdi->Pisang->Sabje Khola,,1979-10-18,...,,,,,False,False,,,2449204,0


In [30]:
# Remove the day-count, member/hired-count and death-count columns together
# with `season`.
# NOTE(review): presumably these are dropped because they can be derived from
# other tables or columns -- the notebook does not say; confirm.
# The result is rebound instead of using inplace=True so the frame object is
# not mutated in place.
df = df.drop(columns=[
    'smtdays', 'totdays',
    'totmembers', 'smtmembers', 'mdeaths',
    'tothired', 'smthired', 'hdeaths', 'nohired',
    'season',
])

In [31]:
# Preview: `season` no longer appears in df.
df.head()

Unnamed: 0,expid,peakid,year,sponsor,claimed,disputed,approach,bcdate,smtdate,smttime,...,achievment,agency,comrte,stdrte,primrte,primmem,primref,primid,chksum,host_id
0,ANN260101_1960,ANN2,1960,,False,False,Marshyangdi->Hongde->Sabje Khola,1960-03-15,1960-05-17,1530.0,...,,,,,False,False,,,2442047,0
1,ANN269301_1969,ANN2,1969,Mountaineering Club of Slovenia,False,False,Marshyangdi->Hongde->Sabje Khola,1969-09-25,1969-10-22,1800.0,...,,,,,False,False,,,2445501,0
2,ANN273101_1973,ANN2,1973,Sangaku Doshikai Annapurna II Expedition 1973,False,False,Marshyangdi->Pisang->Salatang Khola,1973-03-16,1973-05-06,2030.0,...,,,,,False,False,,,2446797,0
3,ANN278301_1978,ANN2,1978,British Annapurna II Expedition,False,False,Marshyangdi->Pisang->Salatang Khola,1978-09-08,1978-10-02,,...,,,,,False,False,,,2448822,0
4,ANN279301_1979,ANN2,1979,,False,False,Pokhara->Marshyangdi->Pisang->Sabje Khola,,1979-10-18,,...,,,,,False,False,,,2449204,0
