# Peak Dataset EDA

In [1]:
import sys
from pathlib import Path

import pandas as pd

In [2]:
# Add the parent directory to the Python path so `src.*` imports resolve.
# Guarded so that re-running this cell does not append duplicate entries.
_project_root = str(Path().resolve().parent)
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [3]:
# Explicit import: read_dbf is the helper from this module used by the cells below.
from src.utils.preprocessing import read_dbf

In [4]:
# Build the path to the raw peaks table relative to the parent directory,
# then load it through the project's DBF reader.
file_path = str(Path().resolve().parent.joinpath("data/raw/peaks.DBF"))
df = read_dbf(file_path)

In [5]:
# Inspect the dtype pandas assigned to each column of the loaded table
df.dtypes

peakid        object
pkname        object
pkname2       object
location      object
heightm        int64
heightf        int64
himal          int64
region         int64
open            bool
unlisted        bool
trekking        bool
trekyear      object
restrict      object
phost          int64
pstatus        int64
peakmemo      object
pyear         object
pseason        int64
pexpid        object
psmtdate      object
pcountry      object
psummiters    object
psmtnote      object
refermemo     object
photomemo     object
dtype: object

In [6]:
# Preview the first rows of the table
df.head()

Unnamed: 0,peakid,pkname,pkname2,location,heightm,heightf,himal,region,open,unlisted,...,peakmemo,pyear,pseason,pexpid,psmtdate,pcountry,psummiters,psmtnote,refermemo,photomemo
0,AMAD,Ama Dablam,Amai Dablang,Khumbu Himal,6814,22356,12,2,True,False,...,"Other map altitudes:\r\n 6814m - HMG-MT, HMG...",1961,1,AMAD61101,Mar 13,"New Zealand, USA, UK","Mike Gill, Wally Romanes, Barry Bishop, Michae...",,,W Face (High 126:5 May 1993)\r\nSE Face (High ...
1,AMPG,Amphu Gyabjen,Amphu Gyabien,Khumbu Himal (N of Ama Dablam),5630,18471,12,2,True,False,...,"Other map altitudes:\r\n 5630m - HMG-Finn, N...",1953,1,AMPG53101,Apr 11,UK,"John Hunt, Tom Bourdillon",,,
2,ANN1,Annapurna I,,Annapurna Himal,8091,26545,1,5,True,False,...,"Other map altitudes:\r\n 8091m - HMG-MT, HMG...",1950,1,ANN150101,Jun 03,France,"Maurice Herzog, Louis Lachenal",,Dyhrenfurth history 1950-1977 (MM 58:44-47 Nov...,S Face (High 122:3 Jan 1993) (Beghin accident)...
3,ANN2,Annapurna II,,Annapurna Himal,7937,26040,1,5,True,False,...,"Other map altitudes:\r\n 7937m - HMG-MT, HMG...",1960,1,ANN260101,May 17,"UK, Nepal","Richard Grant, Chris Bonington, Ang Nyima Sherpa",,Dyhrenfurth history 1960-1976 (MM 51:36-37 Sep...,N Face (MM 51:36 Sep 1976)
4,ANN3,Annapurna III,,Annapurna Himal,7555,24787,1,5,True,False,...,"Other map altitudes:\r\n 7555m - HMG-MT, HMG...",1961,1,ANN361101,May 06,India,"Mohan S. Kohli, Sonam Gyatso, Sonam Girmi",,,S Side (MM 125:11 Jan 1989)\r\nSW Face (MM 71:...


In [8]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()

Unnamed: 0,heightm,heightf,himal,region,phost,pstatus,pseason
count,490.0,490.0,490.0,490.0,490.0,490.0,490.0
mean,6645.828571,21803.914286,10.310204,3.767347,2.130612,1.763265,1.577551
std,568.992482,1866.77878,5.555603,2.144798,1.521247,0.425513,1.249583
min,5407.0,17740.0,1.0,1.0,1.0,1.0,0.0
25%,6235.0,20456.0,6.0,2.0,1.0,2.0,1.0
50%,6554.5,21504.0,11.0,3.0,1.0,2.0,1.0
75%,6892.0,22611.75,15.0,6.0,4.0,2.0,3.0
max,8849.0,29032.0,20.0,7.0,6.0,2.0,4.0


## Relationships
### Peaks & Locations

In [45]:
# Number of distinct locations recorded for each peak; print the lower and upper bound
locations_per_peak = df.groupby('peakid')['location'].nunique()
print(locations_per_peak.min())
print(locations_per_peak.max())

0
1


In [46]:
# Number of distinct peaks recorded for each location; print the lower and upper bound
peaks_per_location = df.groupby('location')['peakid'].nunique()
print(peaks_per_location.min())
print(peaks_per_location.max())

1
6


> A peak is associated with at most one location, and a location can have multiple peaks (one-to-many)

### Location & Mountain

In [47]:
# Number of distinct `himal` values for each location; print the lower and upper bound
himals_per_location = df.groupby('location')['himal'].nunique()
print(himals_per_location.min())
print(himals_per_location.max())

1
1


In [48]:
# Number of distinct locations for each `himal` value; print the lower and upper bound
locations_per_himal = df.groupby('himal')['location'].nunique()
print(locations_per_himal.min())
print(locations_per_himal.max())

6
64


> A location is associated with exactly one mountain, and a mountain can have multiple locations (one-to-many)

### Location & Region

In [50]:
# Number of distinct regions for each location; print the lower and upper bound
regions_per_location = df.groupby('location')['region'].nunique()
print(regions_per_location.min())
print(regions_per_location.max())

1
1


In [51]:
# Number of distinct locations for each region; print the lower and upper bound
locations_per_region = df.groupby('region')['location'].nunique()
print(locations_per_region.min())
print(locations_per_region.max())

35
113


> A location is associated with exactly one region, and a region can have multiple locations (one-to-many)

### Mountains & Regions

In [52]:
# Number of distinct regions for each `himal` value; print the lower and upper bound
regions_per_himal = df.groupby('himal')['region'].nunique()
print(regions_per_himal.min())
print(regions_per_himal.max())

1
2


In [53]:
# Number of distinct `himal` values for each region; print the lower and upper bound.
# Both bounds must be grouped by `region` so that this cell describes the
# region -> himal direction of the relationship.
himals_per_region = df.groupby('region')['himal'].nunique()
print(himals_per_region.min())
print(himals_per_region.max())

1
5


> There is a many-to-many relationship between mountains and regions

## Primary Key

In [55]:
# Primary-key checks on `peakid`: one distinct value per row, and no missing values
assert len(df) == df['peakid'].nunique()
assert not df['peakid'].isna().any()

> The _peakid_ variable fits the criteria for a primary key (uniqueness, not null)