In [2]:
from pathlib import Path

import pandas as pd

In [6]:
# Directory that holds the course CSV files, resolved against the current working directory
data_dir = Path.cwd().joinpath("course_materials", "data")

### States Dataset

In [12]:
# Read the American states CSV into a DataFrame and preview the first five rows
states_df = pd.read_csv(data_dir.joinpath("states.csv"))

states_df.head(n=5)

Unnamed: 0,State,Abbrev,Code
0,Alabama,Ala.,AL
1,Alaska,Alaska,AK
2,Arizona,Ariz.,AZ
3,Arkansas,Ark.,AR
4,California,Calif.,CA


### King County Housing Dataset

This dataset contains home sales from 2014-2015 in King County, WA (the county in which Seattle is located). Its columns are:
* `id` - house's unique id
* `date` - sale date
* `price` - sale price
* `bedrooms` - number of bedrooms
* `bathrooms` - number of bathrooms
* `sqft_living` - living space square footage 
* `sqft_lot` - total lot square footage
* `floors` - number of floors
* `waterfront` - is the house waterfront (1) or not (0)
* `view` - rating from 0 to 4 of how good the view from the house is
* `condition` - rating from 1 (poor) to 5 (very good) of the condition of the house
* `grade` - rating from 1 to 13 representing the construction quality of improvements: 1-3 fall short of minimum building standards (cabins, etc.), 7 is the average grade, and 11-13 have high-quality design and construction
* `sqft_above` - square footage of the interior that is above ground level
* `sqft_basement` - square footage of the interior that is below ground level
* `yr_built` - year the house was initially built
* `yr_renovated` - year of the house's last renovation (if any)
* `zipcode` - zipcode that the house is located in
* `lat` - the property's latitude
* `long` - the property's longitude
* `sqft_living15` - average interior space square footage of the nearest 15 neighbors
* `sqft_lot15` - average lot square footage of the nearest 15 neighbors

In [14]:
# Read the King County house sales CSV into a DataFrame and preview the first five rows
kc_houses_df = pd.read_csv(data_dir.joinpath("kc_house_data.csv"))

kc_houses_df.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [15]:
# Check the type of the object returned by the load step
type(kc_houses_df)

pandas.core.frame.DataFrame

In [16]:
# Column labels of the housing DataFrame
kc_houses_df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [17]:
# Row index of the housing DataFrame
kc_houses_df.index

RangeIndex(start=0, stop=21613, step=1)

In [18]:
# Dimensions as a (rows, columns) tuple
kc_houses_df.shape

(21613, 21)

In [19]:
# Total number of cells, i.e. rows * columns
kc_houses_df.size

453873

In [22]:
# Per-column summary: non-null counts and dtypes
kc_houses_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [24]:
# Preview the first five rows of the housing data again
kc_houses_df.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


### Titanic Passenger Dataset

* `pclass` - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
* `survived` - Survival (0 = No; 1 = Yes)
* `name` - Name
* `sex` - Sex
* `age` - Age
* `sibsp` - Number of Siblings/Spouses Aboard
* `parch` - Number of Parents/Children Aboard
* `ticket` - Ticket Number
* `fare` - Passenger Fare
* `cabin` - Cabin
* `embarked` - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
* `boat` - Lifeboat (if survived)
* `body` - Body number (if did not survive and body was recovered)
* `home.dest` - Home/Destination

In [31]:
# Read the Titanic passenger CSV into a DataFrame and preview the first five rows.
# NOTE(review): the preview shows "?" in `boat` and `body` - this looks like a
# missing-value placeholder in the file; confirm before analysing those columns.
titanic_df = pd.read_csv(data_dir.joinpath("titanic.csv"))

titanic_df.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,?,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,?,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,?,135,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,?,?,"Montreal, PQ / Chesterville, ON"


In [27]:
# Column labels of the Titanic DataFrame
titanic_df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

### Netflix Dataset

In [36]:
# Read the Netflix titles file: it is pipe-delimited, and its first column
# is used as the row index rather than as a data column
netflix_df = pd.read_csv(
    data_dir.joinpath("netflix_titles.csv"),
    index_col=0,
    sep="|",
)

netflix_df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [35]:
# Per-column summary (non-null counts, dtypes) plus memory usage
netflix_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 894.5+ KB


### Country Population Dataset

In [38]:
# Read the population estimates file (nst-est2020.csv) into a DataFrame
# and preview the first five rows
population_df = pd.read_csv(data_dir.joinpath("nst-est2020.csv"))

population_df.head(n=5)

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,NAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE042020,POPESTIMATE2020
0,10,0,0,0,United States,308745538,308758105,309327143,311583481,313877662,316059947,318386329,320738994,323071755,325122128,326838199,328329953,329398742,329484123
1,20,1,0,0,Northeast Region,55317240,55318414,55380764,55608318,55782661,55912775,56021339,56052790,56063777,56083383,56084543,56002934,55924275,55849869
2,20,2,0,0,Midwest Region,66927001,66929737,66975328,67164092,67348275,67576524,67765576,67885682,68018175,68160342,68263019,68340091,68357895,68316744
3,20,3,0,0,South Region,114555744,114563042,114869421,116019483,117264196,118397213,119666248,121049223,122419547,123611036,124649156,125686544,126494232,126662754
4,20,4,0,0,West Region,71945553,71946912,72101630,72791588,73482530,74173435,74933166,75751299,76570256,77267367,77841481,78300384,78622340,78654756


In [39]:
# Per-column summary: non-null counts and dtypes
population_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   SUMLEV             57 non-null     int64 
 1   REGION             57 non-null     object
 2   DIVISION           57 non-null     object
 3   STATE              57 non-null     int64 
 4   NAME               57 non-null     object
 5   CENSUS2010POP      57 non-null     int64 
 6   ESTIMATESBASE2010  57 non-null     int64 
 7   POPESTIMATE2010    57 non-null     int64 
 8   POPESTIMATE2011    57 non-null     int64 
 9   POPESTIMATE2012    57 non-null     int64 
 10  POPESTIMATE2013    57 non-null     int64 
 11  POPESTIMATE2014    57 non-null     int64 
 12  POPESTIMATE2015    57 non-null     int64 
 13  POPESTIMATE2016    57 non-null     int64 
 14  POPESTIMATE2017    57 non-null     int64 
 15  POPESTIMATE2018    57 non-null     int64 
 16  POPESTIMATE2019    57 non-null     int64 
 17 