In [5]:
# --- libraries -------------------------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # main plotting library to be used
import seaborn as sns  # for better plot visuals

# --- display / plotting configuration --------------------------------------
# widen the default column limit and cap the number of rows shown
pd.options.display.max_columns = 200
pd.options.display.max_rows = 10

# use the ggplot look for matplotlib figures
plt.style.use('ggplot')

In [6]:
# read the coaster dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="../data/coaster_db.csv")

# preview the first two rows
df.head(n=2)

Unnamed: 0,coaster_name,Length,Speed,Location,Status,Opening date,Type,Manufacturer,Height restriction,Model,Height,Inversions,Lift/launch system,Cost,Trains,Park section,Duration,Capacity,G-force,Designer,Max vertical angle,Drop,Soft opening date,Fast Lane available,Replaced,Track layout,Fastrack available,Soft opening date.1,Closing date,Opened,Replaced by,Website,Flash Pass Available,Must transfer from wheelchair,Theme,Single rider line available,Restraint Style,Flash Pass available,Acceleration,Restraints,Name,year_introduced,latitude,longitude,Type_Main,opening_date_clean,speed1,speed2,speed1_value,speed1_unit,speed_mph,height_value,height_unit,height_ft,Inversions_clean,Gforce_clean
0,Switchback Railway,600 ft (180 m),6 mph (9.7 km/h),Coney Island,Removed,"June 16, 1884",Wood,LaMarcus Adna Thompson,,Lift Packed,50 ft (15 m),,gravity,,,Coney Island Cyclone Site,1:00,1600 riders per hour,2.9,LaMarcus Adna Thompson,30°,43 ft (13 m),,,,Gravity pulled coaster,,,,,,,,,,,,,,,,1884,40.574,-73.978,Wood,1884-06-16,6 mph,9.7 km/h,6.0,mph,6.0,50.0,ft,,0,2.9
1,Flip Flap Railway,,,Sea Lion Park,Removed,1895,Wood,Lina Beecher,,,,1.0,,,a single car. Riders are arranged 1 across in ...,,,,12.0,Lina Beecher,,,,,,,,,1902.0,,,,,,,,,,,,,1895,40.578,-73.979,Wood,1895-01-01,,,,,,,,,1,12.0


## Step 1: Understand the Data

* Check the `shape` of the data
* Peek at sample rows with `df.head()`, and use `df.describe()` to get summary statistics for the numeric columns
* Get a list of columns with `df.columns`
* Check data types of columns with `df.dtypes`

In [8]:
# (number of rows, number of columns) of the freshly loaded dataset
df.shape

(1087, 56)

In [9]:
# list every column name; this output is reused later to pick the columns to keep
df.columns

Index(['coaster_name', 'Length', 'Speed', 'Location', 'Status', 'Opening date',
       'Type', 'Manufacturer', 'Height restriction', 'Model', 'Height',
       'Inversions', 'Lift/launch system', 'Cost', 'Trains', 'Park section',
       'Duration', 'Capacity', 'G-force', 'Designer', 'Max vertical angle',
       'Drop', 'Soft opening date', 'Fast Lane available', 'Replaced',
       'Track layout', 'Fastrack available', 'Soft opening date.1',
       'Closing date', 'Opened', 'Replaced by', 'Website',
       'Flash Pass Available', 'Must transfer from wheelchair', 'Theme',
       'Single rider line available', 'Restraint Style',
       'Flash Pass available', 'Acceleration', 'Restraints', 'Name',
       'year_introduced', 'latitude', 'longitude', 'Type_Main',
       'opening_date_clean', 'speed1', 'speed2', 'speed1_value', 'speed1_unit',
       'speed_mph', 'height_value', 'height_unit', 'height_ft',
       'Inversions_clean', 'Gforce_clean'],
      dtype='object')

In [10]:
# data type of each column (display is truncated by the max_rows option)
df.dtypes

coaster_name         object
Length               object
Speed                object
Location             object
Status               object
                     ...   
height_value        float64
height_unit          object
height_ft           float64
Inversions_clean      int64
Gforce_clean        float64
Length: 56, dtype: object

In [11]:
# summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Inversions,year_introduced,latitude,longitude,speed1_value,speed_mph,height_value,height_ft,Inversions_clean,Gforce_clean
count,932.0,1087.0,812.0,812.0,937.0,937.0,965.0,171.0,1087.0,362.0
mean,1.54721,1994.986201,38.373484,-41.595373,53.850374,48.617289,89.575171,101.996491,1.326587,3.824006
std,2.114073,23.475248,15.516596,72.285227,23.385518,16.678031,136.246444,67.329092,2.030854,0.989998
min,0.0,1884.0,-48.2617,-123.0357,5.0,5.0,4.0,13.1,0.0,0.8
25%,0.0,1989.0,35.03105,-84.5522,40.0,37.3,44.0,51.8,0.0,3.4
50%,0.0,2000.0,40.2898,-76.6536,50.0,49.7,79.0,91.2,0.0,4.0
75%,3.0,2010.0,44.7996,2.7781,63.0,58.0,113.0,131.2,2.0,4.5
max,14.0,2022.0,63.2309,153.4265,240.0,149.1,3937.0,377.3,14.0,12.0


## Step 2: Prepare the Data

- Subset the dataframe by dropping irrelevant columns and rows
- Identify duplicated columns
- Rename columns
- Create new features

### Dropping Columns

- Two options:
    - use the `df.drop()` method, e.g. `df.drop(['col1', 'col2', ..., 'coln'], axis=1)`
    - or select only the columns to keep by listing them explicitly (the approach used in the next cell)

In [12]:
# Start from the column names printed by `df.columns` and comment out every
# column that should be dropped; whatever stays uncommented is kept.
columns_to_keep = [
    'coaster_name',
    # 'Length', 'Speed',
    'Location',
    'Status',
    # 'Opening date',
    # 'Type',
    'Manufacturer',
    # 'Height restriction', 'Model', 'Height',
    # 'Inversions', 'Lift/launch system', 'Cost', 'Trains', 'Park section',
    # 'Duration', 'Capacity', 'G-force', 'Designer', 'Max vertical angle',
    # 'Drop', 'Soft opening date', 'Fast Lane available', 'Replaced',
    # 'Track layout', 'Fastrack available', 'Soft opening date.1',
    # 'Closing date', 'Opened', 'Replaced by', 'Website',
    # 'Flash Pass Available', 'Must transfer from wheelchair', 'Theme',
    # 'Single rider line available', 'Restraint Style',
    # 'Flash Pass available', 'Acceleration', 'Restraints', 'Name',
    'year_introduced',
    'latitude',
    'longitude',
    'Type_Main',
    'opening_date_clean',
    # 'speed1', 'speed2', 'speed1_value', 'speed1_unit',
    'speed_mph',
    # 'height_value', 'height_unit',
    'height_ft',
    'Inversions_clean',
    'Gforce_clean',
]

# Overwrite the existing dataframe with the selection. `.copy()` is used so the
# interpreter treats the result as its own object, with no reference to the
# prior version of `df`.
df = df[columns_to_keep].copy()

# the column count in the shape should now match the number of kept columns
df.shape

(1087, 13)

### Checking for data type inconsistencies

In [13]:
# re-check the column data types after subsetting to spot columns stored with the wrong type
df.dtypes

coaster_name           object
Location               object
Status                 object
Manufacturer           object
year_introduced         int64
                       ...   
opening_date_clean     object
speed_mph             float64
height_ft             float64
Inversions_clean        int64
Gforce_clean          float64
Length: 13, dtype: object

In [14]:
# opening_date_clean should be a date type, not string/object;
# display the column to inspect its current values and dtype

df['opening_date_clean']

0       1884-06-16
1       1895-01-01
2              NaN
3       1901-01-01
4       1901-01-01
           ...    
1082           NaN
1083    2022-01-01
1084    2016-06-16
1085           NaN
1086    2022-01-01
Name: opening_date_clean, Length: 1087, dtype: object

In [15]:
# preview the datetime conversion without assigning it back to the dataframe
pd.to_datetime(df['opening_date_clean'])

0      1884-06-16
1      1895-01-01
2             NaT
3      1901-01-01
4      1901-01-01
          ...    
1082          NaT
1083   2022-01-01
1084   2016-06-16
1085          NaT
1086   2022-01-01
Name: opening_date_clean, Length: 1087, dtype: datetime64[ns]

In [17]:
# parse the opening dates, then store the parsed values back in the same column
# so that opening_date_clean holds dates instead of strings
opening_dates = pd.to_datetime(df['opening_date_clean'])
df['opening_date_clean'] = opening_dates

### Renaming Columns
- Assume that the required naming convention is a mix of title case and snake case, based on the length of the column name
- and that `clean` or `Clean` needs to be removed from the column name
- For example:
    - opening_date_clean => Opening_Date
    - Gforce_Clean => Gforce

In [18]:
# prototype the renaming rule on one sample column name:
# split on underscores, drop any 'clean'/'Clean' piece, title-case what is left,
# and stitch the pieces back together with underscores
col = 'opening_date_clean'
col = "_".join(
    piece.title()
    for piece in col.split('_')
    if piece not in ('clean', 'Clean')
)
col

'Opening_Date'