# **Setup**

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

# Use matplotlib's 'ggplot' style sheet for every figure drawn after this point.
plt.style.use('ggplot')

# **Data Import**

In [2]:
# Load the raw coaster table; the relative path is resolved against the
# kernel's working directory.
df = pd.read_csv('coaster_db.csv')
# Report (rows, columns), then preview the first five records.
print(df.shape)
df.head()

(1087, 56)


Unnamed: 0,coaster_name,Length,Speed,Location,Status,Opening date,Type,Manufacturer,Height restriction,Model,...,speed1,speed2,speed1_value,speed1_unit,speed_mph,height_value,height_unit,height_ft,Inversions_clean,Gforce_clean
0,Switchback Railway,600 ft (180 m),6 mph (9.7 km/h),Coney Island,Removed,"June 16, 1884",Wood,LaMarcus Adna Thompson,,Lift Packed,...,6 mph,9.7 km/h,6.0,mph,6.0,50.0,ft,,0,2.9
1,Flip Flap Railway,,,Sea Lion Park,Removed,1895,Wood,Lina Beecher,,,...,,,,,,,,,1,12.0
2,Switchback Railway (Euclid Beach Park),,,"Cleveland, Ohio, United States",Closed,,Other,,,,...,,,,,,,,,0,
3,Loop the Loop (Coney Island),,,Other,Removed,1901,Steel,Edwin Prescott,,,...,,,,,,,,,1,
4,Loop the Loop (Young's Pier),,,Other,Removed,1901,Steel,Edwin Prescott,,,...,,,,,,,,,1,


In [3]:
# List every column name of the raw table (the head() preview elides the
# middle columns with '...').
df.columns

Index(['coaster_name', 'Length', 'Speed', 'Location', 'Status', 'Opening date',
       'Type', 'Manufacturer', 'Height restriction', 'Model', 'Height',
       'Inversions', 'Lift/launch system', 'Cost', 'Trains', 'Park section',
       'Duration', 'Capacity', 'G-force', 'Designer', 'Max vertical angle',
       'Drop', 'Soft opening date', 'Fast Lane available', 'Replaced',
       'Track layout', 'Fastrack available', 'Soft opening date.1',
       'Closing date', 'Opened', 'Replaced by', 'Website',
       'Flash Pass Available', 'Must transfer from wheelchair', 'Theme',
       'Single rider line available', 'Restraint Style',
       'Flash Pass available', 'Acceleration', 'Restraints', 'Name',
       'year_introduced', 'latitude', 'longitude', 'Type_Main',
       'opening_date_clean', 'speed1', 'speed2', 'speed1_value', 'speed1_unit',
       'speed_mph', 'height_value', 'height_unit', 'height_ft',
       'Inversions_clean', 'Gforce_clean'],
      dtype='object')

In [4]:
# Column dtypes as inferred by read_csv.
df.dtypes

coaster_name                      object
Length                            object
Speed                             object
Location                          object
Status                            object
Opening date                      object
Type                              object
Manufacturer                      object
Height restriction                object
Model                             object
Height                            object
Inversions                       float64
Lift/launch system                object
Cost                              object
Trains                            object
Park section                      object
Duration                          object
Capacity                          object
G-force                           object
Designer                          object
Max vertical angle                object
Drop                              object
Soft opening date                 object
Fast Lane available               object
Replaced        

In [5]:
# Summary statistics; by default describe() reports only the numeric columns.
df.describe()

Unnamed: 0,Inversions,year_introduced,latitude,longitude,speed1_value,speed_mph,height_value,height_ft,Inversions_clean,Gforce_clean
count,932.0,1087.0,812.0,812.0,937.0,937.0,965.0,171.0,1087.0,362.0
mean,1.54721,1994.986201,38.373484,-41.595373,53.850374,48.617289,89.575171,101.996491,1.326587,3.824006
std,2.114073,23.475248,15.516596,72.285227,23.385518,16.678031,136.246444,67.329092,2.030854,0.989998
min,0.0,1884.0,-48.2617,-123.0357,5.0,5.0,4.0,13.1,0.0,0.8
25%,0.0,1989.0,35.03105,-84.5522,40.0,37.3,44.0,51.8,0.0,3.4
50%,0.0,2000.0,40.2898,-76.6536,50.0,49.7,79.0,91.2,0.0,4.0
75%,3.0,2010.0,44.7996,2.7781,63.0,58.0,113.0,131.2,2.0,4.5
max,14.0,2022.0,63.2309,153.4265,240.0,149.1,3937.0,377.3,14.0,12.0


# **Data Preparation**

## **Drop Columns**

In [9]:
# Keep the identifying fields, the coordinates and the pre-cleaned numeric
# columns (15 of the 56 raw columns); everything else is left out of the
# working frame. .copy() detaches the subset from the raw frame.
# NOTE: the selection reads 'Inversions_clean' and 'Gforce_clean', which the
# rename step later replaces on df, so re-running this cell after that step
# raises KeyError -- reload the CSV first.
df = df[[
    'coaster_name', 'Location', 'Status', 'Opening date', 'Type',
    'Manufacturer',
    'year_introduced', 'latitude', 'longitude', 'Type_Main',
    'opening_date_clean',
    'speed_mph', 'height_ft',
    'Inversions_clean', 'Gforce_clean',
]].copy()
df.head()

Unnamed: 0,coaster_name,Location,Status,Opening date,Type,Manufacturer,year_introduced,latitude,longitude,Type_Main,opening_date_clean,speed_mph,height_ft,Inversions_clean,Gforce_clean
0,Switchback Railway,Coney Island,Removed,"June 16, 1884",Wood,LaMarcus Adna Thompson,1884,40.574,-73.978,Wood,1884-06-16,6.0,,0,2.9
1,Flip Flap Railway,Sea Lion Park,Removed,1895,Wood,Lina Beecher,1895,40.578,-73.979,Wood,1895-01-01,,,1,12.0
2,Switchback Railway (Euclid Beach Park),"Cleveland, Ohio, United States",Closed,,Other,,1896,41.58,-81.57,Other,,,,0,
3,Loop the Loop (Coney Island),Other,Removed,1901,Steel,Edwin Prescott,1901,40.5745,-73.978,Steel,1901-01-01,,,1,
4,Loop the Loop (Young's Pier),Other,Removed,1901,Steel,Edwin Prescott,1901,39.3538,-74.4342,Steel,1901-01-01,,,1,


## **Rename Columns**

In [10]:
# Column names that remain after the selection, before any renaming.
df.columns

Index(['coaster_name', 'Location', 'Status', 'Opening date', 'Type',
       'Manufacturer', 'year_introduced', 'latitude', 'longitude', 'Type_Main',
       'opening_date_clean', 'speed_mph', 'height_ft', 'Inversions_clean',
       'Gforce_clean'],
      dtype='object')

In [13]:
# Strip the "_clean" suffix from the two derived columns. The raw 'Inversions'
# column was left out by the column selection, so the shorter name is free.
# Rebinding df (instead of inplace=True) keeps the step chainable and avoids
# mutating a frame that earlier cells have already displayed.
df = df.rename(columns={'Inversions_clean': 'Inversions', 'Gforce_clean': 'Gforce'})
df.columns

Index(['coaster_name', 'Location', 'Status', 'Opening date', 'Type',
       'Manufacturer', 'year_introduced', 'latitude', 'longitude', 'Type_Main',
       'opening_date_clean', 'speed_mph', 'height_ft', 'Inversions', 'Gforce'],
      dtype='object')

## **Check Missing Values**

In [15]:
# (rows, columns) of the trimmed working frame.
df.shape

(1087, 15)

In [14]:
# Number of missing (NaN) values in each column.
df.isna().sum()

coaster_name            0
Location                0
Status                213
Opening date          250
Type                    0
Manufacturer           59
year_introduced         0
latitude              275
longitude             275
Type_Main               0
opening_date_clean    250
speed_mph             150
height_ft             916
Inversions              0
Gforce                725
dtype: int64

## **Check Duplicated Values**

In [17]:
# Rows that repeat an earlier row in every column; an empty result means no
# exact duplicates exist.
df[df.duplicated()]

Unnamed: 0,coaster_name,Location,Status,Opening date,Type,Manufacturer,year_introduced,latitude,longitude,Type_Main,opening_date_clean,speed_mph,height_ft,Inversions,Gforce


In [22]:
# Coaster names that already appeared on an earlier row (the first occurrence
# of each name is not flagged).
df['coaster_name'].loc[lambda names: names.duplicated()]

43                   Crystal Beach Cyclone
60                             Derby Racer
61             Blue Streak (Conneaut Lake)
167          Big Thunder Mountain Railroad
237      Thunder Run (Canada's Wonderland)
                       ...                
1063                    Lil' Devil Coaster
1064    Little Dipper (Conneaut Lake Park)
1080                            Iron Gwazi
1082               American Dreier Looping
1084             Tron Lightcycle Power Run
Name: coaster_name, Length: 97, dtype: object

In [26]:
# Inspect one repeated coaster name. In the recorded output the two rows
# differ only in year_introduced (1926 vs 1927).
df[df['coaster_name'] == "Crystal Beach Cyclone"]

Unnamed: 0,coaster_name,Location,Status,Opening date,Type,Manufacturer,year_introduced,latitude,longitude,Type_Main,opening_date_clean,speed_mph,height_ft,Inversions,Gforce
39,Crystal Beach Cyclone,Crystal Beach Park,Removed,1926,Wood,Traver Engineering,1926,42.8617,-79.0598,Wood,1926-01-01,60.0,,0,4.0
43,Crystal Beach Cyclone,Crystal Beach Park,Removed,1926,Wood,Traver Engineering,1927,42.8617,-79.0598,Wood,1926-01-01,60.0,,0,4.0


In [27]:
# Inspect a second repeated coaster name. In the recorded output the two rows
# differ only in year_introduced (1999 vs 2022).
df[df['coaster_name'] == "Iron Gwazi"]

Unnamed: 0,coaster_name,Location,Status,Opening date,Type,Manufacturer,year_introduced,latitude,longitude,Type_Main,opening_date_clean,speed_mph,height_ft,Inversions,Gforce
482,Iron Gwazi,Busch Gardens Tampa Bay,Under construction,,Steel – Wooden,Rocky Mountain Construction,1999,28.0339,-82.4231,Steel,,76.0,,2,
1080,Iron Gwazi,Busch Gardens Tampa Bay,Under construction,,Steel – Wooden,Rocky Mountain Construction,2022,28.0339,-82.4231,Steel,,76.0,,2,


In [42]:
# Two candidate keys for "the same coaster": name + park, and a stricter key
# that also requires the opening date and coordinates to match.
name_and_park = ['coaster_name', 'Location']
dup_keys = name_and_park + ['Opening date', 'latitude', 'longitude']

# Count the rows each key flags as a repeat of an earlier row.
print(df.duplicated(subset=name_and_park).sum())
is_repeat = df.duplicated(subset=dup_keys)
print(is_repeat.sum())

# List the flagged rows alphabetically by coaster name. duplicated() leaves
# the first occurrence of each key unflagged, so those rows are not shown.
df[is_repeat].sort_values('coaster_name')

97
97


Unnamed: 0,coaster_name,Location,Status,Opening date,Type,Manufacturer,year_introduced,latitude,longitude,Type_Main,opening_date_clean,speed_mph,height_ft,Inversions,Gforce
780,Afterburn (roller coaster),Carowinds,Operating,"March 20, 1999",Steel – Inverted,Bolliger & Mabillard,2009,35.1003,-80.9409,Steel,1999-03-20,62.0,,6,
270,Alpine Bobsled,Other,,,Steel,Intamin,1989,,,Steel,,35.0,,0,
435,Alpine Bobsled,Other,,,Steel,Intamin,1998,,,Steel,,35.0,,0,
686,American Dreier Looping,Other,,,Steel,Anton Schwarzkopf,2005,,,Steel,,53.0,,3,4.7
738,American Dreier Looping,Other,,,Steel,Anton Schwarzkopf,2007,,,Steel,,53.0,,3,4.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,Ultra Twister (Six Flags),Other,,,Steel – Pipeline,TOGO,1990,,,Steel,,44.0,,3,
1044,Untamed (Walibi Holland),Walibi Holland,Operating,"July 1, 2019",Steel,Rocky Mountain Construction,2019,52.4428,5.7608,Steel,2019-07-01,57.2,,5,
347,Wild Mouse (Idlewild),Idlewild and Soak Zone,Operating,1985 (original)1993 (current location)[1],Steel,Vekoma,1993,40.2598,-79.2799,Steel,1985-01-01,30.0,,0,
749,Wipeout (roller coaster),Pleasurewood Hills,Operating,2007,Steel – Shuttle – Boomerang,Vekoma,2007,52.5075,1.7434,Steel,2007-01-01,50.0,,6,


In [40]:
# Preview the frame with repeats removed, keeping the first row for each
# (name, location, opening date, latitude, longitude) key. The result is only
# displayed, not assigned, so df itself still contains the repeated rows.
df.drop_duplicates(subset=['coaster_name', 'Location', 'Opening date', 'latitude', 'longitude'])

Unnamed: 0,coaster_name,Location,Status,Opening date,Type,Manufacturer,year_introduced,latitude,longitude,Type_Main,opening_date_clean,speed_mph,height_ft,Inversions,Gforce
0,Switchback Railway,Coney Island,Removed,"June 16, 1884",Wood,LaMarcus Adna Thompson,1884,40.5740,-73.9780,Wood,1884-06-16,6.0,,0,2.9
1,Flip Flap Railway,Sea Lion Park,Removed,1895,Wood,Lina Beecher,1895,40.5780,-73.9790,Wood,1895-01-01,,,1,12.0
2,Switchback Railway (Euclid Beach Park),"Cleveland, Ohio, United States",Closed,,Other,,1896,41.5800,-81.5700,Other,,,,0,
3,Loop the Loop (Coney Island),Other,Removed,1901,Steel,Edwin Prescott,1901,40.5745,-73.9780,Steel,1901-01-01,,,1,
4,Loop the Loop (Young's Pier),Other,Removed,1901,Steel,Edwin Prescott,1901,39.3538,-74.4342,Steel,1901-01-01,,,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1079,Ice Breaker (roller coaster),SeaWorld Orlando,Under construction,February 2022,Steel – Launched,Premier Rides,2022,28.4088,-81.4633,Steel,2022-02-01,52.0,,0,
1081,Leviathan (Sea World),Sea World,Under construction,Easter 2022,Wood,Martin & Vleminckx,2022,-27.9574,153.4263,Wood,2022-01-01,49.7,105.0,0,
1083,Pantheon (roller coaster),Busch Gardens Williamsburg,Under construction,2022,Steel – Launched,Intamin,2022,37.2339,-76.6426,Steel,2022-01-01,73.0,,2,
1085,Tumbili,Kings Dominion,Under construction,,Steel – 4th Dimension – Wing Coaster,S&S – Sansei Technologies,2022,,,Steel,,34.0,,0,
