Cleaning Roller Coaster Dataset. 

Objectives:
- Clean any missing, irregular, unnecessary, or inconsistent data.
- Detect missing data (null values) and decide how to handle it.
- Make all units of measurement (height, speed, etc) universal. Just using MPH and ft.
- Drop unnecessary columns
- Detect inconsistent data and how to handle it.

In [21]:
import pandas as pd
import matplotlib as plt


In [22]:
df = pd.read_csv("/Users/benjaminteter/Desktop/Roller_Coaster_Project/coaster_db.csv")

In [23]:
pd.reset_option('^display.', silent=True)

In [24]:
df.head(25)

Unnamed: 0,coaster_name,Length,Speed,Location,Status,Opening date,Type,Manufacturer,Height restriction,Model,...,speed1,speed2,speed1_value,speed1_unit,speed_mph,height_value,height_unit,height_ft,Inversions_clean,Gforce_clean
0,Switchback Railway,600 ft (180 m),6 mph (9.7 km/h),Coney Island,Removed,"June 16, 1884",Wood,LaMarcus Adna Thompson,,Lift Packed,...,6 mph,9.7 km/h,6.0,mph,6.0,50.0,ft,,0,2.9
1,Flip Flap Railway,,,Sea Lion Park,Removed,1895,Wood,Lina Beecher,,,...,,,,,,,,,1,12.0
2,Switchback Railway (Euclid Beach Park),,,"Cleveland, Ohio, United States",Closed,,Other,,,,...,,,,,,,,,0,
3,Loop the Loop (Coney Island),,,Other,Removed,1901,Steel,Edwin Prescott,,,...,,,,,,,,,1,
4,Loop the Loop (Young's Pier),,,Other,Removed,1901,Steel,Edwin Prescott,,,...,,,,,,,,,1,
5,Cannon Coaster,,,Coney Island,Removed,1902,Wood,George Francis Meyer,,,...,,,,,,40.0,ft,,0,
6,Leap-The-Dips,"1,452 ft (443 m)",10 mph (16 km/h),Lakemont Park,Operating,1902,Wood – Side friction,Federal Construction Company,,,...,10 mph,16 km/h,10.0,mph,10.0,41.0,ft,,0,
7,Figure Eight (Euclid Beach Park),,,"Cleveland, Ohio, United States",Closed,,Other,,,,...,,,,,,,,,0,
8,Drop the Dip,,,Coney Island,Removed,"June 6, 1907",Other,Arthur Jarvis,,,...,,,,,,60.0,ft,,0,
9,Scenic Railway (Euclid Beach Park),,,"Cleveland, Ohio, United States",Closed,,Other,,,,...,,,,,,,,,0,


In [25]:
df.columns

Index(['coaster_name', 'Length', 'Speed', 'Location', 'Status', 'Opening date',
       'Type', 'Manufacturer', 'Height restriction', 'Model', 'Height',
       'Inversions', 'Lift/launch system', 'Cost', 'Trains', 'Park section',
       'Duration', 'Capacity', 'G-force', 'Designer', 'Max vertical angle',
       'Drop', 'Soft opening date', 'Fast Lane available', 'Replaced',
       'Track layout', 'Fastrack available', 'Soft opening date.1',
       'Closing date', 'Opened', 'Replaced by', 'Website',
       'Flash Pass Available', 'Must transfer from wheelchair', 'Theme',
       'Single rider line available', 'Restraint Style',
       'Flash Pass available', 'Acceleration', 'Restraints', 'Name',
       'year_introduced', 'latitude', 'longitude', 'Type_Main',
       'opening_date_clean', 'speed1', 'speed2', 'speed1_value', 'speed1_unit',
       'speed_mph', 'height_value', 'height_unit', 'height_ft',
       'Inversions_clean', 'Gforce_clean'],
      dtype='object')

In [26]:
#remove any non operational coasters
# cleaned table now called df_operational. Includes only "Operating" roller coasters

df_operating = df[df['Status'] == 'Operating']
df_operating

Unnamed: 0,coaster_name,Length,Speed,Location,Status,Opening date,Type,Manufacturer,Height restriction,Model,...,speed1,speed2,speed1_value,speed1_unit,speed_mph,height_value,height_unit,height_ft,Inversions_clean,Gforce_clean
6,Leap-The-Dips,"1,452 ft (443 m)",10 mph (16 km/h),Lakemont Park,Operating,1902,Wood – Side friction,Federal Construction Company,,,...,10 mph,16 km/h,10.0,mph,10.0,41.0,ft,,0,
13,Racer (1910 roller coaster),"4,500 ft (1,400 m)",40 mph (64 km/h),Kennywood,Operating,1927,Wood – Racing,Charlie Mach,46 in (117 cm),Racing,...,40 mph,64 km/h,40.0,mph,40.0,72.5,ft,,0,
15,The Great Scenic Railway,"967 m (3,173 ft)",60 km/h (37 mph),Luna Park Melbourne,Operating,December 1912,Wood,LaMarcus Thompson,100 cm (3 ft 3 in),,...,60 km/h,37 mph,60.0,km/h,37.3,16.0,m,52.5,0,
22,Jack Rabbit (Kennywood),"2,132 ft (650 m)",45 mph (72 km/h),Kennywood,Operating,1920,Wood,Harry C. Baker,42 in (107 cm),,...,45 mph,72 km/h,45.0,mph,45.0,40.0,ft,,0,
23,Jack Rabbit (Seabreeze),"2,130 ft (650 m)",42 mph (68 km/h),Seabreeze Amusement Park,Operating,1920,Wood,Harry C. Baker,48 in (122 cm),,...,42 mph,68 km/h,42.0,mph,42.0,75.0,ft,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1066,Monster (Gröna Lund),"2,296.6 ft (700.0 m)",55.9 mph (90.0 km/h),Gröna Lund,Operating,"June 2, 2021; 5 months ago",Steel – Inverted,Bolliger & Mabillard,140 cm (4 ft 7 in),Inverted Coaster,...,55.9 mph,90.0 km/h,55.9,mph,55.9,111.5,ft,,3,4.5
1068,Storm Chaser (Paultons Park),,,Paultons Park,Operating,12 April 2021,Steel – Spinning,Mack Rides,100 cm (3 ft 3 in),Spinning Coaster / Sierra Sidewinder,...,,,,,,65.6,ft,,0,
1069,Stunt Pilot (roller coaster),"1,800 ft (550 m)",52 mph (84 km/h),Silverwood Theme Park,Operating,"May 29, 2021",Steel – Single-rail,Rocky Mountain Construction,48 in (122 cm),Raptor – Custom,...,52 mph,84 km/h,52.0,mph,52.0,105.0,ft,,3,
1070,The Ride to Happiness,"3,018.4 ft (920.0 m)",55.9 mph (90.0 km/h),Plopsaland De Panne,Operating,"July 1, 2021",Steel – Spinning,Mack Rides,130 cm (4 ft 3 in),Xtreme Spinning Coaster,...,55.9 mph,90.0 km/h,55.9,mph,55.9,108.3,ft,,6,


In [27]:
df_cleaned = df_operating
df_cleaned

Unnamed: 0,coaster_name,Length,Speed,Location,Status,Opening date,Type,Manufacturer,Height restriction,Model,...,speed1,speed2,speed1_value,speed1_unit,speed_mph,height_value,height_unit,height_ft,Inversions_clean,Gforce_clean
6,Leap-The-Dips,"1,452 ft (443 m)",10 mph (16 km/h),Lakemont Park,Operating,1902,Wood – Side friction,Federal Construction Company,,,...,10 mph,16 km/h,10.0,mph,10.0,41.0,ft,,0,
13,Racer (1910 roller coaster),"4,500 ft (1,400 m)",40 mph (64 km/h),Kennywood,Operating,1927,Wood – Racing,Charlie Mach,46 in (117 cm),Racing,...,40 mph,64 km/h,40.0,mph,40.0,72.5,ft,,0,
15,The Great Scenic Railway,"967 m (3,173 ft)",60 km/h (37 mph),Luna Park Melbourne,Operating,December 1912,Wood,LaMarcus Thompson,100 cm (3 ft 3 in),,...,60 km/h,37 mph,60.0,km/h,37.3,16.0,m,52.5,0,
22,Jack Rabbit (Kennywood),"2,132 ft (650 m)",45 mph (72 km/h),Kennywood,Operating,1920,Wood,Harry C. Baker,42 in (107 cm),,...,45 mph,72 km/h,45.0,mph,45.0,40.0,ft,,0,
23,Jack Rabbit (Seabreeze),"2,130 ft (650 m)",42 mph (68 km/h),Seabreeze Amusement Park,Operating,1920,Wood,Harry C. Baker,48 in (122 cm),,...,42 mph,68 km/h,42.0,mph,42.0,75.0,ft,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1066,Monster (Gröna Lund),"2,296.6 ft (700.0 m)",55.9 mph (90.0 km/h),Gröna Lund,Operating,"June 2, 2021; 5 months ago",Steel – Inverted,Bolliger & Mabillard,140 cm (4 ft 7 in),Inverted Coaster,...,55.9 mph,90.0 km/h,55.9,mph,55.9,111.5,ft,,3,4.5
1068,Storm Chaser (Paultons Park),,,Paultons Park,Operating,12 April 2021,Steel – Spinning,Mack Rides,100 cm (3 ft 3 in),Spinning Coaster / Sierra Sidewinder,...,,,,,,65.6,ft,,0,
1069,Stunt Pilot (roller coaster),"1,800 ft (550 m)",52 mph (84 km/h),Silverwood Theme Park,Operating,"May 29, 2021",Steel – Single-rail,Rocky Mountain Construction,48 in (122 cm),Raptor – Custom,...,52 mph,84 km/h,52.0,mph,52.0,105.0,ft,,3,
1070,The Ride to Happiness,"3,018.4 ft (920.0 m)",55.9 mph (90.0 km/h),Plopsaland De Panne,Operating,"July 1, 2021",Steel – Spinning,Mack Rides,130 cm (4 ft 3 in),Xtreme Spinning Coaster,...,55.9 mph,90.0 km/h,55.9,mph,55.9,108.3,ft,,6,


In [28]:
#drop un needed columns. This data set is very large and for the focus project I do not need everything here. 
## Review of all the columns

df.columns.tolist()

['coaster_name',
 'Length',
 'Speed',
 'Location',
 'Status',
 'Opening date',
 'Type',
 'Manufacturer',
 'Height restriction',
 'Model',
 'Height',
 'Inversions',
 'Lift/launch system',
 'Cost',
 'Trains',
 'Park section',
 'Duration',
 'Capacity',
 'G-force',
 'Designer',
 'Max vertical angle',
 'Drop',
 'Soft opening date',
 'Fast Lane available',
 'Replaced',
 'Track layout',
 'Fastrack available',
 'Soft opening date.1',
 'Closing date',
 'Opened',
 'Replaced by',
 'Website',
 'Flash Pass Available',
 'Must transfer from wheelchair',
 'Theme',
 'Single rider line available',
 'Restraint Style',
 'Flash Pass available',
 'Acceleration',
 'Restraints',
 'Name',
 'year_introduced',
 'latitude',
 'longitude',
 'Type_Main',
 'opening_date_clean',
 'speed1',
 'speed2',
 'speed1_value',
 'speed1_unit',
 'speed_mph',
 'height_value',
 'height_unit',
 'height_ft',
 'Inversions_clean',
 'Gforce_clean']

In [29]:
#list of what columns I want to keep

# 'coaster_name', 'Length', 'Speed', 'Location', 'Status', 'Opening date','Type', 'Manufacturer', 'Model', 'Height',
#'Cost' 'Duration', 'Capacity', 'G-force', 'Designer','Theme','Acceleration', 'Restraints', 'Name',
#'latitude', 'longitude','opening_date_clean','speed_mph', 'height_ft', 'Inversions_clean', 'Gforce_clean'



In [30]:
cleaned_columns = df_cleaned.drop(columns=[
#'coaster_name',
#'Length',
#'Speed',
'Location',
#'Status',
 'Opening date',
 'Type',
 #'Manufacturer',
 'Height restriction',
 #'Model',
 #'Height',
 'Inversions',
 #'Lift/launch system',
 'Cost',
 'Trains',
 'Park section',
 'Duration',
 'Capacity',
 #'G-force',
 #'Designer',
 'Max vertical angle',
 'Drop',
 'Soft opening date',
 'Fast Lane available',
 'Replaced',
 #'Track layout',
 'Fastrack available',
 'Soft opening date.1',
 'Closing date',
 'Opened',
 'Replaced by',
 'Website',
 'Flash Pass Available',
 'Must transfer from wheelchair',
 'Theme',
 'Single rider line available',
 'Restraint Style',
 'Flash Pass available',
 'Acceleration',
 'Restraints',
 'Name',
 'year_introduced',
 'latitude',
 'longitude',
 #'Type_Main',
 #'opening_date_clean',
 'speed1',
 'speed2',
 'speed1_value',
 'speed1_unit',
 #'speed_mph',
 'height_value',
 'height_unit',
 #'height_ft',
 #'Inversions_clean',
 'Gforce_clean'])
cleaned_columns

Unnamed: 0,coaster_name,Length,Speed,Status,Manufacturer,Model,Height,Lift/launch system,G-force,Designer,Track layout,Type_Main,opening_date_clean,speed_mph,height_ft,Inversions_clean
6,Leap-The-Dips,"1,452 ft (443 m)",10 mph (16 km/h),Operating,Federal Construction Company,,41 ft (12 m),,,Edward Joy Morris,,Wood,1902-01-01,10.0,,0
13,Racer (1910 roller coaster),"4,500 ft (1,400 m)",40 mph (64 km/h),Operating,Charlie Mach,Racing,72.5 ft (22.1 m),Chain lift,,John A. Miller,Möbius Loop,Wood,1927-01-01,40.0,,0
15,The Great Scenic Railway,"967 m (3,173 ft)",60 km/h (37 mph),Operating,LaMarcus Thompson,,16 m (52 ft),Cable lift hill,,,,Wood,1912-12-01,37.3,52.5,0
22,Jack Rabbit (Kennywood),"2,132 ft (650 m)",45 mph (72 km/h),Operating,Harry C. Baker,,40 ft (12 m),Chain lift hill,,John A. Miller,"Terrain, Out and Back roller coaster",Wood,1920-01-01,45.0,,0
23,Jack Rabbit (Seabreeze),"2,130 ft (650 m)",42 mph (68 km/h),Operating,Harry C. Baker,,75 ft (23 m),chain,,John A. Miller,"Terrain, Out and Back",Wood,1920-01-01,42.0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1066,Monster (Gröna Lund),"2,296.6 ft (700.0 m)",55.9 mph (90.0 km/h),Operating,Bolliger & Mabillard,Inverted Coaster,111.5 ft (34.0 m),Chain lift hill,4.5,,,Steel,2021-06-02,55.9,,3
1068,Storm Chaser (Paultons Park),,,Operating,Mack Rides,Spinning Coaster / Sierra Sidewinder,65.6 ft (20.0 m),Chain lift hill,,,,Steel,2021-04-12,,,0
1069,Stunt Pilot (roller coaster),"1,800 ft (550 m)",52 mph (84 km/h),Operating,Rocky Mountain Construction,Raptor – Custom,105 ft (32 m),Chain lift hill,,Alan Schilke,,Steel,2021-05-29,52.0,,3
1070,The Ride to Happiness,"3,018.4 ft (920.0 m)",55.9 mph (90.0 km/h),Operating,Mack Rides,Xtreme Spinning Coaster,108.3 ft (33.0 m),Two LSM launches,,,,Steel,2021-07-01,55.9,,6


In [31]:
cleaned_columns.columns.tolist()

['coaster_name',
 'Length',
 'Speed',
 'Status',
 'Manufacturer',
 'Model',
 'Height',
 'Lift/launch system',
 'G-force',
 'Designer',
 'Track layout',
 'Type_Main',
 'opening_date_clean',
 'speed_mph',
 'height_ft',
 'Inversions_clean']

In [32]:
df_cleaned

Unnamed: 0,coaster_name,Length,Speed,Location,Status,Opening date,Type,Manufacturer,Height restriction,Model,...,speed1,speed2,speed1_value,speed1_unit,speed_mph,height_value,height_unit,height_ft,Inversions_clean,Gforce_clean
6,Leap-The-Dips,"1,452 ft (443 m)",10 mph (16 km/h),Lakemont Park,Operating,1902,Wood – Side friction,Federal Construction Company,,,...,10 mph,16 km/h,10.0,mph,10.0,41.0,ft,,0,
13,Racer (1910 roller coaster),"4,500 ft (1,400 m)",40 mph (64 km/h),Kennywood,Operating,1927,Wood – Racing,Charlie Mach,46 in (117 cm),Racing,...,40 mph,64 km/h,40.0,mph,40.0,72.5,ft,,0,
15,The Great Scenic Railway,"967 m (3,173 ft)",60 km/h (37 mph),Luna Park Melbourne,Operating,December 1912,Wood,LaMarcus Thompson,100 cm (3 ft 3 in),,...,60 km/h,37 mph,60.0,km/h,37.3,16.0,m,52.5,0,
22,Jack Rabbit (Kennywood),"2,132 ft (650 m)",45 mph (72 km/h),Kennywood,Operating,1920,Wood,Harry C. Baker,42 in (107 cm),,...,45 mph,72 km/h,45.0,mph,45.0,40.0,ft,,0,
23,Jack Rabbit (Seabreeze),"2,130 ft (650 m)",42 mph (68 km/h),Seabreeze Amusement Park,Operating,1920,Wood,Harry C. Baker,48 in (122 cm),,...,42 mph,68 km/h,42.0,mph,42.0,75.0,ft,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1066,Monster (Gröna Lund),"2,296.6 ft (700.0 m)",55.9 mph (90.0 km/h),Gröna Lund,Operating,"June 2, 2021; 5 months ago",Steel – Inverted,Bolliger & Mabillard,140 cm (4 ft 7 in),Inverted Coaster,...,55.9 mph,90.0 km/h,55.9,mph,55.9,111.5,ft,,3,4.5
1068,Storm Chaser (Paultons Park),,,Paultons Park,Operating,12 April 2021,Steel – Spinning,Mack Rides,100 cm (3 ft 3 in),Spinning Coaster / Sierra Sidewinder,...,,,,,,65.6,ft,,0,
1069,Stunt Pilot (roller coaster),"1,800 ft (550 m)",52 mph (84 km/h),Silverwood Theme Park,Operating,"May 29, 2021",Steel – Single-rail,Rocky Mountain Construction,48 in (122 cm),Raptor – Custom,...,52 mph,84 km/h,52.0,mph,52.0,105.0,ft,,3,
1070,The Ride to Happiness,"3,018.4 ft (920.0 m)",55.9 mph (90.0 km/h),Plopsaland De Panne,Operating,"July 1, 2021",Steel – Spinning,Mack Rides,130 cm (4 ft 3 in),Xtreme Spinning Coaster,...,55.9 mph,90.0 km/h,55.9,mph,55.9,108.3,ft,,6,


In [33]:
#reviewing null values
df_cleaned.isna().sum()

coaster_name                       0
Length                            52
Speed                             60
Location                           0
Status                             0
Opening date                      11
Type                               0
Manufacturer                      24
Height restriction                97
Model                            205
Height                            52
Inversions                        65
Lift/launch system               148
Cost                             388
Trains                           184
Park section                     303
Duration                         164
Capacity                         282
G-force                          431
Designer                         330
Max vertical angle               423
Drop                             344
Soft opening date                592
Fast Lane available              607
Replaced                         554
Track layout                     442
Fastrack available               649
S

In [34]:
# I am creating a new workbook to review all the null values

In [35]:
duplicates = [df_cleaned.duplicated(keep=False)]
print(duplicates)
#no duplicates


[6       False
13      False
15      False
22      False
23      False
        ...  
1066    False
1068    False
1069    False
1070    False
1071    False
Length: 668, dtype: bool]


In [38]:
cleaned_columns.columns.tolist()

['coaster_name',
 'Length',
 'Speed',
 'Status',
 'Manufacturer',
 'Model',
 'Height',
 'Lift/launch system',
 'G-force',
 'Designer',
 'Track layout',
 'Type_Main',
 'opening_date_clean',
 'speed_mph',
 'height_ft',
 'Inversions_clean']

In [42]:
df_cleaned = cleaned_columns
df_cleaned

Unnamed: 0,coaster_name,Length,Speed,Status,Manufacturer,Model,Height,Lift/launch system,G-force,Designer,Track layout,Type_Main,opening_date_clean,speed_mph,height_ft,Inversions_clean
6,Leap-The-Dips,"1,452 ft (443 m)",10 mph (16 km/h),Operating,Federal Construction Company,,41 ft (12 m),,,Edward Joy Morris,,Wood,1902-01-01,10.0,,0
13,Racer (1910 roller coaster),"4,500 ft (1,400 m)",40 mph (64 km/h),Operating,Charlie Mach,Racing,72.5 ft (22.1 m),Chain lift,,John A. Miller,Möbius Loop,Wood,1927-01-01,40.0,,0
15,The Great Scenic Railway,"967 m (3,173 ft)",60 km/h (37 mph),Operating,LaMarcus Thompson,,16 m (52 ft),Cable lift hill,,,,Wood,1912-12-01,37.3,52.5,0
22,Jack Rabbit (Kennywood),"2,132 ft (650 m)",45 mph (72 km/h),Operating,Harry C. Baker,,40 ft (12 m),Chain lift hill,,John A. Miller,"Terrain, Out and Back roller coaster",Wood,1920-01-01,45.0,,0
23,Jack Rabbit (Seabreeze),"2,130 ft (650 m)",42 mph (68 km/h),Operating,Harry C. Baker,,75 ft (23 m),chain,,John A. Miller,"Terrain, Out and Back",Wood,1920-01-01,42.0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1066,Monster (Gröna Lund),"2,296.6 ft (700.0 m)",55.9 mph (90.0 km/h),Operating,Bolliger & Mabillard,Inverted Coaster,111.5 ft (34.0 m),Chain lift hill,4.5,,,Steel,2021-06-02,55.9,,3
1068,Storm Chaser (Paultons Park),,,Operating,Mack Rides,Spinning Coaster / Sierra Sidewinder,65.6 ft (20.0 m),Chain lift hill,,,,Steel,2021-04-12,,,0
1069,Stunt Pilot (roller coaster),"1,800 ft (550 m)",52 mph (84 km/h),Operating,Rocky Mountain Construction,Raptor – Custom,105 ft (32 m),Chain lift hill,,Alan Schilke,,Steel,2021-05-29,52.0,,3
1070,The Ride to Happiness,"3,018.4 ft (920.0 m)",55.9 mph (90.0 km/h),Operating,Mack Rides,Xtreme Spinning Coaster,108.3 ft (33.0 m),Two LSM launches,,,,Steel,2021-07-01,55.9,,6


In [None]:
#review what values are in the lift/launch column 
## results are very problematic, need to clean to just contain "lift" / "launch"
df_cleaned['Lift/launch system'].unique()

array([nan, 'Chain lift', 'Cable lift hill', 'Chain lift hill', 'chain',
       'Cable (two lifts)', 'Chain', 'Chain-lift', 'Chain Lift Hill',
       'Cable-lift', 'Chainlift', 'Three chain lift hills',
       'Two Chain lift hills',
       'Trains are powered, propelling themselves on uphill sections. Trains then freely roll over drops. Tires embedded in the track also move trains through certain sections.',
       'Two chain lift hills', 'Chain Lift', 'chain lift hill',
       'Flywheel launch', '2 Chain lift hills', 'Powered coaster',
       'Vertical Chain lift', 'Spiral lift', 'Drive tire lift hill',
       'Cable and Chain lift hill on both towers', 'Friction Wheels',
       'Onboard motors', 'Powered', 'Tire lift hill',
       'Two Drive Tire Lifts',
       'Electric Winch Launch / Booster wheels (second lift)',
       'Lift hill', 'Linear synchronous motor', 'Drive tire',
       'tyre-driven lift-hill', 'Tire Propelled Launch', 'LIM launch',
       'LIM Launch', 'friction drive

In [54]:
# Filter to only include rows with 'lift', 'launch', or 'lsm'
df_cleaned = df_cleaned[
    df_cleaned['Lift/launch system'].str.contains('lift|launch|lsm', case=False, na=False)
]

# Simplify values to just 'lift' or 'launch'
def simplify_system(value):
    value = value.lower()
    if "launch" in value or "lsm" in value or "flywheel" in value:
        return "launch"
    elif "lift" in value or "chain" in value or "tire" in value or "cable" in value or "powered" in value or "drive" in value:
        return "lift"
    else:
        return np.nan

df_cleaned['Lift/launch system'] = df_cleaned['Lift/launch system'].apply(simplify_system)

# Reset index
df_cleaned = df_cleaned.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Lift/launch system'] = df_cleaned['Lift/launch system'].apply(simplify_system)


In [55]:
df_cleaned

Unnamed: 0,coaster_name,Length,Speed,Status,Manufacturer,Model,Height,Lift/launch system,G-force,Designer,Track layout,Type_Main,opening_date_clean,speed_mph,height_ft,Inversions_clean
0,Racer (1910 roller coaster),"4,500 ft (1,400 m)",40 mph (64 km/h),Operating,Charlie Mach,Racing,72.5 ft (22.1 m),lift,,John A. Miller,Möbius Loop,Wood,1927-01-01,40.0,,0
1,The Great Scenic Railway,"967 m (3,173 ft)",60 km/h (37 mph),Operating,LaMarcus Thompson,,16 m (52 ft),lift,,,,Wood,1912-12-01,37.3,52.5,0
2,Jack Rabbit (Kennywood),"2,132 ft (650 m)",45 mph (72 km/h),Operating,Harry C. Baker,,40 ft (12 m),lift,,John A. Miller,"Terrain, Out and Back roller coaster",Wood,1920-01-01,45.0,,0
3,Scenic Railway (roller coaster),"3,000 ft (910 m)",35 mph (56 km/h),Operating,,Scenic Railway,40 ft (12 m),lift,,,,Wood,1920-07-03,35.0,,0
4,Roller Coaster (Lagoon),"762 m (2,500 ft)",45 mph (72 km/h),Operating,,,18.9 m (62 ft),lift,,John A. Miller,Double Out and Back,Wood,1921-07-15,45.0,62.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451,Monster (Gröna Lund),"2,296.6 ft (700.0 m)",55.9 mph (90.0 km/h),Operating,Bolliger & Mabillard,Inverted Coaster,111.5 ft (34.0 m),lift,4.5,,,Steel,2021-06-02,55.9,,3
452,Storm Chaser (Paultons Park),,,Operating,Mack Rides,Spinning Coaster / Sierra Sidewinder,65.6 ft (20.0 m),lift,,,,Steel,2021-04-12,,,0
453,Stunt Pilot (roller coaster),"1,800 ft (550 m)",52 mph (84 km/h),Operating,Rocky Mountain Construction,Raptor – Custom,105 ft (32 m),lift,,Alan Schilke,,Steel,2021-05-29,52.0,,3
454,The Ride to Happiness,"3,018.4 ft (920.0 m)",55.9 mph (90.0 km/h),Operating,Mack Rides,Xtreme Spinning Coaster,108.3 ft (33.0 m),launch,,,,Steel,2021-07-01,55.9,,6


In [56]:
df_cleaned['Lift/launch system'].unique()

array(['lift', 'launch'], dtype=object)