In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pylab as plt
import seaborn as sns

In [6]:
# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')
# Show up to 200 columns when a DataFrame is displayed.
# The fully-qualified key is required: on pandas >= 1.4 the bare
# 'max_columns' pattern also matches 'styler.render.max_columns' and
# set_option raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 200)

In [3]:
# Load the roller-coaster CSV into the working DataFrame `df`.
# NOTE(review): the relative '../input/...' path looks like a Kaggle dataset
# mount — confirm or adjust before running in another environment.
df = pd.read_csv('../input/rollercoaster-database/coaster_db.csv')

In [4]:
# (number of rows, number of columns) of the freshly loaded frame.
df.shape

In [7]:
# Preview the first rows to get a feel for the raw values.
df.head()

In [8]:
# List every column label available in the raw data.
df.columns

In [9]:
# Inferred dtype of each column (helps spot columns that need conversion).
df.dtypes 

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

# Data Preparation

In [12]:
# Keep only the columns used in the rest of the analysis.
# The trailing .copy() makes `df` an independent frame rather than a
# selection flagged as derived from the raw data, so the later column
# assignment (datetime conversion of 'opening_date_clean') does not raise
# pandas' SettingWithCopyWarning.
df = df[['coaster_name',
   'Location',
   'Status',
   'Manufacturer',
   'year_introduced',
   'latitude',
   'longitude',
   'Type_Main',
   'opening_date_clean',
   'speed_mph',
   'height_ft',
   'Inversions_clean',
   'Gforce_clean']].copy()

In [None]:
# Example of dropping columns (illustration only; intentionally left commented out).
# As written the result is not assigned back, so running it would not change `df`.
# df.drop(['Opening date'], axis=1)

In [14]:
# Convert 'opening_date_clean' to datetime values with pd.to_datetime and
# store the result back in the same column.
df['opening_date_clean'] = pd.to_datetime(df['opening_date_clean'])

In [15]:
# Display the column to check the result of the datetime conversion.
df['opening_date_clean']

# Rename columns

In [17]:
# Give the remaining lowercase / '_clean' columns capitalised labels that
# match the style of the other columns; unlisted columns keep their names.
df = df.rename(
    {
        'coaster_name': 'Coaster_Name',
        'year_introduced': 'Year_Introduced',
        'opening_date_clean': 'Opening_Date',
        'speed_mph': 'Speed_mph',
        'height_ft': 'Height_ft',
        'Inversions_clean': 'Inversions',
        'Gforce_clean': 'Gforce',
    },
    axis='columns',
)

In [18]:
# Boolean frame, same shape as `df`: True wherever a value is missing.
df.isna()

In [19]:
# Number of missing values in each column.
df.isna().sum()

In [21]:
# Rows that repeat an earlier row in every column (first occurrence not shown).
df[df.duplicated()]

In [22]:
# Rows whose coaster name already appeared in an earlier row.
df[df.duplicated(subset=['Coaster_Name'])]

In [24]:
# Show every row for "Crystal Beach Cyclone".
# NOTE(review): presumably picked as an example of a repeated coaster name — confirm against the duplicate listing.
df.query('Coaster_Name=="Crystal Beach Cyclone"')

In [26]:
# Treat rows sharing name, location and opening date as the same coaster:
# keep the first occurrence of each, then renumber the index from 0.
df = (
    df.drop_duplicates(subset=['Coaster_Name', 'Location', 'Opening_Date'])
    .reset_index(drop=True)
    .copy()
)

# Feature Understanding

In [28]:
# Number of coasters per introduction year, most frequent year first.
df['Year_Introduced'].value_counts()

In [31]:
# Bar chart of the ten introduction years with the most coasters.
(
    df['Year_Introduced']
    .value_counts()
    .head(10)
    .plot.bar(title='Top Years Coasters Introduced')
)

In [34]:
# Top-ten-years bar chart; the returned Axes is kept so the x-axis can be labelled.
ax = (
    df['Year_Introduced']
    .value_counts()
    .head(10)
    .plot.bar(title='Top Years Coasters Introduced')
)
ax.set_xlabel('Year Introduced')

In [36]:
# Top-ten-years bar chart with both axes labelled.
ax = (
    df['Year_Introduced']
    .value_counts()
    .head(10)
    .plot.bar(title='Top Years Coasters Introduced')
)
ax.set_xlabel('Year Introduced')
ax.set_ylabel('Count')

In [41]:
# Distribution of coaster speeds as a 20-bin histogram.
ax = df['Speed_mph'].plot.hist(bins=20, title='Coaster Speed (mph)')
ax.set_xlabel('Speed (mph)')

In [43]:
# Smoothed (kernel density estimate) view of the speed distribution.
ax = df['Speed_mph'].plot.kde(title='Coaster Speed (mph)')
ax.set_xlabel('Speed (mph)')

# Feature Relationships

In [46]:
# Speed against height, one point per coaster.
df.plot.scatter(x='Speed_mph', y='Height_ft', title='Coaster Speed vs. Height')

In [47]:
# Speed against height, one point per coaster; plt.show() renders the figure
# so the cell does not echo the Axes repr.
df.plot.scatter(x='Speed_mph', y='Height_ft', title='Coaster Speed vs. Height')
plt.show()

In [48]:
# Seaborn version of the speed/height scatter, coloured by introduction year.
sns.scatterplot(data=df, x='Speed_mph', y='Height_ft', hue='Year_Introduced')

In [49]:
# Grid of pairwise plots for the five numeric features.
sns.pairplot(
    df,
    vars=['Year_Introduced', 'Speed_mph', 'Height_ft', 'Inversions', 'Gforce'],
)
plt.show()

In [51]:
# Same five-feature pair grid, with points coloured by the 'Type_Main' column.
sns.pairplot(
    df,
    vars=['Year_Introduced', 'Speed_mph', 'Height_ft', 'Inversions', 'Gforce'],
    hue='Type_Main',
)
plt.show()

In [53]:
# Correlation matrix of the five numeric features, computed only on rows
# where all five values are present (rows with any missing value are dropped first).
df_corr = (
    df[['Year_Introduced', 'Speed_mph', 'Height_ft', 'Inversions', 'Gforce']]
    .dropna()
    .corr()
)
df_corr

In [55]:
# Heatmap of the correlation matrix; annot=True prints each coefficient in its cell.
sns.heatmap(df_corr, annot=True)

# Ask Questions
- Try to answer a question you have about the data using a plot or statistics

What are the locations with the fastest roller coasters (minimum of 10 coasters per location)?

In [63]:
# Average speed per location, excluding the catch-all "Other" location and
# keeping only locations with at least 10 non-missing speed values.
# Sorting ascending before barh puts the fastest location at the top of the chart.
ax = (
    df.query('Location != "Other"')
    .groupby('Location')['Speed_mph']
    .agg(['mean', 'count'])
    .query('count >= 10')
    .sort_values('mean')['mean']
    .plot.barh(figsize=(12, 5), title='Average Coaster Speed by Location')
)
ax.set_xlabel('Average Coaster Speed')
plt.show()