## Import all required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

# Apply matplotlib's "ggplot" style sheet to the figures drawn in this notebook.
# NOTE(review): `plt` is bound to `matplotlib.pylab` by the import above; matplotlib's
# documentation recommends `matplotlib.pyplot` instead — consider switching the import.
plt.style.use("ggplot")

# Load the dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("coaster_db.csv")

## Data understanding
```shape, head, tail, dtypes, describe```

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# List every column label currently in the frame.
df.columns

In [None]:
# Show the dtype pandas holds for each column.
df.dtypes

In [None]:
# Summary statistics; with default arguments only numeric columns are summarised.
df.describe()

In [None]:
# Narrow the frame to a working subset of columns; every other column of the
# loaded data is deliberately left out. `.copy()` detaches the result so that
# later column assignments act on an independent object.
df = df.loc[
    :,
    [
        "coaster_name",
        "Location",
        "Status",
        "Manufacturer",
        "year_introduced",
        "latitude",
        "longitude",
        "Type_Main",
        "opening_date_clean",
        "speed_mph",
        "height_ft",
        "Inversions_clean",
        "Gforce_clean",
    ],
].copy()

In [None]:
# Convert the opening-date column to pandas datetimes.
df = df.assign(opening_date_clean=pd.to_datetime(df["opening_date_clean"]))

# Rename columns

In [None]:
# Give the kept columns consistent, presentation-friendly names.
# Only columns that are part of the selected subset are listed, and the result
# is reassigned rather than renamed in place so the cell reads as a plain
# transformation of `df`.
df = df.rename(
    columns={
        "coaster_name": "Coaster_Name",
        "opening_date_clean": "Opening_Date",
        "year_introduced": "Year_Introduced",
        "speed_mph": "Speed",
        "height_ft": "Height",
        "Inversions_clean": "Inversions",
        "Gforce_clean": "gForce",
    }
)

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Rows that repeat an earlier row in every column.
df[df.duplicated()]

In [None]:
# First few rows whose coaster name already appeared earlier in the frame.
df[df.duplicated(subset=["Coaster_Name"])].head()

In [None]:
# Look at one coaster name as an example of duplicated rows, to see how the
# repeated entries differ from each other.

df.query('Coaster_Name == "Crystal Beach Cyclone"')

In [None]:
# Treat rows sharing name, location and opening date as the same coaster:
# keep the first occurrence of each and renumber the index from zero.
df = (
    df.drop_duplicates(subset=["Coaster_Name", "Location", "Opening_Date"])
    .reset_index(drop=True)
)

In [None]:
# Frame dimensions after the de-duplication step.
df.shape

# Feature Understanding
- Plotting Feature Distributions
  - Histogram, KDE, Boxplot


In [None]:
# Ten most frequent introduction years.
# In a horizontal bar chart the categories (years) run along the y-axis and the
# bar lengths (counts) along the x-axis, so the axis labels are set accordingly.
ax = (
    df["Year_Introduced"]
    .value_counts()
    .head(10)
    .plot(kind="barh", figsize=(8, 5), title="Top 10 Year Introduced")
)
ax.set_xlabel("Count")
ax.set_ylabel("Year Introduced")

In [None]:
# Histogram of the Speed column using 20 bins.
ax = df["Speed"].plot.hist(bins=20, figsize=(8, 5), title="Speed Distribution")
ax.set_xlabel("Speed(mph)")

In [None]:
# Kernel density estimate of the Speed column — a smoothed counterpart to the histogram view.
ax = df["Speed"].plot.kde(figsize=(8, 5), title="Speed Distribution")
ax.set_xlabel("Speed(mph)")

# Feature relations
- Scatterplot, Heatmap Correlation, Pairplot, Groupby comparison

In [None]:
# Speed on the x-axis against Height on the y-axis.
df.plot.scatter(x="Speed", y="Height", figsize=(8, 5), title="Height vs Speed")
plt.show()  # render the figure so the Axes repr is not echoed above the plot

In [None]:
# Speed/Height scatter with point colour keyed to the introduction year.
sns.scatterplot(data=df, x="Speed", y="Height", hue="Year_Introduced")

In [None]:
# Pairwise plots of the listed features, coloured by the Type_Main column.
sns.pairplot(
    df,
    vars=["Speed", "Height", "Inversions", "Year_Introduced", "gForce"],
    hue="Type_Main",
)
plt.show()

In [32]:
# Which locations have the highest average coaster speed?
# The 'Other' location is excluded, and only locations with at least 10
# non-missing Speed values are ranked; the top 10 by mean speed are shown.
(
    df.query("Location != 'Other'")
    .groupby("Location")["Speed"]
    .agg(["mean", "count"])
    .query('count >= 10')
    .sort_values(by="mean", ascending=False)
    .head(10)
)

Unnamed: 0_level_0,mean,count
Location,Unnamed: 1_level_1,Unnamed: 2_level_1
Busch Gardens Williamsburg,58.318182,11
Cedar Point,57.833333,18
Six Flags Magic Mountain,57.241176,17
Canada's Wonderland,53.533333,12
Six Flags Great Adventure,53.036364,11
Kings Dominion,52.083333,12
Hersheypark,50.576923,13
Kings Island,49.273684,19
Carowinds,43.571429,14
Alton Towers,42.791667,12
