# Packages

In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor


In [2]:
# Read the three CSV members directly from the zip archive (no extraction to disk).
with zipfile.ZipFile("../data/playground-series-s4e9.zip") as archive:
    frames = {}
    for member in ("sample_submission.csv", "test.csv", "train.csv"):
        with archive.open(member) as handle:
            frames[member] = pd.read_csv(handle)
sample_submission = frames["sample_submission.csv"]
test = frames["test.csv"]
train = frames["train.csv"]

In [None]:
# Print pandas' summary of the training frame (columns, dtypes, non-null counts).
train.info()

In [None]:
# Same summary for the test frame, to compare its columns with the training frame.
test.info()

In [5]:
# Columns that are plotted against each other in the pair plots.
selected_cols = [
    "model_year",
    "milage",
    "price",
]

In [None]:
# Pairwise scatter plots / histograms of the selected columns.
sns.pairplot(data=train.loc[:, selected_cols])

In [None]:
# Same pair plot, coloured by the "accident" column.
sns.pairplot(data=train[[*selected_cols, "accident"]], hue="accident")

In [None]:
# Price against model year on the full training frame; the very low alpha
# makes dense regions visible despite overplotting.
train.plot.scatter(x="model_year", y="price", grid=True, alpha=0.01)
plt.show()

In [None]:
# Summary statistics for every column; include="all" covers non-numeric columns too.
train.describe(include="all")

In [None]:
# Vertical box plot of model_year on the full training frame.
sns.boxplot(data=train, y="model_year")

In [15]:
# Q1, median and Q3 of model_year, indexed by their quantile level.
quartile_levels = [0.25, 0.5, 0.75]
quartiles = train["model_year"].quantile(quartile_levels)

In [None]:
# Interquartile range of model_year (Q3 - Q1).
q1, q3 = quartiles[0.25], quartiles[0.75]
iqr = q3 - q1
iqr

In [None]:
# Width of each of the five model_year bins.
# NOTE(review): 2013 looks like a hard-coded stand-in for Q1 of model_year,
# which would make the numerator max - (Q1 - 1.5 * IQR) -- confirm against `quartiles`.
newest_year = train["model_year"].max()
delta = (newest_year - 2013 + 1.5 * iqr) / 5
delta

In [18]:
# Six edges spaced `delta` apart, counting back from 2024.
# NOTE(review): 2024 is presumably the maximum model_year -- confirm.
bins = [float(2024 - delta * steps_back) for steps_back in range(5, -1, -1)]

In [19]:
# Open both outer edges so that no finite model_year falls outside the bins.
bins[0] = -np.inf
bins[5] = np.inf

In [None]:
# Show the final bin edges (outer edges are -inf / +inf).
bins

In [21]:
# Bucket model_year into five labelled bins (1 = lowest years, 5 = highest);
# this column is what the splits stratify on.
train["model_cat"] = pd.cut(
    train["model_year"],
    bins=bins,
    labels=list(range(1, 6)),
)

In [22]:
# Hold out 20% of the rows as a test split, keeping the model_cat proportions.
strat_train_set, strat_test_set = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train["model_cat"],
)

In [23]:
# Carve a dev split (20%) out of the remaining training rows, again stratified
# on model_cat. This rebinds strat_train_set, so re-running the cell on its own
# shrinks the training split a second time.
strat_train_set, strat_dev_set = train_test_split(
    strat_train_set,
    test_size=0.2,
    random_state=42,
    stratify=strat_train_set["model_cat"],
)

In [None]:
# Price against model year on the training split, alpha = 0.01.
strat_train_set.plot.scatter(x="model_year", y="price", grid=True, alpha=0.01)
plt.show()

In [None]:
# Same scatter with alpha = 0.09.
strat_train_set.plot.scatter(x="model_year", y="price", grid=True, alpha=0.09)
plt.show()

In [None]:
# Same scatter with alpha = 0.2.
strat_train_set.plot.scatter(x="model_year", y="price", grid=True, alpha=0.2)
plt.show()

In [None]:
# Same scatter fully opaque (alpha = 1), which makes isolated points easy to see.
strat_train_set.plot.scatter(x="model_year", y="price", grid=True, alpha=1)
plt.show()

In [None]:
# Box plot of price on the training split.
sns.boxplot(data=strat_train_set, y="price")

In [None]:
# Summary statistics of the target column on the training split.
strat_train_set["price"].describe()

In [None]:
# Lower fence for model_year: Q1 - 1.5 * IQR, converted to a plain float.
lower_fence = quartiles[0.25] - 1.5 * iqr
model_year_q0 = float(lower_fence)
model_year_q0

In [None]:
# Per-column share of non-null training values kept when model_year >= lower fence.
above_fence = strat_train_set["model_year"] >= model_year_q0
strat_train_set.loc[above_fence].count() / strat_train_set.count()

In [None]:
# Same share using a fixed cut-off of model_year >= 1990.
since_1990 = strat_train_set["model_year"] >= 1990
strat_train_set.loc[since_1990].count() / strat_train_set.count()

In [35]:
# Keep only training rows with model_year >= 1990 (only the training split is filtered here).
keep_rows = strat_train_set["model_year"] >= 1990
strat_train_set = strat_train_set.loc[keep_rows]

In [None]:
# Box plot of price on the training split as it stands at this point.
sns.boxplot(data=strat_train_set, y="price")

In [None]:
# Box plot of model_year on the training split as it stands at this point.
sns.boxplot(data=strat_train_set, y="model_year")

In [None]:
# Price against model year on the training split, alpha = 0.002.
strat_train_set.plot.scatter(x="model_year", y="price", grid=True, alpha=0.002)
plt.show()

In [21]:
# Standardise the single model_year feature. The scaler is fitted on the
# training split only and is reused unchanged for the dev split.
std_scaler = StandardScaler()
std_scaler.fit(strat_train_set[["model_year"]])
X_scaled = std_scaler.transform(strat_train_set[["model_year"]])

In [22]:
# Regression target: an independent copy of the training prices.
y = strat_train_set.loc[:, "price"].copy()

In [None]:
# Mean training price, a reference scale for the RMSE values reported further down.
y.mean()

In [25]:
def display_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Args:
        scores: object exposing ``.mean()`` and ``.std()`` (for example the
            array returned by ``cross_val_score``).
    """
    report = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in report:
        print(label, value)


# Train models

In [26]:

# Three candidate regressors; the tree-based ones get a fixed random_state
# so repeated runs give the same models.
lin_reg = LinearRegression()
tree_reg = DecisionTreeRegressor(random_state=42)
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)

In [27]:
# 10-fold CV for the linear model. The scorer returns negated RMSE,
# so flip the sign to get positive RMSE values.
linear_neg_rmses = cross_val_score(
    lin_reg, X_scaled, y, cv=10, scoring="neg_root_mean_squared_error"
)
linear_rmses = -linear_neg_rmses

In [None]:
# Per-fold RMSE, mean and standard deviation for the linear model.
display_scores(linear_rmses)

In [29]:
# 10-fold CV for the decision tree; sign flipped to get positive RMSE values.
tree_neg_rmses = cross_val_score(
    tree_reg, X_scaled, y, cv=10, scoring="neg_root_mean_squared_error"
)
tree_rmses = -tree_neg_rmses

In [None]:
# Per-fold RMSE, mean and standard deviation for the decision tree.
display_scores(tree_rmses)

In [31]:
# 10-fold CV for the random forest; sign flipped to get positive RMSE values.
rf_neg_rmses = cross_val_score(
    forest_reg, X_scaled, y, cv=10, scoring="neg_root_mean_squared_error"
)
rf_rmses = -rf_neg_rmses

In [None]:
# Per-fold RMSE, mean and standard deviation for the random forest.
display_scores(rf_rmses)

In [None]:
# Fit the forest on the whole training split and compute its in-sample RMSE.
# The model is scored on the rows it was fitted on, so this figure is
# optimistic compared with the cross-validated scores.
forest_reg.fit(X_scaled, y)
y_hat = forest_reg.predict(X_scaled)
# RMSE taken as sqrt(MSE): mean_squared_error(..., squared=False) was
# deprecated in scikit-learn 1.4 and removed in 1.6, so it raises a TypeError
# on current releases. np.sqrt works on every version.
forest_rmse = np.sqrt(mean_squared_error(y, y_hat))
forest_rmse

In [34]:
# Scale the dev feature with the scaler fitted on the training split (transform only, no refit).
dev_features = strat_dev_set[["model_year"]]
X_scaled_dev = std_scaler.transform(dev_features)

In [35]:
# Dev-split target: an independent copy of the dev prices.
y_dev = strat_dev_set.loc[:, "price"].copy()

In [None]:
# RMSE of the fitted forest on the held-out dev split. Unlike the training
# split, the dev split was not restricted to model_year >= 1990.
y_dev_hat = forest_reg.predict(X_scaled_dev)
# RMSE taken as sqrt(MSE): mean_squared_error(..., squared=False) was
# deprecated in scikit-learn 1.4 and removed in 1.6, so it raises a TypeError
# on current releases. np.sqrt works on every version.
forest_rmse = np.sqrt(mean_squared_error(y_dev, y_dev_hat))
forest_rmse