# Authenticating with Kaggle using kaggle.json

Navigate to https://www.kaggle.com. 
Then go to the [Account tab of your user profile](https://www.kaggle.com/me/account) and select Create API Token. 
This will trigger the download of `kaggle.json`, a file containing your API credentials.

Drag the `kaggle.json` file you downloaded on your local machine 
to the `~/mlops-zoomcamp-project` on the remote machine.

In [1]:
# Let's make sure the kaggle.json file is present.
!ls -lha ~/mlops-zoomcamp-project/kaggle.json

-rw-rw-r-- 1 ubuntu ubuntu 64 Jul 13 19:08 /home/ubuntu/mlops-zoomcamp-project/kaggle.json


In [2]:
# Copy this file to ~/.kaggle
!rm -rf ~/.kaggle/
!mkdir -p ~/.kaggle/
!cp ~/mlops-zoomcamp-project/kaggle.json ~/.kaggle

# Change the permission to avoid a warning when starting the Kaggle tool.
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
import kaggle

!kaggle --version

Kaggle API 1.5.15


In [4]:
import kaggle

api = kaggle.api
print(api.get_config_value("username"))
print(api.get_default_download_dir())

boisalai
/home/ubuntu/mlops-zoomcamp-project/notebooks


In [5]:
!kaggle competitions list
# !kaggle datasets download -d rounakbanik/the-movies-dataset

403 - Forbidden


# Used Car Price Prediction

This [data](https://www.kaggle.com/datasets/austinreese/craigslist-carstrucks-data) contains 
most all relevant information that Craigslist provides on car sales including 
columns like price, condition, manufacturer, latitude/longitude, and 18 other categories.

This notebook was built from the following:

* https://www.kaggle.com/code/maciejautuch/car-price-prediction
* https://www.kaggle.com/code/hemprakashprasanna/used-car-price-prediction

# Download the dataset

In [36]:
# Kaggle URL dataset
# https://www.kaggle.com/datasets/austinreese/craigslist-carstrucks-data
DATASET = 'austinreese/craigslist-carstrucks-data'
FILE_NAME = 'vehicles.csv'
PATH = '/home/ubuntu/mlops-zoomcamp-project/data'

In [37]:
import kaggle

try:
    kaggle.api.authenticate()
    kaggle.api.dataset_download_file(DATASET, FILE_NAME, path=PATH)
except kaggle.api.rest.ApiException as exception:
    print(exception)

In [15]:
import pandas as pd

df = pd.read_csv(f'{PATH}/{FILE_NAME}.zip')

FileNotFoundError: [Errno 2] No such file or directory: '/home/ubuntu/mlops-zoomcamp-project/data/vehicles.csv.zip'

# Feature engineering

In [None]:
df.columns

In [None]:
df.head(3)

## Missing Values

In [None]:
# Calculate the percentage of nulls in each of the above features.
nulls_perc = df.isna().sum()/len(df)*100
nulls_perc[nulls_perc.values>0].sort_values(ascending=False)

In [None]:
# Drop 'county' and 'size' features which have more than 50% of their data missing
df.drop(['county','size'], axis = 'columns', inplace = True)

In [None]:
# Get the features which have less than 5% of their data missing
lst = nulls_perc[(nulls_perc.values>0) & (nulls_perc.values<5)].sort_values(ascending=False).index
lst

In [None]:
# Drop the rows in the above features that have missing values
for features in lst:
    df.dropna(subset=[features], inplace=True, axis='index')

In [None]:
# Re-Calculate the percentage of nulls in each of the above features
nulls_perc = df.isna().sum()/len(df)*100
nulls_perc[nulls_perc.values>0].sort_values(ascending=False)

In [None]:
# Others columns with missing.
def value_counts(column: str) -> None:
    print(f"Column name: {column}")
    print(df[column].value_counts(dropna=False))
    print(" ")
    
value_counts('cylinders')
value_counts('condition')
value_counts('VIN')
value_counts('drive')
value_counts('paint_color')
value_counts('type')

In [None]:
# Stripping the word 'cylinders' from the 'cylinders' feature.
df['cylinders'] = df['cylinders'].replace('cylinders','',regex=True)
df['cylinders'] = df['cylinders'].str.strip()
df['cylinders'].value_counts(dropna=False)

In [None]:
# Drop 'VIN' column which is useless.
df.drop(['VIN'], axis = 'columns', inplace = True)

In [None]:
# Fill missing values in the ratio of non-null values in the feature.
import numpy as np

def fill_missing(column: str) -> None:
    counts = df[column].value_counts(normalize=True)
    df[column] = df[column].fillna(
        pd.Series(np.random.choice(
            list(counts.index), p=list(counts.values), size=len(df)
        ))
    )
    value_counts('paint_color')

fill_missing('paint_color')

In [None]:
# Dropping all the rows that contain missing values
df.dropna(axis='index', inplace=True)
df.shape

## Outliers

In [None]:
# understanding the data structure and looking at car price distribution
lower_limit = np.percentile(df[['price']], 5)
upper_limit = np.percentile(df[['price']], 95)
print(lower_limit, upper_limit)

In [None]:
# Removing outliers
df = df[(df['price'] >= lower_limit) & (df['price'] <= upper_limit)]  

## Create new features

In [None]:
df['posting_year'] = df['posting_date'].str[0:4].astype('int64')
df['years_used'] = df['posting_year'] - df['year']

# Changing year for a smaller number.
df['year'] = df['year'].astype('int64') - 1900

## Label encoder

In [None]:
# Categorical data encoding - label enncoding
df['title_status'].unique()

In [None]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
df['title_status'] = label_encoder.fit_transform(df['title_status'])

## Data Cleaning

In [None]:
# Checking
df['years_used'].unique()

In [None]:
# There is a -1 value in the years_used feature. This may have happened due to some error during listing.
df = df[df.years_used > -1]

In [None]:
df.info()

In [None]:
# Remove others columns that will not be used.
df.drop(columns=['id', 'url', 'region', 'region_url', 
                 'image_url', 'description',
                 'lat', 'long', 'posting_date'], axis=1, inplace=True)
df.shape

In [None]:
df.describe()

In [None]:
df.head(3)

# Model fit

In [None]:
# Split features and label.
x = df.drop(columns=['price','model','state']) 
y = df[['price']] 

In [None]:
# Categorical data encoding.
x = pd.get_dummies(x)
x.shape

In [None]:
# Split the data into train and test.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42) 

In [None]:
print(f"x_train {x_train.shape}")
print(f"x_test {x_test.shape}")
print(f"y_train {y_train.shape}")
print(f"y_test {y_test.shape}")

In [None]:
# Model 1.
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(x_train, y_train)
lm.score(x_train,y_train), lm.score(x_test,y_test)

In [None]:
# Model 2.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train, y_train.values.ravel())
gnb.score(x_train,y_train), gnb.score(x_test,y_test)

In [None]:
# Model 3.
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor()
neigh.fit(x_train, y_train)
neigh.score(x_train,y_train), neigh.score(x_test,y_test)

In [None]:
# Model 4.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
poly.fit(x_train, x_test)

x_train_poly = poly.transform(x_train)
x_test_poly = poly.transform(x_test)

lm.fit(x_train_poly, y_train)
lm.score(x_train_poly,y_train), lm.score(x_test_poly, y_test)

In [None]:
# Model 5.
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(random_state=0, max_depth=1000, 
                            min_samples_split = 18, min_impurity_decrease = 1.4)
dtr.fit(x_train, y_train.values.ravel())
dtr.score(x_train,y_train), dtr.score(x_test,y_test)

In [None]:
# Model 6.
from sklearn.ensemble import RandomForestRegressor
random_forest = RandomForestRegressor(n_estimators = 250, max_features = 'sqrt', n_jobs = 20)
random_forest.fit(x_train, y_train.values.ravel())
print(random_forest.score(x_train, y_train), random_forest.score(x_test, y_test))

In [None]:
# Model 7.
from sklearn.ensemble import BaggingRegressor
bagging = BaggingRegressor(n_estimators = 200, oob_score = True, n_jobs = 10)
bagging.fit(x_train, y_train.values.ravel())
bagging.score(x_train,y_train), bagging.score(x_test,y_test)

In [None]:
# Model 8.
from sklearn.ensemble import ExtraTreesRegressor
etr = ExtraTreesRegressor(random_state=0, n_estimators = 250, max_features = None, min_samples_split = 6)
etr.fit(x_train, y_train.values.ravel())
etr.score(x_train,y_train), etr.score(x_test,y_test)

# Choosing the next best algorithm

In [None]:
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

def create_models():
    models = []
    models.append(('Linear Regression', LinearRegression()))
    models.append(('Decision Tree Regressor', DecisionTreeRegressor()))
    models.append(('ElasticNet_Regressor', ElasticNet()))
    models.append(('Lasso_Regressor', Lasso()))
    models.append(('Ridge_Regressor', Ridge()))
    models.append(('RandomForest_Regressor', RandomForestRegressor()))
    return models

# creating a list with all the algorithms we are going to assess
models = create_models()

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

for name, model in models:
    print(" ")
    print(name)
    model.fit(x_train, y_train)
    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)
    print('Train R2 :', r2_score(y_train, y_pred_train))
    print('Test R2 :', r2_score(y_test, y_pred_test))
    print('Train RMSE :', np.sqrt(mean_squared_error(y_train, y_pred_train)))
    print('Test RMSE :', np.sqrt(mean_squared_error(y_test, y_pred_test)))


# Hyperparameter tuning for Random Forest Regressor

In [None]:
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
from sklearn.model_selection import RandomizedSearchCV

grid_parameters = {'n_estimators': [80, 90, 100, 110],'max_depth': [5, 6],
                   'max_features': [None, 'auto'], 'min_samples_split': [2, 3]}

random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=1),
    param_distributions=grid_parameters,
    cv=5,n_iter=10,n_jobs=-1)

random_search.fit(x_train, y_train)
print(random_search.best_params_)

In [None]:
model = RandomForestRegressor(n_estimators=random_search.best_params_.get('n_estimators'),
                            max_depth=random_search.best_params_.get('max_depth'),
                            min_samples_split=random_search.best_params_.get('min_samples_split'),
                            max_features=random_search.best_params_.get('max_features'),
                            random_state=1)

model.fit(x_train, y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)
print('Train R2 :', r2_score(y_train, y_pred_train))
print('Test R2 :', r2_score(y_test, y_pred_test))
print('Train RMSE :', np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('Test RMSE :', np.sqrt(mean_squared_error(y_test, y_pred_test)))

# XGBoost Algorithm

In [None]:
from xgboost import XGBRegressor

model = XGBRegressor(random_state=1).fit(x_train,y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)
print('Train R2 :', r2_score(y_train, y_pred_train))
print('Test R2 :', r2_score(y_test, y_pred_test))
print('Train RMSE :', np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('Test RMSE :', np.sqrt(mean_squared_error(y_test, y_pred_test)))

# Hyperparameter Tuning for XGBoost Algorithm

In [None]:
tuning_params = {'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6],
                 'max_depth':range(3,10),
                 'gamma':[0,1,2,3,4]}
xgb_search = RandomizedSearchCV(
    estimator=XGBRegressor(),
    param_distributions=tuning_params,
    cv=5,n_iter=10,n_jobs=1)

xgb_search.fit(x_train, y_train)
print(xgb_search.best_params_)

In [None]:
model = XGBRegressor(learning_rate=0.6,max_depth=9,gamma=4).fit(x_train, y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)
print('Train R2 :', r2_score(y_train, y_pred_train))
print('Test R2 :', r2_score(y_test, y_pred_test))
print('Train RMSE :', np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('Test RMSE :', np.sqrt(mean_squared_error(y_test, y_pred_test)))

# Save Dataframe

In [None]:
df.to_parquet(f'{PATH}/{FILE_NAME}', index=False)