# CAS BDAI CUP SUBMISSION NOTEBOOK



# Import Data

In [None]:
# Import libraries and load the competition data
import pandas as pd
import warnings

# Suppress FutureWarning messages so they do not clutter the notebook output
warnings.simplefilter(action="ignore", category=FutureWarning)

# Read the training file first, then the test file, straight from GitHub
train, test = (
    pd.read_csv(url)
    for url in (
        "https://github.com/casbdai/notebooks/raw/main/Module3/99_CAS_BDAI_CUP/train.csv",
        "https://github.com/casbdai/notebooks/raw/main/Module3/99_CAS_BDAI_CUP/test.csv",
    )
)

In [None]:
# Print a summary of the training data (columns, dtypes, non-null counts)
train.info()

In [None]:
# Print the same summary for the competition (test) data
test.info()

In [None]:
# Encode categorical data.
# The model can only predict on the test data if it has exactly the same
# feature columns as the training data, so the test encoding is derived from
# the training encoding instead of being computed independently.
train = pd.get_dummies(train, drop_first=True)

# Feature columns the model will be trained on (everything except the target)
feature_columns = train.columns.drop("price")

# Encode *every* level of the test data (no drop_first), then keep exactly the
# training feature columns in the training order:
#   - the baseline level dropped from train is dropped here as well,
#   - levels that occur only in train are added as all-zero columns,
#   - levels that occur only in test are discarded (treated as the baseline).
# NOTE(review): assumes test contains every non-categorical training feature
# (e.g. "ID") under the same name - confirm against the data files.
test = pd.get_dummies(test).reindex(columns=feature_columns, fill_value=0)

# Build Model

In [None]:
# Import Functions
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

# Separate the target column from the feature matrix
y = train["price"]
X = train.drop(columns="price")

# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12,
)

# Instantiate the model and fit it on the training split (fit returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Evaluate model performance: RMSE on the hold-out split
# (kept as the last expression so the notebook displays the value)
root_mean_squared_error(y_test, y_pred)

# Save Results for Submission

Make predictions on the competition data with your trained model

In [None]:
# Predict the target for the competition (test) data with the model fitted above;
# `test` must contain the same feature columns the model was trained on
test_predictions= model.predict(test)

In [None]:
# Name of the CSV file the submission is written to (relative to the working directory)
file_name = "IvoTestSubmission.csv"

In [None]:
def save_submission_for_kaggle(file_name, test_predictions, test):
    """Write the submission CSV with the columns "ID" and "Actual".

    Parameters
    ----------
    file_name : str or path-like
        Destination of the CSV file.
    test_predictions : array-like
        One prediction per row of ``test``. Lists, NumPy arrays (1-D or a
        single column of shape ``(n, 1)``) and pandas Series are accepted.
    test : pandas.DataFrame
        Competition data; must contain an "ID" column.

    Raises
    ------
    KeyError
        If ``test`` has no "ID" column.
    ValueError
        If the number of predictions differs from the number of rows.
    """
    import numpy as np
    import pandas as pd

    ids = test["ID"].to_numpy()

    # Match predictions to IDs by position. Converting to a plain array drops
    # any pandas index, which would otherwise be aligned with the index of
    # test["ID"] and silently produce NaNs / extra rows on a mismatch.
    # ravel() additionally accepts single-column (n, 1) predictions.
    predictions = np.asarray(test_predictions).ravel()

    if len(predictions) != len(ids):
        raise ValueError(
            f"test_predictions has {len(predictions)} values "
            f"but test has {len(ids)} rows"
        )

    submission_data = pd.DataFrame({"ID": ids, "Actual": predictions})
    submission_data.to_csv(file_name, index=False)

## Save submission file

In [None]:
# Write the IDs of the competition data and their predictions to `file_name`
save_submission_for_kaggle(file_name, test_predictions, test)

## FOR GOOGLE COLAB USERS ONLY: Download the created file

In [None]:
# Offer the submission file as a browser download when running in Google Colab.
# Only the import is guarded: a ModuleNotFoundError raised while downloading
# must not be misreported as "Not using Google Colab".
try:
    from google.colab import files
except ModuleNotFoundError:
    print("Not using Google Colab")
else:
    files.download(file_name)

## FOR ANACONDA USERS ONLY: Find the created file in your folder structure

The file is located in the same directory as your notebook.

In [None]:
# Run this cell if you don't know where the submission file was written
import os

working_directory = os.getcwd()
print(working_directory)

# How to get Going

- Try out other algorithms!
- Try out Cross Validation and Hyperparameter Tuning (see coding hint below)
- Try to understand why different models perform better or worse. Make Visualizations (Actual vs. Predicted Plots, Feature Importances, etc.)
- Try to ensemble different predictions (e.g., average the predictions of multiple models)

# Implementation Help for Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import root_mean_squared_error, make_scorer

# Set up the grid search (nothing is fitted yet)

# RMSE scoring function; greater_is_better=False tells scikit-learn that
# lower values are better, so the reported scores are negated
RMSE = make_scorer(root_mean_squared_error, greater_is_better=False)

# Hyperparameters to be optimized and the values to try for each
parameters = {"max_depth": [5, 10]}

# Apply 5 cross-validation folds to find the best hyperparameters
model_CV = GridSearchCV(
    estimator=DecisionTreeRegressor(),
    param_grid=parameters,
    scoring=RMSE,
    cv=5,
    verbose=3,
)

Useful parameters:
- cv: specify the number of cross validation folds
- scoring: specify which score should be used: either a custom scoring function like the RMSE scorer above, or the name of an already implemented scorer such as scoring="neg_root_mean_squared_error" for regression (or "accuracy", "recall", "precision" for classification)
- verbose: see the progress of the operation, e.g., verbose=3

After fitting the grid search cross-validation on the training data, you can use the "best_params_" attribute to display the best hyperparameter combination found in the grid search.

In [None]:
# Run the cross-validated grid search on the full training data (X, y)
model_CV.fit(X, y)

Get the best score based on the cross validation. It is the mean of the five splits for the best parameter combination.

It is displayed as a negative number because the scorer was created with greater_is_better=False: scikit-learn flips the sign of the RMSE so that a higher score is always better.

In [None]:
# Mean cross-validated score of the best hyperparameter combination
# (the negated RMSE, given the scorer defined above)
model_CV.best_score_

Get detailed results. Rank 1 has the highest mean_test_score, i.e. the least negative value and therefore the lowest RMSE.

In [None]:
# Tabulate the cross-validation results, best-ranked hyperparameter combination first
pd.DataFrame(model_CV.cv_results_).sort_values(by="rank_test_score")

By relying on the mean of the five splits you generalize beyond the single split done in train_test_split.

# Implementation Help for Plotting Feature Importances

In [None]:
def plot_variable_importance(model, X_train):
    """Show a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : fitted estimator
        Must expose a ``feature_importances_`` attribute.
    X_train : pandas.DataFrame
        Data whose column names label the bars; the columns must be in the
        same order as the entries of ``model.feature_importances_``.
    """
    import matplotlib.pyplot as plt
    from pandas import DataFrame

    # Pair every importance with its feature name, smallest importance first
    # (barh draws from the bottom, so the most important feature ends up on top)
    ranking = DataFrame(
        {"imp": model.feature_importances_, "names": X_train.columns}
    ).sort_values("imp", ascending=True)

    # Scale the figure with the number of features
    n_features = ranking.shape[0]
    fig, ax = plt.subplots(figsize=(n_features / 6, n_features / 5), dpi=300)

    ax.barh(ranking["names"], ranking["imp"], color="green")
    ax.set_xlabel('\nVariable Importance')
    ax.set_ylabel('Features\n')
    ax.set_title('Variable Importance Plot\n')

    plt.show()

# `model` is the LinearRegression fitted earlier, which has no
# `feature_importances_` attribute, so plotting it raises AttributeError.
# Plot the best decision tree found by the grid search instead; GridSearchCV
# refits it on the data passed to model_CV.fit, i.e. on X.
plot_variable_importance(model_CV.best_estimator_, X)
