**You can find the data at the link below:**
[Download Data](https://www.kaggle.com/c/bluebook-for-bulldozers/data)

# Regression project

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

In [None]:
# Load the combined training/validation CSV. low_memory=False stops pandas
# from parsing the file in chunks, which avoids mixed-type column inference.
df = pd.read_csv(
    'TrainAndValid.csv',
    low_memory=False,
)
df.head(5)

In [None]:
# summarise the frame: column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# count the missing values in each column
df.isnull().sum(axis=0)

In [None]:
# dtype of 'saledate' as loaded by the plain read_csv call above
df.dtypes['saledate']

In [None]:
# scatter sale price against sale date for the first 1000 rows
fig, ax = plt.subplots()
ax.scatter(
    df['saledate'].iloc[:1000],
    df['SalePrice'].iloc[:1000],
)

In [None]:
# distribution of the target column as a histogram
df['SalePrice'].plot(kind='hist')

### Parsing dates

In [None]:
# Re-read the CSV, this time asking pandas to parse 'saledate' into datetimes.
df = pd.read_csv(
    'TrainAndValid.csv',
    parse_dates=['saledate'],
    low_memory=False,
)

In [None]:
# dtype of 'saledate' after loading with parse_dates
df.dtypes['saledate']

In [None]:
# first five parsed sale dates
df['saledate'].head(5)

In [None]:
# same scatter as before, now with 'saledate' parsed as datetimes
fig, ax = plt.subplots()
ax.scatter(
    df['saledate'].iloc[:1000],
    df['SalePrice'].iloc[:1000],
)

In [None]:
# transpose the first rows so every column shows up as a row
df.head(5).transpose()

In [None]:
# order the rows chronologically (oldest sale first), modifying df in place
df.sort_values('saledate', inplace=True)

In [None]:
# the 20 earliest sale dates after sorting
df['saledate'].iloc[:20]

In [None]:
# work on an independent copy so the sorted original stays untouched
df_tmp = df.copy(deep=True)

In [None]:
# the copy keeps the same chronological order
df_tmp['saledate'].iloc[:20]

In [None]:
# Split the parsed sale date into separate calendar columns:
# year, month, day of month, day of week and day of year.
for feature, attr in [
    ('saleYear', 'year'),
    ('saleMonth', 'month'),
    ('saleDay', 'day'),
    ('saleDayOfWeek', 'dayofweek'),
    ('saleDayOfYear', 'dayofyear'),
]:
    df_tmp[feature] = getattr(df_tmp['saledate'].dt, attr)

In [None]:
# first ten values of the new 'saleYear' column
df_tmp['saleYear'].iloc[:10]

In [None]:
# transposed preview of the first rows of the working copy
df_tmp.head(5).transpose()

In [None]:
# remove the 'saledate' column from the working copy, in place
df_tmp.drop(columns='saledate', inplace=True)

In [None]:
# how many rows fall under each distinct value of 'state'
df_tmp['state'].value_counts(dropna=True)

### Modelling 

In [None]:
# Random forest regressor: n_jobs=-1 uses every available core and the fixed
# random_state keeps runs repeatable.
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    random_state=80,
    n_jobs=-1,
)

# Fitting stays disabled here: the frame still contains missing and
# non-numerical values, so the call below would raise an error.
# model.fit(df_tmp.drop('SalePrice', axis=1), df_tmp['SalePrice'])

### Make all the data numerical (Categorical)

In [None]:
# Check whether 'UsageBand' holds text.
# Since pandas 2.0, is_string_dtype() returns False for an object column
# whose elements are not all str (a missing value is enough), so object
# dtype is accepted explicitly as well.
(pd.api.types.is_object_dtype(df_tmp['UsageBand'])
 or pd.api.types.is_string_dtype(df_tmp['UsageBand']))

In [None]:
# Print the name of every column that holds text.
# is_object_dtype() is checked first because, since pandas 2.0,
# is_string_dtype() alone returns False for object columns that contain
# missing values, which would silently hide those columns from this list.
for label, content in df_tmp.items():
    if pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
        print(label)

In [None]:
# Convert every text column into an ordered pandas category.
# is_object_dtype() is checked first because, since pandas 2.0,
# is_string_dtype() alone returns False for object columns that contain
# missing values, so those columns would otherwise be left unconverted.
# Columns that are already categorical match neither test, so re-running
# this cell is harmless.
for label, content in df_tmp.items():
    if pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
        df_tmp[label] = content.astype('category').cat.as_ordered()

In [None]:
# re-inspect the column dtypes after the category conversion
df_tmp.info()

In [None]:
# the distinct category labels pandas stored for 'state'
df_tmp['state'].dtype.categories

In [None]:
# integer code assigned to each row's 'state' category
# (pandas uses -1 for a missing value)
df_tmp['state'].cat.codes

In [None]:
# fraction of missing values per column (mean of the boolean NaN mask)
df_tmp.isna().mean()

### Save the preprocessed data

In [None]:
# write the working copy to disk without the index column
df_tmp.to_csv(
    path_or_buf='train_tmp.csv',
    index=False,
)

In [None]:
# Reload the exported frame from disk.
# NOTE(review): CSV carries no dtype information, so read_csv re-infers every
# column; the ordered-category columns built earlier will not come back as
# categories. Confirm that the following cells do not rely on that dtype.
df_tmp = pd.read_csv('train_tmp.csv', low_memory=False)

In [None]:
# transposed preview of the first three reloaded rows
df_tmp.head(3).transpose()

### Fill the missing values