# Predicting target

### Data Set Information:

**A dataset containing the targets and 24 features of 2930 individual objects (rows).**

In [None]:
# Importing useful libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import OrderedDict
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFE

# 1. Data Preparation

In [None]:
# Load the dataset from CSV, using the 'id' column as the row index
predictive = pd.read_csv('prediktiv_data.csv', index_col='id')

## Outliers and NaN

In [None]:
# Count the missing entries in every column
nan_per_col = predictive.isnull().sum()

In [None]:
# Keep only the columns that contain at least one missing value
missing_values = nan_per_col.loc[nan_per_col.gt(0)]
missing_values

In [None]:
# Remove the columns in which the majority of values are missing
predictive.drop(columns=['feature11', 'feature12', 'feature15'], inplace=True)

In [None]:
# Categorical feature: fill missing entries with the most frequent category
# (the counts printed below are what the hard-coded fill value was chosen from).
print(predictive['feature13'].value_counts())
# Assign the result back rather than calling fillna(inplace=True) on the selected
# column: that chained in-place form is deprecated in pandas 2.x and does not
# update the frame once copy-on-write is enabled.
predictive['feature13'] = predictive['feature13'].fillna('red')

In [None]:
# Categorical feature: fill missing entries with the most frequent value
# (the counts printed below are what the hard-coded fill value was chosen from).
print(predictive['feature01'].value_counts())
# Assign the result back rather than calling fillna(inplace=True) on the selected
# column: that chained in-place form is deprecated in pandas 2.x and does not
# update the frame once copy-on-write is enabled.
predictive['feature01'] = predictive['feature01'].fillna(2)

In [None]:
# Box plot of feature02 to inspect its spread and outliers
predictive.boxplot(column='feature02')

In [None]:
# feature02 has many outliers, so its missing values are filled with the median
# (fit and transform are done in a single step on the same one-column frame)
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
predictive['feature02'] = imputer.fit_transform(predictive[['feature02']])

In [None]:
# Box plot of feature04 to inspect its spread and outliers
predictive.boxplot(column='feature04')

In [None]:
# feature04 has many outliers, so its missing values are filled with the median
# (fit and transform are done in a single step on the same one-column frame)
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
predictive['feature04'] = imputer.fit_transform(predictive[['feature04']])

##### One of the transformations we must perform is to transform the categorical features into dummy-variable format.

In [None]:
# One-hot encode each categorical column (first level dropped), append the
# dummy columns to the frame, then remove the original categorical columns
for categorical_col in ['feature13', 'feature16']:
    dummy_cols = pd.get_dummies(predictive[categorical_col], prefix=categorical_col, drop_first=True)
    predictive = pd.concat([predictive, dummy_cols], axis=1)
predictive.drop(columns=['feature13', 'feature16'], inplace=True)

**Visualizing the distribution of target.**

In [None]:
# Histogram (with KDE overlay) of the raw target, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('Target Distribution Plot')
sns.histplot(predictive['target'], kde=True, ax=ax);

In [None]:
# Replace the target with its square root to make its distribution closer to
# normal, then re-plot it.
# NOTE: the column is overwritten, so re-running this cell applies the square root again.
target_sqrt = np.sqrt(predictive['target'])
predictive['target'] = target_sqrt
sns.histplot(target_sqrt, kde=True);

In [None]:
# Draw a histogram for every column of the frame to inspect the distributions
predictive.hist()
plt.show()

#### Many variables have a skewed distribution according to the histograms above, so the dataset is a good candidate for a robust scaler transform, which standardizes the data in the presence of skewed distributions and outliers. (Only continuous input variables can be preprocessed by robust scaling.)

## Using all the features

### Split the data into: training and testing (cross-validation)

In [None]:
# Build the feature matrix (every column except the target) and the target vector
target_name = 'target'
x = predictive.drop(columns=target_name)
feature_names = x.columns
y = predictive[target_name]

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows influence the medians/IQRs used for preprocessing (data leakage).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)

# Fit the robust scaler on the training rows only, then apply the same
# transformation to the held-out rows.
robust_scaler = RobustScaler()
x_train = robust_scaler.fit_transform(x_train)
x_test = robust_scaler.transform(x_test)

# 2. Import the estimator object (model)

In [None]:
from sklearn.linear_model import LinearRegression

# 3. Create an instance of the estimator

In [None]:
# Ordinary least-squares regressor with default settings
linear_regression = LinearRegression()

# 4. Use the training data to train the estimator

In [None]:
# Fit the regressor on the training split
linear_regression.fit(x_train, y_train)

# 5. Evaluate the model

In [None]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out split and report the mean squared error
y_pred_test = linear_regression.predict(x_test)
error_metric = mean_squared_error(y_test, y_pred_test)
print('The Mean Square Error of this model is: ', error_metric)

In [None]:
# Report the fitted intercept and the per-feature coefficients of the linear model
inter, weights = linear_regression.intercept_, linear_regression.coef_
print(f'The intercept of the trained model is {inter} and the weights are {weights}')

In [None]:
# Scatter of predicted vs. observed test targets
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_test, s=4)
# Reference line: the observed values plotted against themselves (y = x)
ax.plot(y_test, y_test, color='red')
ax.set(
    xlabel='Testing target values',
    ylabel='Predicted target values',
    title='Predicted vs. Actual values',
);

This is a plot of the true target *y_true* against the predicted target *y_pred*. Note that the red line is not the one-dimensional plot of the linear regression: it is the true y plotted against itself, which always produces a 45-degree straight line. We want all the scattered points to be as close to this line as possible, because that means y_pred = y_true.

# Making predictions with the most relevant features

In [None]:
# Rank the columns by absolute correlation with the target and keep positions
# 1..20 of the ranking; position 0 is skipped (normally the target itself,
# whose correlation with itself is 1).
abs_corr_with_target = predictive.corr().loc['target'].abs()
ranked_columns = abs_corr_with_target.sort_values(ascending=False).index
top_corr_features = ranked_columns[1:21].tolist()
top_corr_features

In [None]:
# Heatmap of the pairwise correlations among the selected features,
# used to check for multicollinearity between them
fig, ax = plt.subplots(figsize=(20, 20))
selected_corr = predictive[top_corr_features].corr()
sns.heatmap(selected_corr, cmap="RdYlGn", ax=ax)
plt.show()

## Getting the train and test sets and scaling

In [None]:
# Restrict the feature matrix to the selected top-correlated columns
target_name = 'target'
x = predictive[top_corr_features]
y = predictive[target_name]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)

# Scale the features, as this section announces: distance-based (KNN) and
# penalised (LASSO) models are sensitive to feature scale. The scaler is fitted
# on the training rows only so no information from the test rows leaks into
# preprocessing. Results are wrapped back into DataFrames to keep column names.
robust_scaler = RobustScaler()
x_train = pd.DataFrame(robust_scaler.fit_transform(x_train), index=x_train.index, columns=x_train.columns)
x_test = pd.DataFrame(robust_scaler.transform(x_test), index=x_test.index, columns=x_test.columns)

### Preparing a DataFrame for model analysis

In [None]:
# Empty results table: one row per error metric, one column per model
metric_names = ['MSE', 'RMSE', 'MAE']
model_names = ['NULL', 'MLR', 'KNN', 'LASSO', 'RandomForest']
model_mean = pd.DataFrame(index=metric_names, columns=model_names)

# Making predictions with just a few features (top 20)

### The Null model: always predict the average of the target

In [None]:
from sklearn.metrics import mean_absolute_error

# Null model: predict the training-set mean of the target for every test row
y_pred_null = y_train.mean()
null_predictions = np.full(y_test.size, y_pred_null)

null_mse = mean_squared_error(y_true=y_test, y_pred=null_predictions)
model_mean.loc['MSE', 'NULL'] = null_mse
# RMSE is derived from the MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated (scikit-learn 1.4) and removed in 1.6.
model_mean.loc['RMSE', 'NULL'] = np.sqrt(null_mse)
model_mean.loc['MAE', 'NULL'] = mean_absolute_error(y_true=y_test, y_pred=null_predictions)

### A. Multiple Linear Regression

In [None]:
# 1. Import the estimator object (model)
from sklearn.linear_model import LinearRegression
# 2. Create an instance of the estimator
linear_regression = LinearRegression()
# 3. Use the training data to train the estimator
linear_regression.fit(x_train, y_train)
# 4. Evaluate the model on the test split (predict once, reuse for every metric)
mlr_predictions = linear_regression.predict(x_test)
mlr_mse = mean_squared_error(y_true=y_test, y_pred=mlr_predictions)
model_mean.loc['MSE', 'MLR'] = mlr_mse
# RMSE is derived from the MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated (scikit-learn 1.4) and removed in 1.6.
model_mean.loc['RMSE', 'MLR'] = np.sqrt(mlr_mse)
model_mean.loc['MAE', 'MLR'] = mean_absolute_error(y_true=y_test, y_pred=mlr_predictions)

### B. K-Nearest Neighbors Model

In [None]:
# 1. Import the estimator object (model)
from sklearn.neighbors import KNeighborsRegressor
# 2. Create an instance of the estimator
knn = KNeighborsRegressor(n_neighbors=10, weights='distance', metric='euclidean', n_jobs=-1)
# 3. Use the training data to train the estimator
knn.fit(x_train, y_train)
# 4. Evaluate the model on the test split (predict once, reuse for every metric)
knn_predictions = knn.predict(x_test)
knn_mse = mean_squared_error(y_true=y_test, y_pred=knn_predictions)
model_mean.loc['MSE', 'KNN'] = knn_mse
# RMSE is derived from the MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated (scikit-learn 1.4) and removed in 1.6.
model_mean.loc['RMSE', 'KNN'] = np.sqrt(knn_mse)
model_mean.loc['MAE', 'KNN'] = mean_absolute_error(y_true=y_test, y_pred=knn_predictions)

### C. LASSO

In [None]:
# 1. Import the estimator object (model)
from sklearn.linear_model import Lasso
# 2. Create an instance of the estimator
lasso = Lasso(alpha=0.05)
# 3. Use the training data to train the estimator
lasso.fit(x_train, y_train)
# 4. Evaluate the model on the test split (predict once, reuse for every metric)
lasso_predictions = lasso.predict(x_test)
lasso_mse = mean_squared_error(y_true=y_test, y_pred=lasso_predictions)
model_mean.loc['MSE', 'LASSO'] = lasso_mse
# RMSE is derived from the MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated (scikit-learn 1.4) and removed in 1.6.
model_mean.loc['RMSE', 'LASSO'] = np.sqrt(lasso_mse)
model_mean.loc['MAE', 'LASSO'] = mean_absolute_error(y_true=y_test, y_pred=lasso_predictions)

### D. Random Forests

In [None]:
# 1. Import the estimator object (model)
from sklearn.ensemble import RandomForestRegressor
# 2. Create an instance of the estimator
RF = RandomForestRegressor(n_estimators=50, max_depth=20, random_state=40, n_jobs=-1)
# 3. Use the training data to train the estimator
RF.fit(x_train, y_train)
# 4. Evaluate the model on the test split (predict once, reuse for every metric)
rf_predictions = RF.predict(x_test)
rf_mse = mean_squared_error(y_true=y_test, y_pred=rf_predictions)
model_mean.loc['MSE', 'RandomForest'] = rf_mse
# RMSE is derived from the MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated (scikit-learn 1.4) and removed in 1.6.
model_mean.loc['RMSE', 'RandomForest'] = np.sqrt(rf_mse)
model_mean.loc['MAE', 'RandomForest'] = mean_absolute_error(y_true=y_test, y_pred=rf_predictions)

In [None]:
# Display the table of test-set error metrics (rows) for each model (columns)
model_mean

In [None]:
# Horizontal bar chart with one group per model and one bar per error metric.
# The transposed table has models as rows and MSE/RMSE/MAE as columns, so the
# title names all plotted metrics rather than the MSE alone.
fig, ax = plt.subplots(figsize=(8, 5))
model_mean.T.plot(kind='barh', ax=ax)
ax.set_title('Test error metrics (MSE, RMSE, MAE) for Regression Models')
ax.legend(loc=3);

In [None]:
# Horizontal bar chart of the MSE row only: one bar per model
fig, ax = plt.subplots(figsize=(8, 5))
mse_by_model = model_mean.loc['MSE']
mse_by_model.plot.barh(ax=ax)
ax.set_title('Test MSE for Regression Models')
ax.legend(loc=8, ncol=4);

In [None]:
# Display the fitted LASSO coefficients (one per input feature)
lasso.coef_

In [None]:
print('Features automatically choosen by Lasso:\n')
# LASSO keeps a feature when its coefficient is non-zero. Testing `> 0` would
# silently omit every retained feature whose coefficient is negative.
lasso_selected = x.columns[lasso.coef_ != 0]
for i, var in enumerate(lasso_selected, start=1):
    print("{}.{}".format(i, var))

## Evaluate the best model (LASSO) on the test data

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
ax.scatter(lasso.predict(x_test), y_test, s=4)
ax.plot(y_test, y_test, color='red')
ax.set_title('LASSO: predictions vs. observed values (test data)')
ax.set_xlabel('Predicted target')
ax.set_ylabel('Testing target');