In [3]:
import os
import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from xgboost import XGBRegressor

# Apply seaborn's 'darkgrid' style to the plots drawn in this notebook.
sns.set_style('darkgrid')

#### Utility functions

In [4]:
def load_regression(data_pth:str,target_col = None):
    """Read a CSV into a DataFrame and print a quick data-quality summary.

    The summary consists of the frame's shape, a per-column table holding the
    dtype and the number of missing values, and the count of duplicated rows.

    Args:
        data_pth: Path handed straight to ``pd.read_csv``.
        target_col: Currently unused; accepted but ignored.

    Returns:
        The DataFrame exactly as loaded (nothing is cleaned or dropped).
    """
    separator = '-----------------------------------'

    frame = pd.read_csv(data_pth)
    print(frame.shape)

    # Side-by-side view of each column's dtype and its NA count.
    summary = pd.concat(
        [frame.dtypes, frame.isna().sum()],
        axis=1,
        keys=['dtype', 'na_counts'],
    )

    print(separator)
    print(summary)
    print(separator)
    print('duplicate rows count : ', frame.duplicated().sum())
    print(separator)
    return frame



def plot_predictions(model,X,y_true):
    """Scatter actual vs. predicted values and overlay a degree-1 trend line.

    Args:
        model: Object exposing ``predict``; it is called once with ``X``.
        X: Inputs passed to ``model.predict``.
        y_true: Actual target values, drawn on the x-axis.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    y_pred = model.predict(X)

    plt.scatter(y_true, y_pred)
    plt.xlabel('Actual Labels')
    plt.ylabel('Predicted Labels')
    plt.title('Predictions')

    # Least-squares line of predictions against actuals, evaluated at the
    # actual values so it spans the same x-range as the scatter.
    trend_coeffs = np.polyfit(y_true, y_pred, 1)
    plt.plot(y_true, np.polyval(trend_coeffs, y_true), color='magenta')
    plt.show()

### Read Data

#### Info About Features (if available)

In [None]:
# Location of the training CSV -- fill this in before running the notebook.
train_pth = ''

# load_regression prints a shape / dtype / NA / duplicate summary and returns the frame.
train_data = load_regression(data_pth=train_pth, target_col=None)

### EDA

#### Target Analysis

In [None]:
# Name of the column to predict -- fill this in; later cells reuse it.
target_col = ''

label = train_data[target_col]
fig, ax = plt.subplots(1, 2, figsize = (12,6))

# Left panel: histogram of the target with its mean (magenta) and median (cyan).
ax[0].hist(label)
ax[0].set_ylabel('Frequency')
ax[0].axvline(label.mean(), color='magenta', linestyle='dashed', linewidth=2)
ax[0].axvline(label.median(), color='cyan', linestyle='dashed', linewidth=2)

# Right panel: horizontal box plot of the same values.
ax[1].boxplot(label, vert=False)
ax[1].set_xlabel(target_col)

# Label the figure with the actual target name instead of a hand-edited placeholder.
fig.suptitle(target_col + ' Distribution')

# plt.show() instead of fig.show(): Figure.show() needs a GUI backend and only
# emits a warning under the notebook's inline backend.
plt.show()

#### Numerical Columns Analysis

In [None]:
# Names of the numeric columns to profile -- fill this in.
# The list must be called ``numeric_features``: that is the name this loop and
# the feature-vs-target loop in the next cell iterate over.
numeric_features = []

# One histogram per numeric column, with mean (magenta) and median (cyan) markers.
for col in numeric_features:
    fig, ax = plt.subplots(figsize=(9, 6))
    feature = train_data[col]
    feature.hist(bins=100, ax = ax)
    ax.axvline(feature.mean(), color='magenta', linestyle='dashed', linewidth=2)
    ax.axvline(feature.median(), color='cyan', linestyle='dashed', linewidth=2)
    ax.set_title(col)
plt.show()

In [None]:
# Scatter each numeric feature against the target and show their correlation
# in the plot title.
for col in numeric_features:
    fig, ax = plt.subplots(figsize=(9, 6))
    # Read from train_data; the frame previously referenced here is not defined
    # anywhere in this notebook.
    feature = train_data[col]
    label = train_data[target_col]
    correlation = feature.corr(label)
    ax.scatter(x=feature, y=label)
    ax.set_xlabel(col)
    ax.set_ylabel(target_col)
    # Build the title as ONE string: set_title's second positional argument is
    # the fontdict, so passing the pieces comma-separated breaks the call.
    ax.set_title(target_col + ' vs ' + col + '- correlation: ' + str(correlation))
plt.show()

#### Categorical Feature Analysis

In [None]:
# Names of the categorical columns to profile -- fill this in.
categorical_features = []

# One bar chart per categorical column: how often each category occurs,
# with the bars ordered by category value.
for col in categorical_features:
    counts = train_data[col].value_counts().sort_index()
    fig, ax = plt.subplots(figsize=(9, 6))
    counts.plot(kind='bar', ax=ax, color='steelblue')
    ax.set(title=col + ' counts', xlabel=col, ylabel="Frequency")
plt.show()

In [None]:
# Box plot of the target grouped by each categorical column.
for col in categorical_features:
    fig, ax = plt.subplots(figsize=(9, 6))
    # Plot from train_data; the frame previously referenced here is not defined
    # anywhere in this notebook.
    train_data.boxplot(column = target_col, by = col, ax = ax)
    ax.set_title('Label by ' + col)
    ax.set_ylabel(target_col)
plt.show()

### Creating Model

In [None]:
# Columns used as model inputs -- fill this in.
features_list = []

# Feature matrix and target vector; target_col is set in the Target Analysis cell.
X = train_data[features_list]
y = train_data[target_col]

In [None]:
# Hold out 30% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)
print('Training Set: {:d} rows\nTest Set: {:d} rows'.format(X_train.shape[0], X_test.shape[0]))

#### Preprocessing

#### Model Training

##### Linear Regression

In [None]:
# Fit a linear-regression baseline on the training split.
model = LinearRegression()
model.fit(X_train, y_train)
print (model)

# Evaluate on the held-out split.
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

for metric_label, metric_value in (("MSE:", mse), ("RMSE:", rmse), ("R2:", r2)):
    print(metric_label, metric_value)

# Actual-vs-predicted scatter with a fitted trend line.
plot_predictions(model,X_test,y_test)

#### Hyperparameter Tuning

##### GridSearchCV

In [None]:
# GridSearchCV, make_scorer and r2_score are imported in the notebook's first
# (imports) cell rather than here, so dependencies are declared in one place.
# TODO: define the parameter grid and run the search; for now this cell only
# displays the current model.
model

### Making Predictions

#### Saving Model

In [None]:
# joblib is imported in the notebook's first (imports) cell.
# Replace the placeholder below with the file name to save the model under.
filename = './<set model name>'
joblib.dump(model, filename)