## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

## Import the dataset

In [None]:
from sklearn.datasets import load_breast_cancer

In [None]:
# Call the loader: without the parentheses `breast_df` would be bound to the
# `load_breast_cancer` function object itself instead of the dataset it returns.
breast_df = load_breast_cancer()

In [None]:
# NOTE(review): disabled code. It refers to a `diabetes` object that is not
# defined in any visible cell; presumably left over from another notebook.
# diabetes = pd.DataFrame(diabetes, columns=diabetes.feature_names)

In [None]:
# Display whatever `breast_df` is currently bound to.
breast_df

In [None]:
# NOTE(review): disabled code. Builds a column-rename mapping for the same
# undefined `diabetes` object; nothing in the visible cells uses `rename_map`.
# rename_map = {
#     's1' : 'tc',
#     's2' : 'ldl',
#     's3' : 'hdl',
#     's4' : 'tch',
#     's5' : 'ltg',
#     's6' : 'glu'
# }
# diabetes.rename(columns=rename_map, inplace=True)

In [None]:
# NOTE(review): disabled duplicate of the last line of the previous cell.
# diabetes.rename(columns=rename_map, inplace=True)

In [None]:
# Call the loader and bind the returned dataset object to `breast_df`; the
# cells below read its DESCR, data, target and feature_names attributes.
breast_df = load_breast_cancer()

In [None]:
# List the fields available on the loaded dataset object. `keys` must be
# called: the bare attribute only shows the bound-method repr, not the keys.
breast_df.keys()

In [None]:
# Print the description text bundled with the dataset object.
print(breast_df.DESCR)

In [None]:
# Print the raw feature matrix (used below as the DataFrame's data).
print(breast_df.data)

In [None]:
# Print the label array (used below as the 'Target' column).
print(breast_df.target)

In [None]:
# Print the feature names (used below as the DataFrame's column labels).
print(breast_df.feature_names)

## Preparing the Dataset

In [None]:
# Wrap the feature matrix in a DataFrame, labelling columns with the feature names.
dataset = pd.DataFrame(breast_df.data, columns=breast_df.feature_names)

In [None]:
# Preview the first rows (features only at this point).
dataset.head()

In [None]:
# Append the labels as a new last column named 'Target'. Later cells rely on
# it being the final column when they slice features with iloc.
dataset['Target'] = breast_df.target

In [None]:
# Preview again, now including the 'Target' column.
dataset.head()

In [None]:
# Preview the last rows.
dataset.tail()

In [None]:
# Column dtypes and non-null counts.
dataset.info()

In [None]:
# summary of the dataset
dataset.describe()

In [None]:
# check for any missing values
# (element-wise boolean mask; the next cell reduces it to per-column counts)
dataset.isnull()

In [None]:
# Number of missing values in each column.
dataset.isnull().sum()

## Exploratory Data Analysis

In [None]:
# check the dataset correlation
# (pairwise correlation between all columns, including 'Target')
dataset.corr()

In [None]:
import seaborn as sns
# NOTE(review): pairplot draws one panel per pair of columns, and every column
# of `dataset` is passed here, so the grid is very large and slow to render.
# Consider restricting it to a few columns (seaborn's `vars=` argument).
sns.pairplot(dataset)

In [None]:
plt.scatter(dataset['mean radius'], dataset['Target'])
plt.xlabel('mean radius')
plt.ylabel('Target')

In [None]:
plt.scatter(dataset['mean texture'], dataset['Target'])
plt.xlabel('mean texture')
plt.ylabel('Target')

In [None]:
import seaborn as sns
sns.regplot(x='mean radius', y='Target', data=dataset)

### Dependent and Independent Variables

In [None]:
# Independent variables are every column except the last; the dependent
# variable is the last column (the 'Target' labels appended earlier).
x, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [None]:
# Preview the feature frame.
x.head()

In [None]:
# Display the label series.
y

## Data Splitting

In [None]:
# Train/test split: hold out 30% of the rows for testing and keep the remaining
# 70% for training. The fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=40,
)

In [None]:
# Display the training features.
x_train

In [None]:
# Display the test features.
x_test

In [None]:
# Display the training labels.
y_train

In [None]:
# Display the test labels.
y_test

## Data Standardization

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
import pickle
# Persist the fitted scaler so the same transform can be reused at prediction
# time. The context manager guarantees the file is flushed and closed; the
# previous bare open() call left the handle to be closed by garbage collection.
with open('scaling.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Display the training features after scaling.
x_train

In [None]:
# Display the test features after scaling.
x_test

# Model Training

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
regression = LinearRegression()
regression.fit(x_train, y_train)

In [None]:
# print the coefficients and the intercept
print(regression.coef_)

In [None]:
 print(regression.intercept_)

In [None]:
# know the regression parameters
# (the estimator's constructor settings, not the learned coefficients)
regression.get_params()

# Model Prediction

In [None]:
# Model Prediction on Test Data
# NOTE(review): `reg_pred` is reassigned to the training-set predictions two
# cells below, so these test-set predictions are never used by the plots and
# metrics that follow. Confirm whether test-set evaluation was intended.
reg_pred = regression.predict(x_test)

In [None]:
# Display the scaled test features that were just predicted on.
x_test

In [None]:
# Model Prediction on Training Data
# From here on `reg_pred` holds predictions for x_train.
reg_pred = regression.predict(x_train)

In [None]:
# Display the scaled training features that were just predicted on.
x_train

# Data Visualization

In [None]:
# Plot the scatterplot for the prediction on training data
plt.scatter(y_train, reg_pred) # y_test

In [None]:
print(x.shape)
print(y.shape)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Residuals (errors)
residuals = y_train-reg_pred

In [None]:
# plot the residuals
sns.displot(residuals, kind="kde")

In [None]:
# Scatter plot with respect to prediction and residuals
# uniform distribution
plt.scatter(reg_pred, residuals)

# Model Evaluation

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# , r2_score
# from math import sqrt

In [None]:
train_mae = mean_absolute_error(y_train, reg_pred)
train_mse = mean_squared_error(y_train, reg_pred)
# train_rmse = (np.sqrt(mean_squared_error(y_test, reg_pred)))
# r2_score = r2_score(y_train, reg_pred)

In [None]:
print(train_mae)
print(train_mse)
# print(r2_score)

# R-Squared and Adjusted R-Squared
## Formula
### $R^2 = 1 - \dfrac{SSR}{SST}$
### where $R^2$ is the coefficient of determination, $SSR$ is the sum of squared residuals, and $SST$ is the total sum of squares.

In [None]:
from sklearn.metrics import r2_score
# Coefficient of determination of the training-set predictions.
score = r2_score(y_true=y_train, y_pred=reg_pred)
print(score)

### Adjusted $R^2 = 1 - \dfrac{(1 - R^2)(n - 1)}{n - k - 1}$
### where
### $R^2$ is the R-squared of the model, $n$ is the number of observations, and $k$ is the number of predictor variables.

In [None]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - k - 1).
# `score` was computed from y_train and the training-set predictions, so the
# observation count n and predictor count k must come from the training split
# as well. Taking them from the test split mixes two different samples.
n_observations = len(y_train)
n_predictors = x_train.shape[1]
1 - (1 - score) * (n_observations - 1) / (n_observations - n_predictors - 1)

## New Data Prediction

In [None]:
# Display the raw feature matrix.
breast_df.data

In [None]:
# Take the first row and reshape it to a single-row 2-D array, the layout the
# scaler and model are given in the cells below.
breast_df.data[0].reshape(1,-1)

In [None]:
# Transformation on a new data
scaler.transform(breast_df.data[0].reshape(1,-1))

In [None]:
# predict the transformed data
regression.predict(scaler.transform(breast_df.data[0].reshape(1,-1)))

# Pickle the Model File For Deployment

In [None]:
import pickle

In [None]:
# Persist the fitted model for deployment. The context manager guarantees the
# file is flushed and closed; the previous bare open() call left the handle to
# be closed by garbage collection.
with open('regmodel.pkl', 'wb') as model_file:
    pickle.dump(regression, model_file)

In [None]:
# Reload the model from the file written by the previous cell. The context
# manager closes the handle deterministically.
# NOTE: only unpickle files from a trusted source, because pickle.load can
# execute arbitrary code.
with open('regmodel.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [None]:
# Check the reloaded model on the first dataset row. The row is passed to the
# scaler as a one-row DataFrame with the original column names, because the
# scaler was fitted on a DataFrame and warns about missing feature names when
# given a bare array. The values are identical to breast_df.data[0].
first_sample = pd.DataFrame(breast_df.data[[0]], columns=breast_df.feature_names)
pickled_model.predict(scaler.transform(first_sample))