In [7]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# The StandardScaler from scikit-learn is a preprocessing step often used in machine learning workflows, including
# with models like SGDRegressor. It's used for standardizing features by removing the mean and scaling to unit variance.
# When you scale your data using StandardScaler or any other scaling method, it's important to remember how to inverse
# transform it back to the original scale, especially when you want to interpret your model's predictions in terms of the original data.
from sklearn.metrics import mean_squared_error, r2_score



In [11]:
!pip install jupyterthemes

Collecting jupyterthemes
  Using cached jupyterthemes-0.20.0-py2.py3-none-any.whl (7.0 MB)
Collecting lesscpy>=0.11.2 (from jupyterthemes)
  Obtaining dependency information for lesscpy>=0.11.2 from https://files.pythonhosted.org/packages/2a/da/4a20ba69c9c71ce3d522da5a3c617254d1d1430b71f52b7d46dbf2c06b96/lesscpy-0.15.1-py2.py3-none-any.whl.metadata
  Downloading lesscpy-0.15.1-py2.py3-none-any.whl.metadata (6.0 kB)
Using cached lesscpy-0.15.1-py2.py3-none-any.whl (46 kB)
Installing collected packages: lesscpy, jupyterthemes
Successfully installed jupyterthemes-0.20.0 lesscpy-0.15.1


In [5]:
!pip show jupyterthemes

Name: jupyterthemes
Version: 0.20.0
Summary: Select and install a Jupyter notebook theme
Home-page: https://github.com/dunovank/jupyter-themes
Author: dunovank
Author-email: dunovank@gmail.com
License: MIT
Location: D:\anaco\Lib\site-packages
Requires: ipython, jupyter-core, lesscpy, matplotlib, notebook
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [12]:
!jt -r

Reset css and font defaults in:
C:\Users\Yuvraj Singh Rathore\.jupyter\custom &
C:\Users\Yuvraj Singh Rathore\AppData\Roaming\jupyter\nbextensions


In [8]:
#LOAD THE DATA
california_housing = fetch_california_housing()
X = california_housing.data
y = california_housing.target



In [9]:
#Split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
scaler = StandardScaler()
#Create an instance of StandardScaler:

In [5]:
'''<!-- batch_size = 20# each batch contain 20 rows of data 
num_batches = len(X_train) // batch_size # // This operation divides one number by another and returns the integer part of the result, discarding any fractional part.
# in our batch num_Bathces=825
# Create  an instance of the SGDRegressor class from sklearn.linear_model
sgd_regressor = SGDRegressor(max_iter=1, alpha=0.0001, random_state=42)



# This parameter defines the maximum number of iterations (epochs) that the model will undergo. In this case, it's 
# set to 1. This means that the model will only see each batch of data once during training.
# If you have 1000 samples in your dataset and you set batch_size to 100:
# It will take 10 iterations (batches) to complete one epoch (1000 samples / 100 batch size = 10 iterations).
# Multiple epochs allow the model to learn from the entire dataset multiple times, improving its performance with each pass.
# With a small alpha:

# The curve might wiggle a lot to fit each point exactly. This could be good for complex patterns in the data, but it might also be too specific to the training data. This can lead to overfitting.
# With a large alpha:

# The curve might be a simpler, smoother line. It might not fit each point perfectly, but it captures the overall
# trend of the data. This helps prevent overfitting and makes the model generalize better to new, unseen data.

# "more complex models might need stronger regularization," we mean that these complex models tend to 
# learn intricate patterns and details from the training data, which can lead to overfitting. To counteract this
# tendency, we often need stronger regularization, which translates to a larger alpha value.
for epoch in range(num_batches):
# This loop iterates over the number of batches (num_batches) you have defined.
# Each iteration represents one "epoch" where the model sees the entire dataset in smaller batches.
    start_idx = epoch * batch_size
    end_idx = (epoch + 1) * batch_size
#epoch + 1 is used to calculate the ending index of each batch, ensuring correct and non-overlapping slicing of the
    # dataset for training in mini-batch scenarios.

    X_batch = X_train[start_idx:end_idx]
    y_batch = y_train[start_idx:end_idx]

    # Scale the batch
    X_batch_scaled = scaler.fit_transform(X_batch)

    # Partial fit on the scaled batch
    sgd_regressor.partial_fit(X_batch_scaled, y_batch)

# If there are remaining samples that don't fit into a full batch
if len(X_train) % batch_size != 0:
    X_remaining = X_train[num_batches * batch_size:]
    # This slices the remaining samples starting from the index where the last full batch ends.
    y_remaining = y_train[num_batches * batch_size:]
    
    X_remaining_scaled = scaler.fit_transform(X_remaining)
    
    sgd_regressor.partial_fit(X_remaining_scaled, y_remaining)#partial fit remaining batches
 -->

# if you have 1000 training examples and your batch size is 500 then it will take 2 iterations to complete 1 epoch.
 '''

'<!-- batch_size = 20# each batch contain 20 rows of data \nnum_batches = len(X_train) // batch_size # // This operation divides one number by another and returns the integer part of the result, discarding any fractional part.\n# in our batch num_Bathces=825\n# Create  an instance of the SGDRegressor class from sklearn.linear_model\nsgd_regressor = SGDRegressor(max_iter=1, alpha=0.0001, random_state=42)\n\n\n\n# This parameter defines the maximum number of iterations (epochs) that the model will undergo. In this case, it\'s \n# set to 1. This means that the model will only see each batch of data once during training.\n# If you have 1000 samples in your dataset and you set batch_size to 100:\n# It will take 10 iterations (batches) to complete one epoch (1000 samples / 100 batch size = 10 iterations).\n# Multiple epochs allow the model to learn from the entire dataset multiple times, improving its performance with each pass.\n# With a small alpha:\n\n# The curve might wiggle a lot to fit

In [10]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Load the California Housing dataset
california_housing = fetch_california_housing()
X = california_housing.data
y = california_housing.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the StandardScaler instance
scaler = StandardScaler()

# Train in Batches using partial_fit
batch_size = 20
num_batches = len(X_train) // batch_size

# Create the SGDRegressor
sgd_regressor = SGDRegressor(max_iter=1, alpha=0.0001, random_state=42)

for epoch in range(num_batches):
    start_idx = epoch * batch_size
    end_idx = (epoch + 1) * batch_size

    X_batch = X_train[start_idx:end_idx]
    y_batch = y_train[start_idx:end_idx]

    # Scale the batch
    X_batch_scaled = scaler.fit_transform(X_batch)

    # Partial fit on the scaled batch
    sgd_regressor.partial_fit(X_batch_scaled, y_batch)

# If there are remaining samples that don't fit into a full batch
if len(X_train) % batch_size != 0:
    X_remaining = X_train[num_batches * batch_size:]
    y_remaining = y_train[num_batches * batch_size:]

    # Scale the remaining samples using the already fitted scaler
    X_remaining_scaled = scaler.transform(X_remaining)
    
    # Partial fit on the scaled remaining samples
    sgd_regressor.partial_fit(X_remaining_scaled, y_remaining)

# Make predictions on the test data
X_test_scaled = scaler.transform(X_test)
predictions = sgd_regressor.predict(X_test_scaled)

# Create a DataFrame for predicted and actual values
results = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

# Print the predicted and actual table
print(results)

# If you want to continue training the same SGDRegressor model after making predictions, you can simply call the 
# partial_fit method again on new batches of data. This is similar to the initial training process. you can certainly change the batch size when training your model using partial_fit
#use library river and vowpal wabbit


Mean Squared Error: 1.0112121506645142
R^2 Score: 0.22832324300618656
       Actual  Predicted
0     0.47700   0.257650
1     0.45800   1.729494
2     5.00001   3.130029
3     2.18600   2.601453
4     2.78000   2.414003
...       ...        ...
4123  2.63300   1.997936
4124  2.66800   2.201621
4125  5.00001   4.501551
4126  0.72300   1.305187
4127  1.51500   1.566215

[4128 rows x 2 columns]
