In [10]:
import pandas as pd

# Remote location of the raw text file that every later cell streams from
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00246/3D_spatial_network.txt"

# The file is read with header=None, so the column labels are supplied here
columns = ['OSM_ID', 'LONGITUDE', 'LATITUDE', 'ALTITUDE']

# Number of rows returned per chunk when the file is streamed
chunksize = 20000

# Open a chunked reader and display it; no rows are consumed in this cell
# (the cells below each open their own fresh reader).
data_iterator = pd.read_csv(
    url,
    header=None,
    names=columns,
    chunksize=chunksize,
    iterator=True,
)
data_iterator

<pandas.io.parsers.readers.TextFileReader at 0x7b5bc0b02230>

In [14]:
from sklearn.preprocessing import StandardScaler

# Scaler whose statistics are accumulated incrementally, one chunk at a time,
# so the whole file never has to be loaded at once.
scaler = StandardScaler()

# Fit on all three numeric columns (ALTITUDE included): the training cell
# calls scaler.transform on the same three columns, so the fitted width
# must stay at three.
# The `with` block closes the underlying stream even if parsing or
# partial_fit raises part-way through the file. `chunksize` alone already
# makes read_csv return a chunk reader, so `iterator=True` is not needed.
with pd.read_csv(url, header=None, names=columns, chunksize=chunksize) as data_iterator:
    for chunk in data_iterator:
        X_chunk = chunk[['LONGITUDE', 'LATITUDE', 'ALTITUDE']].values
        scaler.partial_fit(X_chunk)


In [15]:
from sklearn.linear_model import SGDRegressor

# Number of chunks to train on before stopping early
N_TRAIN_CHUNKS = 7

# Linear model trained incrementally; fixed seed for repeatable results
sgd_regressor = SGDRegressor(random_state=10)

# Stream the file again from the start. Training stops before the file is
# exhausted, so the reader is opened in a `with` block to make sure the
# underlying connection is closed when the loop breaks out early.
with pd.read_csv(url, header=None, names=columns, chunksize=chunksize) as data_iterator:
    for iteration, chunk in enumerate(data_iterator, start=1):
        # The scaler was fit on three columns, so all three are passed to
        # transform even though only two are used as model inputs.
        X_chunk = chunk[['LONGITUDE', 'LATITUDE', 'ALTITUDE']].values
        X_chunk_scaled = scaler.transform(X_chunk)

        # Model inputs: scaled LONGITUDE and LATITUDE (first two columns).
        X_chunk_model = X_chunk_scaled[:, :2]
        # Target: ALTITUDE taken from the chunk itself, i.e. unscaled.
        y_chunk = chunk['ALTITUDE'].values

        # One incremental update of the model on this chunk
        sgd_regressor.partial_fit(X_chunk_model, y_chunk)

        # Stop once the requested number of chunks has been used
        if iteration == N_TRAIN_CHUNKS:
            break

In [17]:
# Shape of the X_chunk array left in the kernel by whichever chunk loop ran
# last; the value shown therefore depends on cell execution order.
X_chunk.shape

(20000, 3)

In [18]:
# Bias term of the partially trained model (stored as an array by the estimator)
fitted_intercept = sgd_regressor.intercept_
print("Intercept after 7th iteration:", fitted_intercept)


Intercept after 7th iteration: [21.39333846]


In [19]:
# The model inputs were [LONGITUDE, LATITUDE] in that order, so the weight
# at index 0 belongs to longitude.
fitted_weights = sgd_regressor.coef_
longitude_coefficient = fitted_weights[0]
print("Coefficient for longitude after 7th iteration:", longitude_coefficient)


Coefficient for longitude after 7th iteration: 4.792474397339679


In [20]:
# Count the rows of the file without loading it whole: stream it in chunks
# and add up the chunk lengths.
total_samples = 0

# The `with` block closes the underlying stream even if reading fails
# part-way through. `chunksize` alone already makes read_csv return a chunk
# reader, so `iterator=True` is not needed.
with pd.read_csv(url, header=None, names=columns, chunksize=chunksize) as data_iterator:
    for chunk in data_iterator:
        total_samples += len(chunk)

print("Total number of samples (rows):", total_samples)


Total number of samples (rows): 434874
