In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from google.cloud import storage

In [2]:
%store -r eda_df

In [3]:
final_df = eda_df

In [4]:
display(final_df.head())

Unnamed: 0,day,month,year,hits,page_views,time_on_site,hour,minute,device,sub_continent,country,product_category,product_name,product_price,add_to_cart
0,30,11,2016,17,13,297,23,55,desktop,South America,Peru,Home/Shop by Brand/Google,Google Men's 100% Cotton Short Sleeve Hero Tee...,16.99,0
1,30,11,2016,17,13,297,23,58,desktop,South America,Peru,Home/Electronics,Google Device Holder Sticky Pad,4.99,0
2,30,11,2016,17,13,297,23,58,desktop,South America,Peru,Home/Electronics/Electronics Accessories,Grip Kit Cable Organizer,16.99,0
3,30,11,2016,17,13,297,23,58,desktop,South America,Peru,Home/Electronics,Electronics Accessory Pouch,4.99,0
4,30,11,2016,17,13,297,23,56,desktop,South America,Peru,Home/Apparel/Men's,Google Men's Watershed Full Zip Hoodie Grey,109.99,0


In [5]:
X = final_df.drop(['add_to_cart'], axis=1)
y = final_df['add_to_cart']

total_chunks = 10

# Define the size of each chunk
chunk_size = len(final_df) // total_chunks

# Initialize lists to store the chunked data
X_chunks_train = []
y_chunks_train = []

# Split the dataset into chunks
for i in range(total_chunks):
    start_index = i * chunk_size
    end_index = start_index + chunk_size if i < total_chunks - 1 else None
    
    # Extract the chunk of data
    X_chunk = X.iloc[start_index:end_index]
    y_chunk = y.iloc[start_index:end_index]
    
    # Split the chunk into train and test sets
    X_train_chunk, X_temp_chunk, y_train_chunk, y_temp_chunk = train_test_split(X_chunk, y_chunk, test_size=0.3, random_state=5)
    X_val_chunk, X_test_chunk, y_val_chunk, y_test_chunk = train_test_split(X_temp_chunk, y_temp_chunk, test_size=0.5, random_state=5)
    
    # Store the chunked data
    X_chunks_train.append(X_train_chunk)
    y_chunks_train.append(y_train_chunk)

# X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=5)

# X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=5)

In [6]:
categorical_columns = ['device', 'sub_continent', 'country', 'product_category', 'product_name']
numeric_columns = ['day', 'month', 'year', 'hits', 'page_views', 'time_on_site', 'hour', 'minute', 'product_price']

In [7]:
numerical_pipe = Pipeline(steps=[
    ('normalizer', StandardScaler())
])

In [8]:
categorical_pipe = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipe, numeric_columns),
        ('cat', categorical_pipe, categorical_columns)
    ])

In [10]:
model = KNeighborsClassifier()

In [11]:
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                                ])

In [12]:
models = []

# Train a model on each chunk and store it
for X_chunk, y_chunk in zip(X_chunks_train, y_chunks_train):
    model_pipeline.fit(X_chunk, y_chunk)
    models.append(model_pipeline)

# Evaluate the models on the validation set
validation_scores = []
for model in models:
    val_accuracy = model.score(X_val_chunk, y_val_chunk)
    validation_scores.append(val_accuracy)

In [13]:
average_validation_accuracy = sum(validation_scores) / len(validation_scores)
print("Average Validation Accuracy:", average_validation_accuracy)

Average Validation Accuracy: 0.99331644477982


In [17]:
model_2 = SVC()

In [18]:
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model_2)
                                ])

In [19]:
models = []

# Train a model on each chunk and store it
for X_chunk, y_chunk in zip(X_chunks_train, y_chunks_train):
    model_pipeline.fit(X_chunk, y_chunk)
    models.append(model_pipeline)

# Evaluate the models on the validation set
validation_scores = []
for model in models:
    val_accuracy = model.score(X_val_chunk, y_val_chunk)
    validation_scores.append(val_accuracy)

In [20]:
average_validation_accuracy = sum(validation_scores) / len(validation_scores)
print("Average Validation Accuracy:", average_validation_accuracy)

Average Validation Accuracy: 0.9939093408074167


In [21]:
model_3 = DecisionTreeClassifier()

In [22]:
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model_3)
                                ])

In [23]:
models = []

# Train a model on each chunk and store it
for X_chunk, y_chunk in zip(X_chunks_train, y_chunks_train):
    model_pipeline.fit(X_chunk, y_chunk)
    models.append(model_pipeline)

# Evaluate the models on the validation set
validation_scores = []
for model in models:
    val_accuracy = model.score(X_val_chunk, y_val_chunk)
    validation_scores.append(val_accuracy)

In [24]:
average_validation_accuracy = sum(validation_scores) / len(validation_scores)
print("Average Validation Accuracy:", average_validation_accuracy)

Average Validation Accuracy: 0.9930469465854579
