In [1]:

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK data packages needed by the text-preprocessing cells.
nltk.download('punkt')  # tokenizer models (word_tokenize)
nltk.download('stopwords')  # stop-word lists (stopwords.words)
nltk.download('averaged_perceptron_tagger')  # POS tagger model (pos_tag)
nltk.download('wordnet')  # WordNet data (WordNetLemmatizer)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\OWNER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OWNER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\OWNER\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data] 

True

In [2]:
# Path of the input CSV, relative to the working directory.
file_path = 'alpha2_dataset_cleaned.csv'

# Read the file and fill missing entries with pd.NA in one chained step.
df = pd.read_csv(file_path).fillna(pd.NA)

# Sanity check: first rows, then the per-column summary.
print(df.head())
df.info()

                                              name      type  price  \
0                Duracell - AAA Batteries (4-Pack)  HardGood   5.49   
1  Duracell - AA 1.5V CopperTop Batteries (4-Pack)  HardGood   5.49   
2                 Duracell - AA Batteries (8-Pack)  HardGood   7.49   
3            Energizer - MAX Batteries AA (4-Pack)  HardGood   4.99   
4                  Duracell - C Batteries (4-Pack)  HardGood   8.99   

                                         description manufacturer  \
0  Compatible with select electronic devices; AAA...     Duracell   
1  Long-lasting energy; DURALOCK Power Preserve t...     Duracell   
2  Compatible with select electronic devices; AA ...     Duracell   
3  4-pack AA alkaline batteries; battery tester i...    Energizer   
4  Compatible with select electronic devices; C s...     Duracell   

                                           url              parent_category  \
0                duracell aaa batteries 4 pack  Connected Home & Housewares   


In [3]:
# stemmer, lemmatizer and stopwords
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from typing import Optional

# Shared NLTK objects used by normalize_text: the lemmatizer and the English
# stop-word set (a set, so membership tests are cheap).
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def get_wordnet_pos(tag):
    """Translate a POS tag into the single-letter code given to the lemmatizer.

    Args:
        tag: POS tag string; only its leading character is inspected.

    Returns:
        'a' for tags starting with 'J', 'v' for 'V', 'n' for 'N',
        'r' for 'R', and 'n' for every other tag.
    """
    prefix_to_pos = (
        ('J', 'a'),  # Adjective
        ('V', 'v'),  # Verb
        ('N', 'n'),  # Noun
        ('R', 'r'),  # Adverb
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return 'n'  # Default to noun if not recognized

def remove_extra_new_lines(text):
    """Join the non-blank lines of ``text`` with single spaces.

    Args:
        text: Any value; it is converted with ``str()`` unless it is null.

    Returns:
        An empty string when ``pd.isnull(text)`` is true, otherwise the lines
        that contain at least one non-whitespace character, joined by ' '.
    """
    if pd.isnull(text):
        # Missing cell: substitute an empty string.
        return ''

    return ' '.join(line for line in str(text).splitlines() if line.strip())

def remove_extra_whitespace(text: str) -> str:
    """Collapse every run of whitespace in ``text`` into one space character.

    Leading and trailing whitespace is collapsed as well, not stripped.
    """
    return re.compile(r'\s+').sub(' ', text)

def remove_special_chars(text: str, remove_digits: Optional[bool] = False) -> str:
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: Input string.
        remove_digits: When truthy, digits are deleted as well.

    Returns:
        ``text`` without the disallowed characters. Characters are removed,
        not replaced by a space, so neighbouring fragments are joined
        (e.g. "4-Pack" becomes "4Pack").
    """
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

def normalize_text(text):
    """Clean a raw text value and return its lemmatised, stop-word-free form.

    Steps, in order: drop blank lines, collapse whitespace, delete special
    characters (digits are kept at this point), tokenize, keep only the
    alphabetic tokens that are not stop words (lower-cased), POS-tag them,
    and lemmatise each token with the WordNet POS derived from its tag.

    Args:
        text: Raw cell value; nulls are handled by ``remove_extra_new_lines``.

    Returns:
        The lemmas joined by single spaces.
    """
    cleaned = remove_special_chars(
        remove_extra_whitespace(remove_extra_new_lines(text)),
        remove_digits=False,
    )

    kept = []
    for token in word_tokenize(cleaned):
        lowered = token.lower()
        # isalpha() rejects any token that still contains a digit.
        if token.isalpha() and lowered not in stop_words:
            kept.append(lowered)

    return ' '.join(
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(kept)
    )

In [4]:
# Text columns that each get a cleaned "<column>_normalized" companion column.
normalization = ['name', 'description']
for column in normalization:
    df[f'{column}_normalized'] = df[column].apply(normalize_text)

print(df.shape)

(50041, 13)


In [5]:
# Inspect the first row, including the newly added *_normalized columns.
print(df.iloc[0])


name                                      Duracell - AAA Batteries (4-Pack)
type                                                               HardGood
price                                                                  5.49
description               Compatible with select electronic devices; AAA...
manufacturer                                                       Duracell
url                                           duracell aaa batteries 4 pack
parent_category                                 Connected Home & Housewares
sub_category_1                                                   Housewares
sub_category_2                                          Household Batteries
sub_category_3                                           Alkaline Batteries
sub_category_4                                                         <NA>
name_normalized                                        duracell aaa battery
description_normalized    compatible select electronic device aaa size d...
Name: 0, dty

In [6]:
# Disabled: filling missing sub-category values with '0' directly on df.
# (The later target cells fill each sub-category with 'missing' instead.)
#df['sub_category_1'].fillna('0', inplace=True)
#df['sub_category_2'].fillna('0', inplace=True)
#df['sub_category_3'].fillna('0', inplace=True)
#df['sub_category_4'].fillna('0', inplace=True)
# List the distinct values of parent_category.
print(df['parent_category'].unique())

['Connected Home & Housewares' 'other' 'Car Electronics & GPS'
 'In-Store Only' 'Musical Instruments' 'Toys' 'Video Games'
 'Cameras & Camcorders' 'Computers & Tablets' 'Appliances' 'Audio'
 'TV & Home Theater' 'Health' 'Name Brands' 'Cell Phones' 'Movies & Music'
 'Magnolia Home Theater' 'Geek Squad' 'Best Buy Gift Cards'
 'H/VG_X360/Games/B2G1_20130602' 'MP Exclusives' 'Wearable Technology'
 'Custom Parts']


In [7]:
import h5py

print(df.head())

# Stage 1 target is parent_category; every other column stays in X.
y = df['parent_category']
X = df.drop(columns=['parent_category'])

# Integer-encode the target labels.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Store the encoder's classes_ array in an HDF5 file.
with h5py.File('label_encoder.h5', 'w') as hf:
    hf.create_dataset('label_encoder', data=label_encoder.classes_)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

                                              name      type  price  \
0                Duracell - AAA Batteries (4-Pack)  HardGood   5.49   
1  Duracell - AA 1.5V CopperTop Batteries (4-Pack)  HardGood   5.49   
2                 Duracell - AA Batteries (8-Pack)  HardGood   7.49   
3            Energizer - MAX Batteries AA (4-Pack)  HardGood   4.99   
4                  Duracell - C Batteries (4-Pack)  HardGood   8.99   

                                         description manufacturer  \
0  Compatible with select electronic devices; AAA...     Duracell   
1  Long-lasting energy; DURALOCK Power Preserve t...     Duracell   
2  Compatible with select electronic devices; AA ...     Duracell   
3  4-pack AA alkaline batteries; battery tester i...    Energizer   
4  Compatible with select electronic devices; C s...     Duracell   

                                           url              parent_category  \
0                duracell aaa batteries 4 pack  Connected Home & Housewares   


In [8]:
# Stage 2 target is sub_category_1; every other column stays in X_1.
X_1 = df.drop(columns=['sub_category_1'])
# Fill missing labels on a new Series instead of in place: an in-place fillna
# on a column selection may or may not write through to df depending on the
# pandas copy/view mode, which silently changes what later cells read from df.
y_1 = df['sub_category_1'].fillna('missing')

# Integer-encode the target labels.
label_encoder_1 = LabelEncoder()
y_1_encoded = label_encoder_1.fit_transform(y_1)

# Store the encoder's classes_ array in an HDF5 file.
with h5py.File('label_encoder_1.h5', 'w') as hf:
    hf.create_dataset('label_encoder_1', data=label_encoder_1.classes_)

# 80/20 train/test split with a fixed seed.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1_encoded, test_size=0.2, random_state=42)

In [9]:
# Stage 3 target is sub_category_2; every other column stays in X_2.
X_2 = df.drop(columns=['sub_category_2'])
# Fill missing labels on a new Series instead of in place: an in-place fillna
# on a column selection may or may not write through to df depending on the
# pandas copy/view mode, which silently changes what later cells read from df.
y_2 = df['sub_category_2'].fillna('missing')

# Integer-encode the target labels.
label_encoder_2 = LabelEncoder()
y_2_encoded = label_encoder_2.fit_transform(y_2)

# Store the encoder's classes_ array in an HDF5 file.
with h5py.File('label_encoder_2.h5', 'w') as hf:
    hf.create_dataset('label_encoder_2', data=label_encoder_2.classes_)

# 80/20 train/test split with a fixed seed.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2_encoded, test_size=0.2, random_state=42)

In [10]:
# Stage 4 target is sub_category_3; every other column stays in X_3.
X_3 = df.drop(columns=['sub_category_3'])
# Fill missing labels on a new Series instead of in place: an in-place fillna
# on a column selection may or may not write through to df depending on the
# pandas copy/view mode, which silently changes what later cells read from df.
y_3 = df['sub_category_3'].fillna('missing')

# Integer-encode the target labels.
label_encoder_3 = LabelEncoder()
y_3_encoded = label_encoder_3.fit_transform(y_3)

# Store the encoder's classes_ array in an HDF5 file.
with h5py.File('label_encoder_3.h5', 'w') as hf:
    hf.create_dataset('label_encoder_3', data=label_encoder_3.classes_)

# 80/20 train/test split with a fixed seed.
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_3, y_3_encoded, test_size=0.2, random_state=42)

In [11]:
# Stage 5 target is sub_category_4; every other column stays in X_4.
X_4 = df.drop(columns=['sub_category_4'])
# Fill missing labels on a new Series instead of in place: an in-place fillna
# on a column selection may or may not write through to df depending on the
# pandas copy/view mode, which silently changes what later cells read from df.
y_4 = df['sub_category_4'].fillna('missing')

# Integer-encode the target labels.
label_encoder_4 = LabelEncoder()
y_4_encoded = label_encoder_4.fit_transform(y_4)

# Store the encoder's classes_ array in an HDF5 file.
with h5py.File('label_encoder_4.h5', 'w') as hf:
    hf.create_dataset('label_encoder_4', data=label_encoder_4.classes_)

# 80/20 train/test split with a fixed seed.
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X_4, y_4_encoded, test_size=0.2, random_state=42)

In [12]:
# Preview the stage 5 feature frame (df without the sub_category_4 column).
print(X_4.head())


                                              name      type  price  \
0                Duracell - AAA Batteries (4-Pack)  HardGood   5.49   
1  Duracell - AA 1.5V CopperTop Batteries (4-Pack)  HardGood   5.49   
2                 Duracell - AA Batteries (8-Pack)  HardGood   7.49   
3            Energizer - MAX Batteries AA (4-Pack)  HardGood   4.99   
4                  Duracell - C Batteries (4-Pack)  HardGood   8.99   

                                         description manufacturer  \
0  Compatible with select electronic devices; AAA...     Duracell   
1  Long-lasting energy; DURALOCK Power Preserve t...     Duracell   
2  Compatible with select electronic devices; AA ...     Duracell   
3  4-pack AA alkaline batteries; battery tester i...    Energizer   
4  Compatible with select electronic devices; C s...     Duracell   

                                           url              parent_category  \
0                duracell aaa batteries 4 pack  Connected Home & Housewares   


In [13]:
# Shapes of the stage 1 split, one per line.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')
# Normalised names of the training rows.
print(X_train['name_normalized'])

(40032, 12)
(10009, 12)
(40032,)
(10009,)
9233                            star fox preowned nintendo
25631    pioneer networkready ultra hd passthrough av h...
19030                     evolve ultimate edition xbox one
12044    joby pro series ultraplate quickrelease plate ...
18967    aluratek bump w home audio speaker system ipod...
                               ...                        
11284    samsung class diag lead curved smart ultra hd ...
44732    hifonics brutus class mono mosfet subwoofer am...
38158    mobile edge premium laptop backpack apple macb...
860                                 presonus presonus gray
15795      insignia portable bluetooth stereo speaker blue
Name: name_normalized, Length: 40032, dtype: object


In [14]:
# Display the integer-encoded stage 1 training labels.
y_train

array([20,  1, 20, ...,  6, 16,  1])

One-Hot Encoder and Scaler


Stage 1

In [15]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pickle

# Column groups for stage 1 (numerical_columns is reused by the later stages)
categorical_columns = ['type', 'manufacturer']
numerical_columns = ['price']
text_columns = ['name_normalized', 'description_normalized']

# Replace missing categorical values with the literal 'missing' in both splits
for split in (X_train, X_test):
    split[categorical_columns] = split[categorical_columns].fillna('missing')

# Fit the one-hot encoder on the training split only, then encode both splits
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train[categorical_columns])
X_train_encoded = encoder.transform(X_train[categorical_columns])
X_test_encoded = encoder.transform(X_test[categorical_columns])

# Persist the fitted encoder
with open('encoder.pkl', 'wb') as fh:
    pickle.dump(encoder, fh)

# Report shape, container type and element dtype of the encoded matrices
print("Shape of X_train_encoded:", X_train_encoded.shape)
print("Data type of X_train_encoded:", type(X_train_encoded))
print("Data type of elements in X_train_encoded:", X_train_encoded.dtype)
print("Shape of X_test_encoded:", X_test_encoded.shape)
print("Data type of X_test_encoded:", type(X_test_encoded))
print("Data type of elements in X_test_encoded:", X_test_encoded.dtype)

# Standardise the numerical features with statistics from the training split
scaler = StandardScaler()
scaler.fit(X_train[numerical_columns])
X_train_scaled = scaler.transform(X_train[numerical_columns])
X_test_scaled = scaler.transform(X_test[numerical_columns])

# Persist the fitted scaler
with open('scaler.pkl', 'wb') as fh:
    pickle.dump(scaler, fh)

# Report shape and element dtype of the scaled arrays
print("Shape of X_train_scaled:", X_train_scaled.shape)
print("Data type of elements in X_train_scaled:", X_train_scaled.dtype)
print("Shape of X_test_scaled:", X_test_scaled.shape)
print("Data type of elements in X_test_scaled:", X_test_scaled.dtype)

Shape of X_train_encoded: (40032, 2195)
Data type of X_train_encoded: <class 'scipy.sparse._csr.csr_matrix'>
Data type of elements in X_train_encoded: float64
Shape of X_test_encoded: (10009, 2195)
Data type of X_test_encoded: <class 'scipy.sparse._csr.csr_matrix'>
Data type of elements in X_test_encoded: float64
Shape of X_train_scaled: (40032, 1)
Data type of elements in X_train_scaled: float64
Shape of X_test_scaled: (10009, 1)
Data type of elements in X_test_scaled: float64


Stage 2

In [16]:
# Stage 2 categorical inputs: the stage 1 columns plus parent_category.
# numerical_columns (['price']) is reused from the stage 1 cell.
categorical_columns_1 = ['type', 'manufacturer', 'parent_category']

# Replace missing categorical values with the literal 'missing' in both splits
for split in (X_train_1, X_test_1):
    split[categorical_columns_1] = split[categorical_columns_1].fillna('missing')

# Fit the one-hot encoder on the training split only, then encode both splits
encoder_1 = OneHotEncoder(handle_unknown='ignore')
encoder_1.fit(X_train_1[categorical_columns_1])
X_train_encoded_1 = encoder_1.transform(X_train_1[categorical_columns_1])
X_test_encoded_1 = encoder_1.transform(X_test_1[categorical_columns_1])

# Persist the fitted encoder
with open('encoder_1.pkl', 'wb') as fh:
    pickle.dump(encoder_1, fh)

# Report shape, container type and element dtype of the encoded matrices
print("Shape of X_train_encoded_1:", X_train_encoded_1.shape)
print("Data type of X_train_encoded_1:", type(X_train_encoded_1))
print("Data type of elements in X_train_encoded_1:", X_train_encoded_1.dtype)
print("Shape of X_test_encoded_1:", X_test_encoded_1.shape)
print("Data type of X_test_encoded_1:", type(X_test_encoded_1))
print("Data type of elements in X_test_encoded_1:", X_test_encoded_1.dtype)

# Standardise the numerical features with statistics from the training split
scaler_1 = StandardScaler()
scaler_1.fit(X_train_1[numerical_columns])
X_train_scaled_1 = scaler_1.transform(X_train_1[numerical_columns])
X_test_scaled_1 = scaler_1.transform(X_test_1[numerical_columns])

# Persist the fitted scaler
with open('scaler_1.pkl', 'wb') as fh:
    pickle.dump(scaler_1, fh)

# Report shape and element dtype of the scaled arrays
print("Shape of X_train_scaled_1:", X_train_scaled_1.shape)
print("Data type of elements in X_train_scaled_1:", X_train_scaled_1.dtype)
print("Shape of X_test_scaled_1:", X_test_scaled_1.shape)
print("Data type of elements in X_test_scaled_1:", X_test_scaled_1.dtype)

Shape of X_train_encoded_1: (40032, 2218)
Data type of X_train_encoded_1: <class 'scipy.sparse._csr.csr_matrix'>
Data type of elements in X_train_encoded_1: float64
Shape of X_test_encoded_1: (10009, 2218)
Data type of X_test_encoded_1: <class 'scipy.sparse._csr.csr_matrix'>
Data type of elements in X_test_encoded_1: float64
Shape of X_train_scaled_1: (40032, 1)
Data type of elements in X_train_scaled_1: float64
Shape of X_test_scaled_1: (10009, 1)
Data type of elements in X_test_scaled_1: float64


Stage 3

In [17]:
# Stage 3 categorical inputs: the stage 2 columns plus sub_category_1.
# numerical_columns (['price']) is reused from the stage 1 cell.
categorical_columns_2 = ['type', 'manufacturer', 'parent_category', 'sub_category_1']

# Replace missing categorical values with the literal 'missing' in both splits
for split in (X_train_2, X_test_2):
    split[categorical_columns_2] = split[categorical_columns_2].fillna('missing')

# Fit the one-hot encoder on the training split only, then encode both splits
encoder_2 = OneHotEncoder(handle_unknown='ignore')
encoder_2.fit(X_train_2[categorical_columns_2])
X_train_encoded_2 = encoder_2.transform(X_train_2[categorical_columns_2])
X_test_encoded_2 = encoder_2.transform(X_test_2[categorical_columns_2])

# Persist the fitted encoder
with open('encoder_2.pkl', 'wb') as fh:
    pickle.dump(encoder_2, fh)

# Report shape, container type and element dtype of the encoded matrices
print("Shape of X_train_encoded_2:", X_train_encoded_2.shape)
print("Data type of X_train_encoded_2:", type(X_train_encoded_2))
print("Data type of elements in X_train_encoded_2:", X_train_encoded_2.dtype)
print("Shape of X_test_encoded_2:", X_test_encoded_2.shape)
print("Data type of X_test_encoded_2:", type(X_test_encoded_2))
print("Data type of elements in X_test_encoded_2:", X_test_encoded_2.dtype)

# Standardise the numerical features with statistics from the training split
scaler_2 = StandardScaler()
scaler_2.fit(X_train_2[numerical_columns])
X_train_scaled_2 = scaler_2.transform(X_train_2[numerical_columns])
X_test_scaled_2 = scaler_2.transform(X_test_2[numerical_columns])

# Persist the fitted scaler
with open('scaler_2.pkl', 'wb') as fh:
    pickle.dump(scaler_2, fh)

# Report shape and element dtype of the scaled arrays
print("Shape of X_train_scaled_2:", X_train_scaled_2.shape)
print("Data type of elements in X_train_scaled_2:", X_train_scaled_2.dtype)
print("Shape of X_test_scaled_2:", X_test_scaled_2.shape)
print("Data type of elements in X_test_scaled_2:", X_test_scaled_2.dtype)

Shape of X_train_encoded_2: (40032, 2332)
Data type of X_train_encoded_2: <class 'scipy.sparse._csr.csr_matrix'>
Data type of elements in X_train_encoded_2: float64
Shape of X_test_encoded_2: (10009, 2332)
Data type of X_test_encoded_2: <class 'scipy.sparse._csr.csr_matrix'>
Data type of elements in X_test_encoded_2: float64
Shape of X_train_scaled_2: (40032, 1)
Data type of elements in X_train_scaled_2: float64
Shape of X_test_scaled_2: (10009, 1)
Data type of elements in X_test_scaled_2: float64


Stage 4

In [18]:
# Stage 4 categorical inputs: the stage 3 columns plus sub_category_2.
# numerical_columns (['price']) is reused from the stage 1 cell.
categorical_columns_3 = ['type', 'manufacturer', 'parent_category', 'sub_category_1', 'sub_category_2']

# Replace missing categorical values with the literal 'missing' in both splits
for split in (X_train_3, X_test_3):
    split[categorical_columns_3] = split[categorical_columns_3].fillna('missing')

# Fit the one-hot encoder on the training split only, then encode both splits
encoder_3 = OneHotEncoder(handle_unknown='ignore')
encoder_3.fit(X_train_3[categorical_columns_3])
X_train_encoded_3 = encoder_3.transform(X_train_3[categorical_columns_3])
X_test_encoded_3 = encoder_3.transform(X_test_3[categorical_columns_3])

# Persist the fitted encoder
with open('encoder_3.pkl', 'wb') as fh:
    pickle.dump(encoder_3, fh)

# Report shape, container type and element dtype of the encoded matrices
print("Shape of X_train_encoded_3:", X_train_encoded_3.shape)
print("Data type of X_train_encoded_3:", type(X_train_encoded_3))
print("Data type of elements in X_train_encoded_3:", X_train_encoded_3.dtype)
print("Shape of X_test_encoded_3:", X_test_encoded_3.shape)
print("Data type of X_test_encoded_3:", type(X_test_encoded_3))
print("Data type of elements in X_test_encoded_3:", X_test_encoded_3.dtype)

# Standardise the numerical features with statistics from the training split
scaler_3 = StandardScaler()
scaler_3.fit(X_train_3[numerical_columns])
X_train_scaled_3 = scaler_3.transform(X_train_3[numerical_columns])
X_test_scaled_3 = scaler_3.transform(X_test_3[numerical_columns])

# Persist the fitted scaler
with open('scaler_3.pkl', 'wb') as fh:
    pickle.dump(scaler_3, fh)

# Report shape and element dtype of the scaled arrays
print("Shape of X_train_scaled_3:", X_train_scaled_3.shape)
print("Data type of elements in X_train_scaled_3:", X_train_scaled_3.dtype)
print("Shape of X_test_scaled_3:", X_test_scaled_3.shape)
print("Data type of elements in X_test_scaled_3:", X_test_scaled_3.dtype)

Shape of X_train_encoded_3: (40032, 2681)
Data type of X_train_encoded_3: <class 'scipy.sparse._csr.csr_matrix'>
Data type of elements in X_train_encoded_3: float64
Shape of X_test_encoded_3: (10009, 2681)
Data type of X_test_encoded_3: <class 'scipy.sparse._csr.csr_matrix'>
Data type of elements in X_test_encoded_3: float64
Shape of X_train_scaled_3: (40032, 1)
Data type of elements in X_train_scaled_3: float64
Shape of X_test_scaled_3: (10009, 1)
Data type of elements in X_test_scaled_3: float64


Stage 5

In [19]:
# Stage 5 categorical inputs: the stage 4 columns plus sub_category_3.
# numerical_columns (['price']) is reused from the stage 1 cell.
categorical_columns_4 = ['type', 'manufacturer', 'parent_category', 'sub_category_1', 'sub_category_2', 'sub_category_3']

# Replace missing categorical values with the literal 'missing' in both splits
for split in (X_train_4, X_test_4):
    split[categorical_columns_4] = split[categorical_columns_4].fillna('missing')

# Fit the one-hot encoder on the training split only, then encode both splits
encoder_4 = OneHotEncoder(handle_unknown='ignore')
encoder_4.fit(X_train_4[categorical_columns_4])
X_train_encoded_4 = encoder_4.transform(X_train_4[categorical_columns_4])
X_test_encoded_4 = encoder_4.transform(X_test_4[categorical_columns_4])

# Persist the fitted encoder
with open('encoder_4.pkl', 'wb') as fh:
    pickle.dump(encoder_4, fh)

# Report shape, container type and element dtype of the encoded matrices
print("Shape of X_train_encoded_4:", X_train_encoded_4.shape)
print("Data type of X_train_encoded_4:", type(X_train_encoded_4))
print("Data type of elements in X_train_encoded_4:", X_train_encoded_4.dtype)
print("Shape of X_test_encoded_4:", X_test_encoded_4.shape)
print("Data type of X_test_encoded_4:", type(X_test_encoded_4))
print("Data type of elements in X_test_encoded_4:", X_test_encoded_4.dtype)

# Standardise the numerical features with statistics from the training split
scaler_4 = StandardScaler()
scaler_4.fit(X_train_4[numerical_columns])
X_train_scaled_4 = scaler_4.transform(X_train_4[numerical_columns])
X_test_scaled_4 = scaler_4.transform(X_test_4[numerical_columns])

# Persist the fitted scaler
with open('scaler_4.pkl', 'wb') as fh:
    pickle.dump(scaler_4, fh)

# Report shape and element dtype of the scaled arrays
print("Shape of X_train_scaled_4:", X_train_scaled_4.shape)
print("Data type of elements in X_train_scaled_4:", X_train_scaled_4.dtype)
print("Shape of X_test_scaled_4:", X_test_scaled_4.shape)
print("Data type of elements in X_test_scaled_4:", X_test_scaled_4.dtype)

Shape of X_train_encoded_4: (40032, 2997)
Data type of X_train_encoded_4: <class 'scipy.sparse._csr.csr_matrix'>
Data type of elements in X_train_encoded_4: float64
Shape of X_test_encoded_4: (10009, 2997)
Data type of X_test_encoded_4: <class 'scipy.sparse._csr.csr_matrix'>
Data type of elements in X_test_encoded_4: float64
Shape of X_train_scaled_4: (40032, 1)
Data type of elements in X_train_scaled_4: float64
Shape of X_test_scaled_4: (10009, 1)
Data type of elements in X_test_scaled_4: float64


In [20]:
from scipy.sparse import hstack

def _stack_dense_float32(encoded, scaled):
    """Column-stack the one-hot matrix and the scaled array as dense float32."""
    return hstack([encoded, scaled]).astype(np.float32).toarray()


# Stage 1
X_train_processed = _stack_dense_float32(X_train_encoded, X_train_scaled)
X_test_processed = _stack_dense_float32(X_test_encoded, X_test_scaled)

# Stage 2
X_train_processed_1 = _stack_dense_float32(X_train_encoded_1, X_train_scaled_1)
X_test_processed_1 = _stack_dense_float32(X_test_encoded_1, X_test_scaled_1)

# Stage 3
X_train_processed_2 = _stack_dense_float32(X_train_encoded_2, X_train_scaled_2)
X_test_processed_2 = _stack_dense_float32(X_test_encoded_2, X_test_scaled_2)

# Stage 4
X_train_processed_3 = _stack_dense_float32(X_train_encoded_3, X_train_scaled_3)
X_test_processed_3 = _stack_dense_float32(X_test_encoded_3, X_test_scaled_3)

# Stage 5
X_train_processed_4 = _stack_dense_float32(X_train_encoded_4, X_train_scaled_4)
X_test_processed_4 = _stack_dense_float32(X_test_encoded_4, X_test_scaled_4)



In [21]:
# Stage 1 feature matrices: element dtype and shape of the train/test arrays
print("Data type of X_train_processed:", X_train_processed.dtype)
print("Data type of X_test_processed:", X_test_processed.dtype)
print("X_train_processed shape:", X_train_processed.shape)
print("X_test_processed shape:", X_test_processed.shape)

Data type of X_train_processed: float32
Data type of X_test_processed: float32
X_train_processed shape: (40032, 2196)
X_test_processed shape: (10009, 2196)


In [22]:
# Stage 2 feature matrices: element dtype and shape of the train/test arrays
print("Data type of X_train_processed_1:", X_train_processed_1.dtype)
print("Data type of X_test_processed_1:", X_test_processed_1.dtype)
print("X_train_processed_1 shape:", X_train_processed_1.shape)
print("X_test_processed_1 shape:", X_test_processed_1.shape)

Data type of X_train_processed_1: float32
Data type of X_test_processed_1: float32
X_train_processed_1 shape: (40032, 2219)
X_test_processed_1 shape: (10009, 2219)


In [23]:
# Stage 3 feature matrices: element dtype and shape of the train/test arrays
print("Data type of X_train_processed_2:", X_train_processed_2.dtype)
print("Data type of X_test_processed_2:", X_test_processed_2.dtype)
print("X_train_processed_2 shape:", X_train_processed_2.shape)
print("X_test_processed_2 shape:", X_test_processed_2.shape)

Data type of X_train_processed_2: float32
Data type of X_test_processed_2: float32
X_train_processed_2 shape: (40032, 2333)
X_test_processed_2 shape: (10009, 2333)


In [24]:
# Stage 4 feature matrices: element dtype and shape of the train/test arrays
print("Data type of X_train_processed_3:", X_train_processed_3.dtype)
print("Data type of X_test_processed_3:",X_test_processed_3.dtype)
print("X_train_processed_3 shape:", X_train_processed_3.shape)
print("X_test_processed_3 shape:", X_test_processed_3.shape)

Data type of X_train_processed_3: float32
Data type of X_test_processed_3: float32
X_train_processed_3 shape: (40032, 2682)
X_test_processed_3 shape: (10009, 2682)


In [25]:
# Stage 5 feature matrices: element dtype and shape of the train/test arrays
print("Data type of X_train_processed_4:", X_train_processed_4.dtype)
print("Data type of X_test_processed_4:", X_test_processed_4.dtype)
print("X_train_processed_4 shape:", X_train_processed_4.shape)
print("X_test_processed_4 shape:", X_test_processed_4.shape)

Data type of X_train_processed_4: float32
Data type of X_test_processed_4: float32
X_train_processed_4 shape: (40032, 2998)
X_test_processed_4 shape: (10009, 2998)


In [26]:
import numpy as np

# Map each output file name to the dense feature array that is written to it
# (train/test pairs for stages 1-5, in that order).
file_paths = {
    'X_train_processed.npy': X_train_processed,
    'X_test_processed.npy': X_test_processed,
    'X_train_processed_1.npy': X_train_processed_1,
    'X_test_processed_1.npy': X_test_processed_1,
    'X_train_processed_2.npy': X_train_processed_2,
    'X_test_processed_2.npy': X_test_processed_2,
    'X_train_processed_3.npy': X_train_processed_3,
    'X_test_processed_3.npy': X_test_processed_3,
    'X_train_processed_4.npy': X_train_processed_4,
    'X_test_processed_4.npy': X_test_processed_4
}

# Write every array to its .npy file in the current working directory
for file_name, array in file_paths.items():
    np.save(file_name, array)


In [27]:
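# BERT embeddings: uncomment this cell to generate the four *_embeddings.npy files and save them in the current path (the next cell loads them, so they must exist before it runs)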
#import tensorflow as tf
#from transformers import BertTokenizer, TFBertModel

#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#model = TFBertModel.from_pretrained('bert-base-uncased')

#def tokenize_text(column):
    #if isinstance(column, str):
        #return tokenizer(column, padding=True, truncation=True, return_tensors='tf')
    #elif isinstance(column, list) and all(isinstance(item, str) for item in column):
        #return tokenizer(column, padding=True, truncation=True, return_tensors='tf')
    #else:
        #raise ValueError("Invalid input format.")

#def get_bert_embeddings(text_tokens):
    #outputs = model(text_tokens)
    #return outputs.last_hidden_state.numpy()

#X_train_name_tokens = X_train['name_normalized'].apply(tokenize_text)
#X_train_description_tokens = X_train['description_normalized'].apply(tokenize_text)

#X_test_name_tokens = X_test['name_normalized'].apply(tokenize_text)
#X_test_description_tokens = X_test['description_normalized'].apply(tokenize_text)

#X_train_name_embeddings = X_train_name_tokens.apply(get_bert_embeddings)
#X_train_description_embeddings = X_train_description_tokens.apply(get_bert_embeddings)

#X_test_name_embeddings = X_test_name_tokens.apply(get_bert_embeddings)
#X_test_description_embeddings = X_test_description_tokens.apply(get_bert_embeddings)


# Save the embeddings
# np.save('X_train_name_embeddings.npy', X_train_name_embeddings)
# np.save('X_train_description_embeddings.npy', X_train_description_embeddings)
# np.save('X_test_name_embeddings.npy', X_test_name_embeddings)
# np.save('X_test_description_embeddings.npy', X_test_description_embeddings)

In [28]:
# Reload the name/description embedding arrays from .npy files in the current working directory.
# NOTE(review): presumably these files were written by the commented-out BERT cell above — confirm.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can execute
# arbitrary code; only load .npy files that were produced locally or are otherwise trusted.
X_train_name_embeddings_loaded = np.load('X_train_name_embeddings.npy', allow_pickle= True)
X_train_description_embeddings_loaded = np.load('X_train_description_embeddings.npy',allow_pickle= True)
X_test_name_embeddings_loaded = np.load('X_test_name_embeddings.npy', allow_pickle= True)
X_test_description_embeddings_loaded = np.load('X_test_description_embeddings.npy', allow_pickle= True)

# Reload the processed train/test arrays (base set plus the _1.._4 variants) under
# *_loaded names; the file names match the ones written with np.save earlier in the notebook.
X_train_processed_loaded = np.load('X_train_processed.npy', allow_pickle=True)
X_test_processed_loaded = np.load('X_test_processed.npy', allow_pickle=True)
X_train_processed_1_loaded = np.load('X_train_processed_1.npy', allow_pickle=True)
X_test_processed_1_loaded = np.load('X_test_processed_1.npy', allow_pickle=True)
X_train_processed_2_loaded = np.load('X_train_processed_2.npy', allow_pickle=True)
X_test_processed_2_loaded = np.load('X_test_processed_2.npy', allow_pickle=True)
X_train_processed_3_loaded = np.load('X_train_processed_3.npy', allow_pickle=True)
X_test_processed_3_loaded = np.load('X_test_processed_3.npy', allow_pickle=True)
X_train_processed_4_loaded = np.load('X_train_processed_4.npy', allow_pickle=True)
X_test_processed_4_loaded = np.load('X_test_processed_4.npy', allow_pickle=True)



In [29]:
# Reduce every loaded embedding entry to a single vector and stack the vectors into one array.
# entry[0][-1] takes the first element along the leading axis, then the last element along
# the next axis.
# NOTE(review): if each entry is a BERT last_hidden_state of shape (1, seq_len, 768), as the
# commented-out generation cell would produce, this selects the final token's vector
# (presumably [SEP]) rather than the [CLS] vector at index 0 — confirm this is intended.
X_train_name_last_hidden = np.array(
    [token_states[0][-1] for token_states in X_train_name_embeddings_loaded]
)
X_train_description_last_hidden = np.array(
    [token_states[0][-1] for token_states in X_train_description_embeddings_loaded]
)
X_test_name_last_hidden = np.array(
    [token_states[0][-1] for token_states in X_test_name_embeddings_loaded]
)
X_test_description_last_hidden = np.array(
    [token_states[0][-1] for token_states in X_test_description_embeddings_loaded]
)

In [30]:
# Spot-check the first reduced name vector of the training set.
print(f"First element of X_train_name_last_hidden: {X_train_name_last_hidden[0]}")

First element of X_train_name_last_hidden: [ 6.20537519e-01 -1.57028392e-01 -4.39105332e-01  5.87245941e-01
 -4.92567480e-01 -6.75922573e-01  3.66373003e-01 -7.91122556e-01
  6.63730741e-01  4.76211309e-02  4.97472771e-02 -3.77824754e-01
 -5.29451743e-02 -3.52970883e-03 -7.33473897e-01 -2.74465412e-01
 -8.19092095e-02 -1.52533606e-01  7.26244077e-02 -4.86062840e-02
  3.07591200e-01 -1.32735014e-01  8.47112298e-01  2.39423364e-01
  1.34860486e-01  3.51377934e-01 -4.78440404e-01 -1.80202276e-02
 -2.26639926e-01 -3.52599382e-01 -6.21600211e-01 -2.67526209e-01
 -1.92636084e-02  3.99675339e-01  1.55551210e-01 -8.61436129e-02
  3.39647025e-01 -4.60110493e-02 -5.71235001e-01 -4.14561182e-01
 -3.34018916e-01 -1.61173679e-02 -1.93653673e-01  6.32311583e-01
  5.47741950e-02 -6.00057304e-01  6.22461319e-01  3.42181236e-01
 -2.25496083e-01  4.89310294e-01  2.93806463e-01  1.60992384e-01
 -7.31535181e-02  8.67539346e-02  5.76151237e-02  1.21880889e-01
  2.94119507e-01 -4.34998453e-01  1.54902250e-0

In [31]:
# Sanity check: report shape and dtype of each reduced embedding array.
print(f"Shape of X_train_name_last_hidden: {X_train_name_last_hidden.shape}")
print(f"Data type of X_train_name_last_hidden: {X_train_name_last_hidden.dtype}")

print(f"Shape of X_train_description_last_hidden: {X_train_description_last_hidden.shape}")
print(f"Data type of X_train_description_last_hidden: {X_train_description_last_hidden.dtype}")

print(f"Shape of X_test_name_last_hidden: {X_test_name_last_hidden.shape}")
print(f"Data type of X_test_name_last_hidden: {X_test_name_last_hidden.dtype}")

print(f"Shape of X_test_description_last_hidden: {X_test_description_last_hidden.shape}")
print(f"Data type of X_test_description_last_hidden: {X_test_description_last_hidden.dtype}")

Shape of X_train_name_last_hidden: (40032, 768)
Data type of X_train_name_last_hidden: float32
Shape of X_train_description_last_hidden: (40032, 768)
Data type of X_train_description_last_hidden: float32
Shape of X_test_name_last_hidden: (10009, 768)
Data type of X_test_name_last_hidden: float32
Shape of X_test_description_last_hidden: (10009, 768)
Data type of X_test_description_last_hidden: float32


In [32]:
# Join the name and description vectors column-wise so each row holds both embeddings
# (name columns first, description columns second).
X_train_concatenated = np.hstack(
    (X_train_name_last_hidden, X_train_description_last_hidden)
)
X_test_concatenated = np.hstack(
    (X_test_name_last_hidden, X_test_description_last_hidden)
)

In [33]:
# Sanity check: report shape and dtype of the joined name+description embedding arrays.
print(f"Shape of X_train_concatenated: {X_train_concatenated.shape}")
print(f"Data type of X_train_concatenated: {X_train_concatenated.dtype}")

print(f"Shape of X_test_concatenated: {X_test_concatenated.shape}")
print(f"Data type of X_test_concatenated: {X_test_concatenated.dtype}")

Shape of X_train_concatenated: (40032, 1536)
Data type of X_train_concatenated: float32
Shape of X_test_concatenated: (10009, 1536)
Data type of X_test_concatenated: float32


In [34]:
# For every stage, append the embedding columns to the right of that stage's reloaded
# processed array. The same embedding block is reused by all five stages.

# Stage 1
X_train_combined = np.hstack((X_train_processed_loaded, X_train_concatenated))
X_test_combined = np.hstack((X_test_processed_loaded, X_test_concatenated))

# Stage 2
X_train_combined_1 = np.hstack((X_train_processed_1_loaded, X_train_concatenated))
X_test_combined_1 = np.hstack((X_test_processed_1_loaded, X_test_concatenated))

# Stage 3
X_train_combined_2 = np.hstack((X_train_processed_2_loaded, X_train_concatenated))
X_test_combined_2 = np.hstack((X_test_processed_2_loaded, X_test_concatenated))

# Stage 4
X_train_combined_3 = np.hstack((X_train_processed_3_loaded, X_train_concatenated))
X_test_combined_3 = np.hstack((X_test_processed_3_loaded, X_test_concatenated))

# Stage 5
X_train_combined_4 = np.hstack((X_train_processed_4_loaded, X_train_concatenated))
X_test_combined_4 = np.hstack((X_test_processed_4_loaded, X_test_concatenated))


In [35]:
# Sanity check: report the shape of each stage's combined train/test feature array.
print(f"Stage 1 - Train combined shape: {X_train_combined.shape}")
print(f"Stage 1 - Test combined shape: {X_test_combined.shape}")
print(f"Stage 2 - Train combined shape: {X_train_combined_1.shape}")
print(f"Stage 2 - Test combined shape: {X_test_combined_1.shape}")
print(f"Stage 3 - Train combined shape: {X_train_combined_2.shape}")
print(f"Stage 3 - Test combined shape: {X_test_combined_2.shape}")
print(f"Stage 4 - Train combined shape: {X_train_combined_3.shape}")
print(f"Stage 4 - Test combined shape: {X_test_combined_3.shape}")
print(f"Stage 5 - Train combined shape: {X_train_combined_4.shape}")
print(f"Stage 5 - Test combined shape: {X_test_combined_4.shape}")

Stage 1 - Train combined shape: (40032, 3732)
Stage 1 - Test combined shape: (10009, 3732)
Stage 2 - Train combined shape: (40032, 3755)
Stage 2 - Test combined shape: (10009, 3755)
Stage 3 - Train combined shape: (40032, 3869)
Stage 3 - Test combined shape: (10009, 3869)
Stage 4 - Train combined shape: (40032, 4218)
Stage 4 - Test combined shape: (10009, 4218)
Stage 5 - Train combined shape: (40032, 4534)
Stage 5 - Test combined shape: (10009, 4534)


In [36]:
# Number of distinct labels per stage; used below as the width of each softmax output layer.
# np.unique returns a flat array of the distinct values, so its size is the class count.

num_classes = np.unique(y_encoded).size
num_classes_1 = np.unique(y_1_encoded).size
num_classes_2 = np.unique(y_2_encoded).size
num_classes_3 = np.unique(y_3_encoded).size
num_classes_4 = np.unique(y_4_encoded).size


In [37]:
# Show the class count of every stage, one value per line (Stage 1 first).
print(
    num_classes,
    num_classes_1,
    num_classes_2,
    num_classes_3,
    num_classes_4,
    sep="\n",
)

23
114
349
316
162


In [None]:
#Stage1
from tensorflow.keras import models, layers


from tensorflow.keras import callbacks
from tensorflow.keras.models import save_model
from sklearn.metrics import classification_report

# Learning-rate schedule for the Stage 1 model
def lr_scheduler(epoch, lr):
    """Return the learning rate to use for ``epoch``, given the current rate ``lr``.

    The current rate is multiplied by a factor chosen from the epoch index:
    0.94 below epoch 4, 0.9 below 8, 0.80 below 16, 0.70 below 19, and 0.6 from
    epoch 19 onwards. When the function is driven by
    ``callbacks.LearningRateScheduler`` the result becomes the next ``lr``
    argument, so the decay compounds from epoch to epoch.

    Args:
        epoch: Index of the epoch about to start.
        lr: Learning rate currently in effect.

    Returns:
        The decayed learning rate, ``lr`` times the factor for ``epoch``.
    """
    # (exclusive upper epoch bound, multiplier), checked in ascending order.
    decay_table = ((4, 0.94), (8, 0.9), (16, 0.80), (19, 0.70))
    for upper_bound, factor in decay_table:
        if epoch < upper_bound:
            return lr * factor
    # Epoch 19 and later.
    return lr * 0.6

# Callbacks for this stage's training run.
# EarlyStopping stops once val_loss has failed to improve for 5 consecutive epochs and
# restores the weights of the best epoch; LearningRateScheduler applies lr_scheduler
# to the optimizer's learning rate every epoch.
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler_callback = callbacks.LearningRateScheduler(lr_scheduler)

# Build the Stage 1 classifier: two Dense -> BatchNormalization -> Dropout blocks
# followed by a softmax layer with one unit per class.
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_combined.shape[1],)),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(num_classes, activation='softmax')
])

# sparse_categorical_crossentropy expects integer class ids as targets
# (y_train is presumably label-encoded — confirm against the encoding cell).
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Fit the model.
# val_loss decides both when training stops and which weights are restored, so the
# validation data must not be the test set: choosing the checkpoint on X_test_combined
# would make the test metrics reported below optimistically biased. Hold out part of
# the training data instead (Keras takes the last 10% of the arrays, before shuffling).
run_1 = model.fit(X_train_combined, y_train, epochs=60, batch_size=32,
                  validation_split=0.1,
                  callbacks=[early_stopping, lr_scheduler_callback])

# Persist the trained model in HDF5 format.
save_model(model, 'model_1_preberttune.h5')

# Predicted class = index of the highest softmax probability in each row.
y_pred_probabilities = model.predict(X_test_combined)
y_pred = np.argmax(y_pred_probabilities, axis=1)

# The test set is only touched here, after training and checkpoint selection.
loss, accuracy = model.evaluate(X_test_combined, y_test)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)


In [None]:
#Stage2
# Learning-rate schedule for the Stage 2 model
def lr_scheduler(epoch, lr):
    """Return the learning rate to use for ``epoch``, given the current rate ``lr``.

    The current rate is multiplied by a factor chosen from the epoch index:
    0.94 below epoch 4, 0.9 below 8, 0.80 below 16, 0.70 below 19, and 0.6 from
    epoch 19 onwards. When the function is driven by
    ``callbacks.LearningRateScheduler`` the result becomes the next ``lr``
    argument, so the decay compounds from epoch to epoch.

    Args:
        epoch: Index of the epoch about to start.
        lr: Learning rate currently in effect.

    Returns:
        The decayed learning rate, ``lr`` times the factor for ``epoch``.
    """
    # (exclusive upper epoch bound, multiplier), checked in ascending order.
    decay_table = ((4, 0.94), (8, 0.9), (16, 0.80), (19, 0.70))
    for upper_bound, factor in decay_table:
        if epoch < upper_bound:
            return lr * factor
    # Epoch 19 and later.
    return lr * 0.6

# Callbacks for this stage's training run.
# EarlyStopping stops once val_loss has failed to improve for 5 consecutive epochs and
# restores the weights of the best epoch; LearningRateScheduler applies lr_scheduler
# to the optimizer's learning rate every epoch.
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler_callback = callbacks.LearningRateScheduler(lr_scheduler)

# Build the Stage 2 classifier: two Dense -> BatchNormalization -> Dropout blocks
# followed by a softmax layer with one unit per class.
model_2 = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_combined_1.shape[1],)),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(num_classes_1, activation='softmax')
])

# sparse_categorical_crossentropy expects integer class ids as targets
# (y_train_1 is presumably label-encoded — confirm against the encoding cell).
model_2.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

# Fit the model.
# val_loss decides both when training stops and which weights are restored, so the
# validation data must not be the test set: choosing the checkpoint on X_test_combined_1
# would make the test metrics reported below optimistically biased. Hold out part of
# the training data instead (Keras takes the last 10% of the arrays, before shuffling).
history_2 = model_2.fit(X_train_combined_1, y_train_1, epochs=60, batch_size=32,
                        validation_split=0.1,
                        callbacks=[early_stopping, lr_scheduler_callback])

# Persist the trained model in HDF5 format.
save_model(model_2, 'model_2_preberttune.h5')

# The test set is only touched here, after training and checkpoint selection.
loss_2, accuracy_2 = model_2.evaluate(X_test_combined_1, y_test_1)
print("Accuracy for Stage 2 model:", accuracy_2)

# Predicted class = index of the highest softmax probability in each row.
y_pred_probabilities_2 = model_2.predict(X_test_combined_1)
y_pred_2 = np.argmax(y_pred_probabilities_2, axis=1)
report_2 = classification_report(y_test_1, y_pred_2)
print("Classification Report for Stage 2 model:")
print(report_2)



In [None]:
# Stage 3

def lr_scheduler(epoch, lr):
    """Return the learning rate to use for ``epoch``, given the current rate ``lr``.

    Stage 3 variant of the schedule: the current rate is multiplied by 0.94 below
    epoch 4, 0.9 below 8, 0.85 below 16, 0.75 below 19, and 0.7 from epoch 19
    onwards. When the function is driven by ``callbacks.LearningRateScheduler``
    the result becomes the next ``lr`` argument, so the decay compounds from
    epoch to epoch.

    Args:
        epoch: Index of the epoch about to start.
        lr: Learning rate currently in effect.

    Returns:
        The decayed learning rate, ``lr`` times the factor for ``epoch``.
    """
    # (exclusive upper epoch bound, multiplier), checked in ascending order.
    decay_table = ((4, 0.94), (8, 0.9), (16, 0.85), (19, 0.75))
    for upper_bound, factor in decay_table:
        if epoch < upper_bound:
            return lr * factor
    # Epoch 19 and later.
    return lr * 0.7

# Callbacks for this stage's training run.
# EarlyStopping stops once val_loss has failed to improve for 5 consecutive epochs and
# restores the weights of the best epoch; LearningRateScheduler applies lr_scheduler
# to the optimizer's learning rate every epoch.
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler_callback = callbacks.LearningRateScheduler(lr_scheduler)


# Build the Stage 3 classifier: two Dense -> BatchNormalization -> Dropout blocks
# followed by a softmax layer with one unit per class.
model_3 = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_combined_2.shape[1],)),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(num_classes_2, activation='softmax')
])

# sparse_categorical_crossentropy expects integer class ids as targets
# (y_train_2 is presumably label-encoded — confirm against the encoding cell).
model_3.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

# Fit the model.
# val_loss decides both when training stops and which weights are restored, so the
# validation data must not be the test set: choosing the checkpoint on X_test_combined_2
# would make the test metrics reported below optimistically biased. Hold out part of
# the training data instead (Keras takes the last 10% of the arrays, before shuffling).
history_3 = model_3.fit(X_train_combined_2, y_train_2, epochs=60, batch_size=32,
                        validation_split=0.1,
                        callbacks=[early_stopping, lr_scheduler_callback])

# Persist the trained model in HDF5 format.
save_model(model_3, 'model_3_preberttune.h5')

# The test set is only touched here, after training and checkpoint selection.
loss_3, accuracy_3 = model_3.evaluate(X_test_combined_2, y_test_2)
print("Accuracy for Stage 3 model:", accuracy_3)

# Predicted class = index of the highest softmax probability in each row.
y_pred_probabilities_3 = model_3.predict(X_test_combined_2)
y_pred_3 = np.argmax(y_pred_probabilities_3, axis=1)
report_3 = classification_report(y_test_2, y_pred_3)
print("Classification Report for Stage 3 model:")
print(report_3)



In [None]:
# Stage 4
def lr_scheduler(epoch, lr):
    """Return the learning rate to use for ``epoch``, given the current rate ``lr``.

    The current rate is multiplied by a factor chosen from the epoch index:
    0.94 below epoch 4, 0.9 below 8, 0.80 below 16, 0.70 below 19, and 0.6 from
    epoch 19 onwards. When the function is driven by
    ``callbacks.LearningRateScheduler`` the result becomes the next ``lr``
    argument, so the decay compounds from epoch to epoch.

    Args:
        epoch: Index of the epoch about to start.
        lr: Learning rate currently in effect.

    Returns:
        The decayed learning rate, ``lr`` times the factor for ``epoch``.
    """
    # (exclusive upper epoch bound, multiplier), checked in ascending order.
    decay_table = ((4, 0.94), (8, 0.9), (16, 0.80), (19, 0.70))
    for upper_bound, factor in decay_table:
        if epoch < upper_bound:
            return lr * factor
    # Epoch 19 and later.
    return lr * 0.6

# Callbacks for this stage's training run.
# EarlyStopping stops once val_loss has failed to improve for 5 consecutive epochs and
# restores the weights of the best epoch; LearningRateScheduler applies lr_scheduler
# to the optimizer's learning rate every epoch.
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler_callback = callbacks.LearningRateScheduler(lr_scheduler)

# Build the Stage 4 classifier: two Dense -> BatchNormalization -> Dropout blocks
# followed by a softmax layer with one unit per class.
model_4 = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_combined_3.shape[1],)),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(num_classes_3, activation='softmax')
])

# sparse_categorical_crossentropy expects integer class ids as targets
# (y_train_3 is presumably label-encoded — confirm against the encoding cell).
model_4.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

# Fit the model.
# val_loss decides both when training stops and which weights are restored, so the
# validation data must not be the test set: choosing the checkpoint on X_test_combined_3
# would make the test metrics reported below optimistically biased. Hold out part of
# the training data instead (Keras takes the last 10% of the arrays, before shuffling).
history_4 = model_4.fit(X_train_combined_3, y_train_3, epochs=60, batch_size=32,
                        validation_split=0.1,
                        callbacks=[early_stopping, lr_scheduler_callback])

# Persist the trained model in HDF5 format.
save_model(model_4, 'model_4_preberttune.h5')

# The test set is only touched here, after training and checkpoint selection.
loss_4, accuracy_4 = model_4.evaluate(X_test_combined_3, y_test_3)
print("Accuracy for Stage 4 model:", accuracy_4)

# Predicted class = index of the highest softmax probability in each row.
y_pred_probabilities_4 = model_4.predict(X_test_combined_3)
y_pred_4 = np.argmax(y_pred_probabilities_4, axis=1)
report_4 = classification_report(y_test_3, y_pred_4)
print("Classification Report for Stage 4 model:")
print(report_4)


In [None]:
# Stage 5
print("Running Stage 5 model...")
def lr_scheduler(epoch, lr):
    """Return the learning rate to use for ``epoch``, given the current rate ``lr``.

    The current rate is multiplied by a factor chosen from the epoch index:
    0.94 below epoch 4, 0.9 below 8, 0.80 below 16, 0.70 below 19, and 0.6 from
    epoch 19 onwards. When the function is driven by
    ``callbacks.LearningRateScheduler`` the result becomes the next ``lr``
    argument, so the decay compounds from epoch to epoch.

    Args:
        epoch: Index of the epoch about to start.
        lr: Learning rate currently in effect.

    Returns:
        The decayed learning rate, ``lr`` times the factor for ``epoch``.
    """
    # (exclusive upper epoch bound, multiplier), checked in ascending order.
    decay_table = ((4, 0.94), (8, 0.9), (16, 0.80), (19, 0.70))
    for upper_bound, factor in decay_table:
        if epoch < upper_bound:
            return lr * factor
    # Epoch 19 and later.
    return lr * 0.6

# Callbacks for this stage's training run.
# EarlyStopping stops once val_loss has failed to improve for 5 consecutive epochs and
# restores the weights of the best epoch; LearningRateScheduler applies lr_scheduler
# to the optimizer's learning rate every epoch.
early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler_callback = callbacks.LearningRateScheduler(lr_scheduler)
# Build the Stage 5 classifier: two Dense -> BatchNormalization -> Dropout blocks
# followed by a softmax layer with one unit per class.
model_5 = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_combined_4.shape[1],)),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(num_classes_4, activation='softmax')
])

# sparse_categorical_crossentropy expects integer class ids as targets
# (y_train_4 is presumably label-encoded — confirm against the encoding cell).
model_5.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

# Fit the model.
# val_loss decides both when training stops and which weights are restored, so the
# validation data must not be the test set: choosing the checkpoint on X_test_combined_4
# would make the test metrics reported below optimistically biased. Hold out part of
# the training data instead (Keras takes the last 10% of the arrays, before shuffling).
history_5 = model_5.fit(X_train_combined_4, y_train_4, epochs=60, batch_size=32,
                        validation_split=0.1,
                        callbacks=[early_stopping, lr_scheduler_callback])

# Persist the trained model in HDF5 format.
save_model(model_5, 'model_5_preberttune.h5')

# The test set is only touched here, after training and checkpoint selection.
loss_5, accuracy_5 = model_5.evaluate(X_test_combined_4, y_test_4)
print("Accuracy for Stage 5 model:", accuracy_5)

# Predicted class = index of the highest softmax probability in each row.
y_pred_probabilities_5 = model_5.predict(X_test_combined_4)
y_pred_5 = np.argmax(y_pred_probabilities_5, axis=1)
report_5 = classification_report(y_test_4, y_pred_5)
print("Classification Report for Stage 5 model:")
print(report_5)
