# Topic Number selection for HTTP Content!

In [1]:
import tensorflow as tf
import os,datetime
import pandas as pd
import numpy as np
import pickle
import string

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.preprocessing import OneHotEncoder

import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel


import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import matplotlib.pyplot as plt
import category_encoders as ce

import multiprocessing as mp
from thinc.api import set_gpu_allocator, require_gpu
from joblib import Parallel, delayed
from itertools import cycle
import cupy
import spacy

2024-10-18 22:17:39.902189: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Download some useful packages
!python3 -m spacy download en_core_web_sm

# Download NLTK data files
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


[nltk_data] Downloading package stopwords to
[nltk_data]     /homes/01/bxbhusal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /homes/01/bxbhusal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /homes/01/bxbhusal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Declare some userful variables
num_visible_rows=4

## Take HTTP data as input!

In [4]:
df = pd.read_csv("./r4.2/http.csv", header=0, nrows=100000, skiprows=(1,500000))
df['date'] = pd.to_datetime(df['date'])
df

Unnamed: 0,id,date,user,pc,url,content
0,{Q5R1-T3EF87UE-2395RWZS},2010-01-02 07:00:13,NGF0157,PC-6056,http://urbanspoon.com/Plunketts_Creek_Loyalsoc...,festival off northwards than congestion partne...
1,{X9O1-O0XW52VO-5806RPHG},2010-01-02 07:03:46,NGF0157,PC-6056,http://aa.com/Rhodocene/rhodocenium/fhaavatqrf...,long away reorganized baldwin seth business 18...
2,{G5S8-U5OG04TE-5299CCTU},2010-01-02 07:05:26,IRM0931,PC-7188,http://groupon.com/Leonhard_Euler/leonhard/tne...,among german schwein experimental becomes prev...
3,{L0R4-A9DH29VP-4553AUWM},2010-01-02 07:05:52,IRM0931,PC-7188,http://flickr.com/Inauguration_of_Barack_Obama...,kate criteria j 2008 highest 12 include books ...
4,{U7D0-K8FF04MI-4691ZHYP},2010-01-02 07:05:55,IRM0931,PC-7188,http://skype.com/William_D_Boyce/lsa/onpxcnpxp...,feed commonly reef frogs years replaced walter...
...,...,...,...,...,...,...
99995,{P4C0-H4AC44QZ-5898ATQE},2010-01-05 08:24:12,BBS0039,PC-9436,http://stubhub.com/Hoover_Dam/ickes/zbgbeplpyr...,an unknown afternoon dietary state law nationa...
99996,{U9Y2-H0WW90NF-1184ROHW},2010-01-05 08:24:12,KAL0395,PC-0004,http://tigerdirect.com/European_Commission/bar...,begin top we band themselves harshly or fourth...
99997,{Y2Y4-T1KW57MX-2017DWRE},2010-01-05 08:24:12,LDB0090,PC-6824,http://microsoft.com/Meteorological_history_of...,until acquire flared get secondary minas sea u...
99998,{X2D5-E5UH86FZ-2678DVEF},2010-01-05 08:24:12,WTF0387,PC-6159,http://sidereel.com/Miniopterus_griveaudi/griv...,form air rely conference quickly set expected ...


## Extract features from Date!
The user logs are at different times. I divided the time into **4** different time frames.
- 0 = 12AM - 6AM
- 1 = 6AM - 12PM
- 2 = 12PM - 6PM
- 3 = 6PM - 12AM
Therefore, a new feature **time_frame** is made. Date is decomposed into 3 other numerical features: `day`,`month`, and `year`. Finally, date feature is dropped.

After dividng them into 4 different time frames in order to apply one-hot encoding.

**Why One-Hot?**
  - **No Ordinality:** Each time frame is represented independently without implying any order.
  - **Clarity:** Clearly distinguishes between different time frames.


In [5]:
# Define the function to categorize time frames
def categorize_time_frame(hour):
    if 0 <= hour < 6:
        return 0
    elif 6 <= hour < 12:
        return 1
    elif 12 <= hour < 18:
        return 2
    else:
        return 3

# Apply the function to create the 'time_frame' column
new_df = df.copy()
new_df['time_frame'] = df['date'].dt.hour.apply(categorize_time_frame)
new_df['day'] = df['date'].dt.day
new_df['month'] = df['date'].dt.month
new_df['year'] = df['date'].dt.year

new_df=new_df.drop(columns="date")
new_df=new_df.drop(columns="id")

# Applying one-hot encoding

# Initialize OneHotEncoder
ohe = OneHotEncoder(sparse_output=False)
time_encoded = ohe.fit_transform(new_df[['time_frame']])

# Create DataFrame with One-Hot Encoded Columns
time_encoded_df = pd.DataFrame(time_encoded.astype(int), columns=ohe.get_feature_names_out(['time_frame']))

# Concatenate with Original DataFrame
new_df = pd.concat([new_df, time_encoded_df], axis=1).drop('time_frame', axis=1)

new_df.head()

Unnamed: 0,user,pc,url,content,day,month,year,time_frame_1,time_frame_2,time_frame_3
0,NGF0157,PC-6056,http://urbanspoon.com/Plunketts_Creek_Loyalsoc...,festival off northwards than congestion partne...,2,1,2010,1,0,0
1,NGF0157,PC-6056,http://aa.com/Rhodocene/rhodocenium/fhaavatqrf...,long away reorganized baldwin seth business 18...,2,1,2010,1,0,0
2,IRM0931,PC-7188,http://groupon.com/Leonhard_Euler/leonhard/tne...,among german schwein experimental becomes prev...,2,1,2010,1,0,0
3,IRM0931,PC-7188,http://flickr.com/Inauguration_of_Barack_Obama...,kate criteria j 2008 highest 12 include books ...,2,1,2010,1,0,0
4,IRM0931,PC-7188,http://skype.com/William_D_Boyce/lsa/onpxcnpxp...,feed commonly reef frogs years replaced walter...,2,1,2010,1,0,0


# Encoding Categorical Features: Binary Encoding vs. Label Encoding
In this part, I am encoding USER and PC column into numerical features.

## **Why Choose Binary Encoding Over Label Encoding?**

- **Avoids Artificial Ordinality:** Unlike Label Encoding, Binary Encoding doesn't introduce an arbitrary numerical order among categories, preventing the model from misinterpreting relationships.
- **Dimensionality Efficiency:** Binary Encoding transforms categories into a compact binary format, significantly reducing the number of features compared to One-Hot Encoding while preserving uniqueness.
- **Scalability:** Handles high-cardinality features (e.g., 1,000 unique users) efficiently without creating thousands of columns.

## **How They Work**

### **Label Encoding**

Assigns each unique category a distinct integer.

| Category | Label Encoded |
|----------|---------------|
| User1    | 0             |
| User2    | 1             |
| User3    | 2             |
| ...      | ...           |
| User1000 | 999           |

**Pros:**
- Simple and easy to implement.
- Low dimensionality (single column).

**Cons:**
- Introduces artificial ordinality.
- May mislead models to infer unintended relationships.

### **Binary Encoding**

Converts each category to its binary representation spread across multiple binary columns.

| Category | Binary Encoded (10 bits) |
|----------|--------------------------|
| User1    | 0000000000               |
| User2    | 0000000001               |
| User3    | 0000000010               |
| ...      | ...                      |
| User1000 | 1111100111               |

**Pros:**
- Avoids artificial ordinality.
- More compact than One-Hot Encoding (~10 columns for 1,000 categories).
- Preserves uniqueness of categories.

**Cons:**
- Slightly more complex implementation.
- Binary features are less interpretable individually.

## **Conclusion**

**Binary Encoding** offers a balanced approach for high-cardinality categorical features by maintaining uniqueness without inflating dimensionality or introducing artificial order, making it a superior choice over **Label Encoding** for scenarios like representing 1,000 unique users.

In [6]:
# Binary Encoder for user
user_binary_encoder = ce.BinaryEncoder(cols=['user'])

# Fit and transform the 'user' column
new_df = user_binary_encoder.fit_transform(new_df)

#At last, save the encoder for further use.
pkl_user_encoder_output = open("user_encoder.pkl",'wb')
pickle.dump(user_binary_encoder, pkl_user_encoder_output)

In [7]:
# Binary Encoder for PC
pc_binary_encoder = ce.BinaryEncoder(cols=['pc'])

# Fit and transform the 'pc' column
new_df = pc_binary_encoder.fit_transform(new_df)

#At last, save the encoder for further use.
pkl_pc_encoder_output = open("pc_encoder.pkl",'wb')
pickle.dump(pc_binary_encoder, pkl_pc_encoder_output)

new_df.head()

Unnamed: 0,user_0,user_1,user_2,user_3,user_4,user_5,user_6,user_7,user_8,user_9,...,pc_8,pc_9,url,content,day,month,year,time_frame_1,time_frame_2,time_frame_3
0,0,0,0,0,0,0,0,0,0,1,...,0,1,http://urbanspoon.com/Plunketts_Creek_Loyalsoc...,festival off northwards than congestion partne...,2,1,2010,1,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,1,http://aa.com/Rhodocene/rhodocenium/fhaavatqrf...,long away reorganized baldwin seth business 18...,2,1,2010,1,0,0
2,0,0,0,0,0,0,0,0,1,0,...,1,0,http://groupon.com/Leonhard_Euler/leonhard/tne...,among german schwein experimental becomes prev...,2,1,2010,1,0,0
3,0,0,0,0,0,0,0,0,1,0,...,1,0,http://flickr.com/Inauguration_of_Barack_Obama...,kate criteria j 2008 highest 12 include books ...,2,1,2010,1,0,0
4,0,0,0,0,0,0,0,0,1,0,...,1,0,http://skype.com/William_D_Boyce/lsa/onpxcnpxp...,feed commonly reef frogs years replaced walter...,2,1,2010,1,0,0


In [8]:
# # Initialize stopwords, lemmatizer, and punctuation
# stop_words = set(stopwords.words('english'))
# lemmatizer = WordNetLemmatizer()
# punctuations = string.punctuation

# spacy.require_gpu()

# # Load spaCy's English model
# nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# def preprocess_text(text):
#     # Convert to lowercase
#     text = text.lower()
    
#     # Remove punctuation
#     text = text.translate(str.maketrans('', '', punctuations))
    
#     # Tokenize using spaCy for efficient lemmatization
#     doc = nlp(text)
    
#     # Lemmatize and remove stopwords and non-alphabetic tokens
#     tokens = [token.lemma_ for token in doc if token.lemma_.isalpha() and token.lemma_ not in stop_words]
    
#     return tokens

# # Apply preprocessing to the 'content' column
# new_df['tokens'] = new_df['content'].apply(preprocess_text)

# # Display the tokens
# new_df

In [9]:
# num_gpus = len(tf.config.list_physical_devices('GPU'))

# def preprocess_texts(texts, gpu_id):
#     """
#     Preprocess a list of texts using spaCy on a specific GPU.
    
#     Args:
#         texts (list): List of text strings to preprocess.
#         gpu_id (int): GPU ID to assign to this process.
        
#     Returns:
#         list: List of token lists.
#     """
#     # Assign the process to a specific GPU
#     spacy.require_gpu(gpu_id)
    
    
#     # Load spaCy model with GPU
#     nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    
#     # Initialize tools
#     stop_words = set(stopwords.words('english'))
#     punctuations = string.punctuation
    
#     processed_texts = []
#     for doc in nlp.pipe(texts, batch_size=1000):
#         tokens = [
#             token.lemma_ for token in doc 
#             if token.lemma_.isalpha() and token.lemma_ not in stop_words
#         ]
#         processed_texts.append(tokens)
    
#     return processed_texts


# def split_data(texts, num_chunks):
#     """
#     Split the list of texts into specified number of chunks.
    
#     Args:
#         texts (list): List of text strings.
#         num_chunks (int): Number of chunks to split into.
        
#     Returns:
#         list: List containing chunks of texts.
#     """
#     chunk_size = len(texts) // num_chunks
#     return [texts[i*chunk_size : (i+1)*chunk_size] for i in range(num_chunks-1)] + [texts[(num_chunks-1)*chunk_size:]]

# # Convert the text into an array of texts
# texts = new_df['content'].tolist()

# chunks = split_data(texts, num_gpus)

# pool = mp.Pool(processes=num_gpus)

# # Prepare arguments for each process
# args = [(chunk, gpu_id) for gpu_id, chunk in enumerate(chunks)]

# # Execute preprocessing in parallel
# results = pool.starmap(preprocess_texts, args)

# # Close the pool
# pool.close()
# pool.join()

In [10]:
# Set multiprocessing start method
# mp.set_start_method('spawn', force=True)

In [9]:
def chunker(iterable, total_length, chunksize):
    """Yield successive chunks from iterable."""
    return (iterable[pos: pos + chunksize] for pos in range(0, total_length, chunksize))

def flatten(list_of_lists):
    """Flatten a list of lists into a single list."""
    return [item for sublist in list_of_lists for item in sublist]

def process_entity(doc):
    """Process a spaCy document to extract relevant tokens."""
    super_word_ls = []
    for s in doc.sents:
        word_ls = []
        for t in s:
            if not t.ent_type_:
                if t.text.strip() != "":
                    word_ls.append(t.text)
            else:
                word_ls.append(t.ent_type_)
        if len(word_ls) > 0:
            super_word_ls.append(" ".join(word_ls))
    return " ".join(super_word_ls)

def process_chunk(texts, rank):
    """
    Process a chunk of texts on a specific GPU.

    Args:
        texts (list): List of text strings to process.
        rank (int): GPU ID to assign to this process.

    Returns:
        list: List of processed text strings.
    """
    print(f"Processing on GPU {rank}")
    with cupy.cuda.Device(rank):
        set_gpu_allocator("pytorch")
        require_gpu(rank)
        # Load the transformer-based spaCy model
        nlp = spacy.load("en_core_web_trf")
        preproc_pipe = []
        for doc in nlp.pipe(texts, batch_size=20):
            preproc_pipe.append(process_entity(doc))
        return preproc_pipe



def preprocess_parallel(texts, chunksize=100):
    """
    Parallelize text preprocessing across multiple GPUs.

    Args:
        texts (list): List of text strings to preprocess.
        chunksize (int): Number of texts per chunk.

    Returns:
        list: List of all processed text strings.
    """
    num_gpus = cupy.cuda.runtime.getDeviceCount()
    gpus = list(range(num_gpus))
    rank_cycle = cycle(gpus)

    
    # Split texts into chunks
    chunks = list(chunker(texts, len(texts), chunksize))
    
    # Assign each chunk to a GPU in a round-robin fashion
    tasks = [(chunk, next(rank_cycle)) for chunk in chunks]

    print(len(tasks))
    
    # Use joblib's Parallel to process chunks in parallel
    results = Parallel(n_jobs=num_gpus, backend='multiprocessing')(
        delayed(process_chunk)(chunk, rank) for chunk, rank in tasks
    )
    
    # Flatten the list of results
    return flatten(results)


# Convert the text into an array of texts
texts = new_df['content'].tolist()

# Preprocess the texts using both GPUs
processed_texts = preprocess_parallel(texts, chunksize=50)



2000
Processing on GPU 0Processing on GPU 1

Processing on GPU 0Processing on GPU 1



CUDARuntimeError: cudaErrorInitializationError: initialization error

In [22]:
# A different method for flattening a list
def flatten2d(list2d):
    from functools import reduce
    from operator import iconcat
    return reduce(iconcat, list2d, [])

def chunker(iterator, length, chunksize):
    return (iterator[pos: pos + chunksize] for pos in range(0, length, chunksize))

def process_entity(doc):
    # I need lists of sentences for my use case,  but you could do other processing
    return [s.text for s in doc.sents]

def process_chunk(docs, rank):
    with cupy.cuda.Device(rank):
        set_gpu_allocator('pytorch')
        require_gpu(rank)
        nlp = spacy.load('en_core_web_sm', disable=['parser'])
        nlp.add_pipe('sentencizer')
        preprocess_pipe = []
        for doc in nlp.pipe(docs, batch_size=20):
            preprocess_pipe.append(process_entity(doc))
        rank += 1
        return preprocess_pipe

def process_parallel(docs, jobs=2, chunksize=50):
    executor = Parallel(n_jobs=jobs, prefer='threads')
    
    do = delayed(process_chunk)
    
    tasks = []
    gpus = list(range(0, cupy.cuda.runtime.getDeviceCount()))
    
    rank = 0
    
    for chunk in chunker(docs, len(docs), chunksize):
        tasks.append(do(chunk, rank))
        rank = (rank + 1) % len(gpus)

    
    result = executor(tasks)
    return flatten2d(result)


preprocessed = process_parallel(docs = ["This is a basic sentence. This is another one."]*100, jobs=4, chunksize=25)

['This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.', 'This is a basic sentence. This is another one.',

In [12]:
preprocessed

[['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another one.'],
 ['This is a basic sentence.', 'This is another 