# Ads Analysis

Author: Anish Deshpande
Date: 2024-12-19


In [None]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import warnings
import gc
warnings.filterwarnings('ignore')

# Memory monitoring for Google Colab
def check_memory():
    """Print system memory utilisation and return the used percentage.

    Returns:
        The ``percent`` field reported by ``psutil.virtual_memory()``.
    """
    import psutil

    stats = psutil.virtual_memory()
    bytes_per_gb = 1024**3
    used_gb = stats.used / bytes_per_gb
    total_gb = stats.total / bytes_per_gb
    print(f"Memory Usage: {stats.percent}% ({used_gb:.1f}GB / {total_gb:.1f}GB)")
    return stats.percent

# Check initial memory
# Baseline reading taken before any data is loaded, so later readings can be
# compared against it.
print("Initial memory status:")
check_memory()


# Read in Data


In [None]:
# Load the raw train/test tables and the two codebooks from disk.
print("Loading training data...")
ads_data_train = pd.read_csv("../Data/train/train_data_ads.csv")
feeds_data_train = pd.read_csv("../Data/train/train_data_feeds.csv")

print("Loading test data...")
ads_data_test = pd.read_csv("../Data/test/test_data_ads.csv")
feeds_data_test = pd.read_csv("../Data/test/test_data_feeds.csv")

print("Loading codebooks...")
ads_codebook = pd.read_csv("../Data/codebooks/ads_domain_description.csv")
feeds_codebook = pd.read_csv("../Data/codebooks/feeds_domain_description.csv")

print("\nData loaded successfully!")
# Only the shape tuples are kept in the loop variables, so no extra reference
# to a large frame outlives this cell.
for _name, _shape in (
    ("Ads train", ads_data_train.shape),
    ("Feeds train", feeds_data_train.shape),
    ("Ads test", ads_data_test.shape),
    ("Feeds test", feeds_data_test.shape),
):
    print(f"{_name}: {_shape}")

# Report how much memory the raw tables take up.
print("\nMemory after data loading:")
check_memory()


: 

# Data cleaning / Preprocessing:


In [None]:
# Memory-efficient conversion for large datasets (Google Colab optimized)
def convert_list_columns_chunked(df, cols, chunk_size=100000):
    """
    Convert string columns with '^' separators into lists of values.

    Each cell of the listed columns is split on '^'. Tokens that look numeric
    become floats; every other non-blank token is kept as a string. Missing
    cells become empty lists. The frame is processed in row chunks to limit
    peak memory.

    Args:
        df: Input frame; it is not modified.
        cols: Names of the columns to convert. Names absent from ``df`` are
            skipped.
        chunk_size: Number of rows handled per chunk; must be positive.

    Returns:
        A new frame with the same rows and a fresh 0..n-1 index, in which the
        listed columns hold Python lists.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    def _parse_token(token):
        """Return the token as a float when it parses as one, else unchanged."""
        if token.replace('.', '').replace('-', '').isdigit():
            # The digit check is only a cheap pre-filter: it also lets through
            # malformed tokens such as "1.2.3" or "5-", which float() rejects.
            # Those stay as strings instead of aborting the whole conversion.
            try:
                return float(token)
            except ValueError:
                return token
        return token

    def _parse_cell(cell):
        """Split one cell on '^' and parse its non-blank tokens."""
        # astype(str) renders missing values as the literal 'nan' (or leaves
        # them missing, depending on the pandas string dtype); treat both as
        # "no values".
        if pd.isna(cell) or cell == 'nan':
            return []
        return [_parse_token(tok) for tok in cell.split('^') if tok.strip() != '']

    print(f"Processing {len(df)} rows in chunks of {chunk_size}...")

    # pd.concat() raises on an empty list, so an empty frame is returned as-is.
    if len(df) == 0:
        return df.copy().reset_index(drop=True)

    total_chunks = (len(df) - 1) // chunk_size + 1

    # Process in chunks to avoid memory issues
    chunks = []
    for start in range(0, len(df), chunk_size):
        chunk = df.iloc[start:start + chunk_size].copy()

        for col in cols:
            if col in chunk.columns:
                chunk[col] = chunk[col].astype(str).apply(_parse_cell)

        chunks.append(chunk)
        print(f"Processed chunk {start // chunk_size + 1}/{total_chunks}")

    return pd.concat(chunks, ignore_index=True)

# specify the columns you want to process
# These are the ads-table columns that are passed to the '^'-splitting helper.
cols_to_convert = [
    "ad_click_list_v001", "ad_click_list_v002", "ad_click_list_v003",
    "ad_close_list_v001", "ad_close_list_v002", "ad_close_list_v003",
    "u_newsCatInterestsST"
]

# The helper returns a new frame, so each variable is rebound to the converted
# copy; train and test go through the identical conversion.
print("Converting ads training data...")
ads_data_train = convert_list_columns_chunked(ads_data_train, cols_to_convert)
print("Converting ads test data...")
ads_data_test = convert_list_columns_chunked(ads_data_test, cols_to_convert)


In [None]:
# Now do the same to the feeds training and testing data.
# u_newsCatDislike is included because the per-user summary aggregates it as a
# list column alongside the other interest columns; left unconverted, each
# whole "a^b^c" string would be counted as one single value there.
cols_to_convert = [
    "u_newsCatInterests",
    "u_newsCatDislike",
    "u_newsCatInterestsST",
    "u_click_ca2_news",
    "i_entities",
]

print("Converting feeds training data...")
feeds_data_train = convert_list_columns_chunked(feeds_data_train, cols_to_convert)
print("Converting feeds test data...")
feeds_data_test = convert_list_columns_chunked(feeds_data_test, cols_to_convert)


## Define functions for frequency encoding and one hot encoding:


In [None]:
def frequency_encoder(df, cols, normalize=True):
    """
    Replace each listed column's values by how often they occur.

    Args:
        df: Input frame; a copy is encoded and returned.
        cols: Column names to encode. Names absent from ``df`` are skipped.
        normalize: When true, counts are divided by the number of rows;
            otherwise the raw counts are used.

    Returns:
        A copy of ``df`` with the listed columns frequency-encoded.
    """
    encoded = df.copy()
    n_rows = len(encoded)

    for name in cols:
        if name not in encoded.columns:
            continue

        # Occurrence count of every distinct value in the column
        counts = encoded[name].value_counts()
        lookup = counts / n_rows if normalize else counts

        # Overwrite the column with the looked-up frequency of each value
        encoded[name] = encoded[name].map(lookup)

    return encoded

def one_hot_encoder(df, cols):
    """
    Encode categorical variables using one-hot encoding.

    For every distinct value of a listed column a 0/1 indicator column named
    ``<col>_<value>`` is appended, and the original column is dropped.

    Args:
        df: Input frame; a copy is encoded and returned.
        cols: Column names to encode. Names absent from ``df`` are skipped.

    Returns:
        A copy of ``df`` with the listed columns replaced by indicators.
    """
    df_copy = df.copy()

    for col in cols:
        if col not in df_copy.columns:
            continue

        source = df_copy[col]

        # create new columns for each unique value
        for val in source.unique():
            if pd.isna(val):
                # A missing value never compares equal to anything, so an
                # equality test would leave this indicator all zeros; flag the
                # missing rows explicitly instead.
                indicator = source.isna()
            else:
                indicator = source == val
            df_copy[f"{col}_{val}"] = indicator.astype(int)

        # remove original column
        df_copy = df_copy.drop(columns=[col])

    return df_copy


## Summarize user preferences in feeds data


In [None]:
def summarize_feeds_user_features(feeds_df, top_n=5):
    """
    Summarize feeds data by user to extract user preferences.

    Rows are grouped by ``u_userId``. Numeric columns are averaged, list
    columns are reduced to each user's most frequent values, and the number
    of rows per user is reported as ``feed_count``.

    Args:
        feeds_df: Feeds frame containing the columns aggregated below.
        top_n: How many of the most frequent values to keep per list column.

    Returns:
        A frame with one row per user, ``u_userId`` as a regular column and
        ``feed_count`` as the last column.
    """
    from collections import Counter

    def top_n_from_list(lst, n=5):
        """Helper to extract top N most frequent values from a list column"""
        # `lst` is the list of one user's cells. Calling pd.isna() on it
        # returns an array, whose truth value is ambiguous for users with
        # several rows, so only "nothing at all" is tested here; missing
        # cells are filtered out individually further down.
        if lst is None or len(lst) == 0:
            return []

        # Flatten the per-row lists into one flat list of values
        flat_vals = []
        for item in lst:
            if isinstance(item, (list, tuple)):
                flat_vals.extend(item)
            else:
                flat_vals.append(item)

        flat_vals = [x for x in flat_vals if pd.notna(x)]

        # most_common() orders by count; ties keep first-seen order
        return [item for item, _count in Counter(flat_vals).most_common(n)]

    def top_values(series):
        """Reduce one user's cells of a list column to its top values."""
        return top_n_from_list(series.tolist(), top_n)

    # Summarize by user
    grouped = feeds_df.groupby('u_userId')
    feeds_summary = grouped.agg({
        'u_phonePrice': 'mean',
        'u_browserLifeCycle': 'mean',
        'u_refreshTimes': 'mean',
        'u_newsCatInterests': top_values,
        'u_newsCatDislike': top_values,
        'u_newsCatInterestsST': top_values,
        'u_click_ca2_news': top_values,
        'label': 'mean',
        'cillabel': 'mean',
    })

    # Rows per user; equivalent to counting the grouping key itself
    feeds_summary['feed_count'] = grouped.size()

    return feeds_summary.reset_index()


In [None]:
# Merge the per-user feeds summary into the ads training data.
# NOTE: only the training table is enriched here; the ads test table is not
# merged in this cell.
feeds_summary = summarize_feeds_user_features(feeds_data_train)

# The summary carries a `label` column (mean feeds label per user), and the
# modelling cells expect the ads table's own `label` column to survive the
# merge. With pandas' default suffixes every name shared by both tables would
# be renamed to `<name>_x` / `<name>_y`, so no plain `label` would remain.
# The ads columns therefore keep their names and clashing feeds-derived
# columns get a `_feeds` suffix.
ads_train_enriched = ads_data_train.merge(
    feeds_summary,
    left_on='user_id',
    right_on='u_userId',
    how='left',  # keep every ads row, including users with no feeds history
    suffixes=('', '_feeds'),
)


# Further cleaning: replace list-valued (vector) columns with fixed-width scalar columns:


In [None]:
from tqdm import tqdm
import gc  # Garbage collection for memory management

# --- Identify list-type columns ---
def _holds_list(value):
    """Tell whether a single cell contains a Python list."""
    return isinstance(value, list)

# A column counts as list-typed as soon as any one of its cells holds a list.
list_cols = [
    name
    for name in ads_train_enriched.columns
    if ads_train_enriched[name].apply(_holds_list).any()
]
print(f"List columns detected: {', '.join(list_cols)}")

def extract_fixed(x, n=5):
    """Return exactly ``n`` values taken from a list cell.

    Args:
        x: Cell content. A list, tuple, NumPy array or Series is truncated to
            its first ``n`` elements; ``None``/NaN counts as empty; any other
            scalar is treated as a one-element list.
        n: Length of the returned list.

    Returns:
        A new list of length ``n``, right-padded with zeros when the cell
        holds fewer than ``n`` values.
    """
    # The container test must come first: pd.isna() on a list returns an
    # array, and using that array as an `if` condition raises for any list
    # with two or more elements.
    if isinstance(x, (list, tuple, np.ndarray, pd.Series)):
        values = list(x)[:n]
    elif x is None or pd.isna(x):
        values = []
    else:
        values = [x]

    # Pad to fixed length (no-op when the cell already supplied n values)
    values.extend([0] * (n - len(values)))
    return values

# --- Memory-efficient expansion with chunked processing ---
def expand_list_columns_chunked(df, list_cols, chunk_size=50000, n=5):
    """Expand list columns into fixed-width scalar columns, chunk by chunk.

    Every column in ``list_cols`` is replaced by ``n`` new columns named
    ``<col>_1`` ... ``<col>_<n>``, filled through ``extract_fixed``.

    Args:
        df: Frame holding the list columns; the caller's object is not
            modified.
        list_cols: Names of the list-valued columns to expand.
        chunk_size: Number of rows expanded at a time; must be positive.
        n: Number of scalar columns produced per list column.

    Returns:
        A new frame with the list columns removed and the expanded columns
        appended, on the same index as ``df``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    print(f"Expanding {len(list_cols)} list columns in chunks of {chunk_size}...")

    # Process each list column one at a time to minimize memory usage
    for col in tqdm(list_cols, desc="Expanding columns"):
        print(f"\nExpanding: {col}")

        new_names = [f"{col}_{k + 1}" for k in range(n)]

        # Process in chunks
        expanded_chunks = []
        for start in range(0, len(df), chunk_size):
            chunk = df.iloc[start:start + chunk_size]

            # Extract n values per row -> matrix
            expanded_data = chunk[col].apply(lambda x: extract_fixed(x, n=n))
            # Reuse the chunk's own index: the axis=1 concat below aligns on
            # index labels, so a fresh 0..k-1 index would pair values with the
            # wrong rows whenever df is not on a default RangeIndex.
            expanded_chunks.append(
                pd.DataFrame(expanded_data.tolist(), columns=new_names, index=chunk.index)
            )

            # Clear memory
            del expanded_data
            gc.collect()

        # Combine all chunks for this column (pd.concat rejects an empty list,
        # which is what an empty frame would produce)
        if expanded_chunks:
            expanded_df = pd.concat(expanded_chunks)
        else:
            expanded_df = pd.DataFrame(columns=new_names, index=df.index)

        # Add to main dataframe
        df = pd.concat([df, expanded_df], axis=1)

        # Remove the original list column
        df = df.drop(columns=[col])

        # Clear memory
        del expanded_df, expanded_chunks
        gc.collect()

        print(f"Completed {col}")

    return df

# --- Expand list columns with memory management ---
ads_train_enriched = expand_list_columns_chunked(ads_train_enriched, list_cols)

# --- Print summary of final dataset ---
print("\n✅ Expansion complete!")
print(f"Final dimensions: {ads_train_enriched.shape[0]} rows x {ads_train_enriched.shape[1]} columns")

# Optional: quick check
print("\nDataset info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
ads_train_enriched.info()


# Try PCA:
- see if we can include the vector features in PCA as well
- scale and normalize data
- split into training and validation sets before training on the training set


# Try Logistic Regression:


## First, scale the data:


In [None]:
import random
random.seed(123)  # reproducibility
np.random.seed(123)

# --- 1. Split into training (80%) and validation (20%) ---
train_data, val_data = train_test_split(ads_train_enriched, test_size=0.2, random_state=123)

# --- 2. Select numeric predictors, excluding IDs and label ---
numeric_cols = train_data.select_dtypes(include=[np.number]).columns.tolist()
# u_userId is the feeds-side key that the merge keeps next to user_id; it is
# an identifier, not a feature.
excluded_cols = {"label", "user_id", "log_id", "u_userId"}
predictor_cols = [col for col in numeric_cols if col not in excluded_cols]

# --- 3. Remove constant / near-constant predictors ---
# std() skips NaN; an all-NaN column yields NaN, fails `> 0`, and is dropped
# here as well.
non_constant_cols = [col for col in predictor_cols
                    if train_data[col].std() > 0]
print(f"Removed {len(predictor_cols) - len(non_constant_cols)} constant predictors")
predictor_cols = non_constant_cols

# --- 4. Prepare train & validation as data frames ---
train_df = train_data[predictor_cols + ["label"]].copy()
val_df = val_data[predictor_cols + ["label"]].copy()

# Convert label to numeric 0/1
train_df['label'] = train_df['label'].astype(int)
val_df['label'] = val_df['label'].astype(int)

# --- 4b. Impute missing predictor values ---
# The left merge leaves NaN in the feeds-derived columns for every ads row
# whose user has no feeds summary, and LogisticRegression rejects NaN input.
# Medians come from the training split only, so nothing leaks from validation.
fill_values = train_df[predictor_cols].median()
train_df[predictor_cols] = train_df[predictor_cols].fillna(fill_values)
val_df[predictor_cols] = val_df[predictor_cols].fillna(fill_values)

# --- 5. Scale numeric predictors ---
# The scaler is fitted on the training split and only applied to validation.
scaler = StandardScaler()
train_df_scaled = train_df.copy()
val_df_scaled = val_df.copy()

train_df_scaled[predictor_cols] = scaler.fit_transform(train_df[predictor_cols])
val_df_scaled[predictor_cols] = scaler.transform(val_df[predictor_cols])

print(f"Training data shape: {train_df_scaled.shape}")
print(f"Validation data shape: {val_df_scaled.shape}")


In [None]:
# --- 6. Assemble predictor matrices and target vectors ---
X_train, y_train = train_df_scaled[predictor_cols], train_df_scaled['label']
X_val, y_val = val_df_scaled[predictor_cols], val_df_scaled['label']

# --- 7. Fit logistic regression ---
# fit() returns the estimator itself, so construction and fitting are chained.
logit_model = LogisticRegression(random_state=123, max_iter=1000).fit(X_train, y_train)

# --- 8. Predict on validation data ---
# Column 1 of predict_proba is the probability of the positive class.
val_pred_prob = logit_model.predict_proba(X_val)[:, 1]
val_pred_label = (val_pred_prob > 0.5).astype(int)

# --- 9. Evaluate accuracy ---
accuracy = accuracy_score(y_val, val_pred_label)
print(f"Validation Accuracy: {accuracy:.3f}")

# Per-class precision / recall / F1 alongside the single accuracy figure
print("\nClassification Report:")
print(classification_report(y_val, val_pred_label))
