In [None]:
!pip install -U lightgbm

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import ast
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgbm
import catboost as cb
import xgboost as xgb 
# from sentence_transformers import SentenceTransformer
from ast import literal_eval
import re
from datetime import datetime
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, TensorDataset
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)

In [None]:
from datetime import datetime

def get_formatted_time_date(now=None):
    """Return a timestamp label such as ``9-07-pm-5-jan-2025``.

    The label is ``hour-minute-am/pm-day-month-year`` using a 12-hour clock,
    no leading zero on the hour or the day, and a lower-cased abbreviated
    month name (as produced by ``strftime``).

    Args:
        now: Optional ``datetime`` to format. When omitted the current local
            time is used, so existing zero-argument calls behave as before.
            Passing a value makes the label reproducible.

    Returns:
        str: The formatted label.
    """
    if now is None:
        now = datetime.now()

    hour = now.strftime("%I").lstrip('0')   # 12-hour clock without leading zero
    minute = now.strftime("%M")
    am_pm = now.strftime("%p").lower()
    day = now.strftime("%d").lstrip('0')    # day of month without leading zero
    month = now.strftime("%b").lower()      # abbreviated month name, lower-cased
    year = now.strftime("%Y")
    return "-".join([hour, minute, am_pm, day, month, year])

In [None]:
# Load the competition's training and test tables from the Kaggle input directory.
# NOTE(review): a column name used later in this notebook starts with an
# invisible character (likely a byte-order mark read from the CSV header) —
# confirm before changing how these files are decoded.
train = pd.read_csv('/kaggle/input/bitfest-datathon-2025/train.csv')
test = pd.read_csv('/kaggle/input/bitfest-datathon-2025/test.csv')

In [None]:
# Work on a copy so the exploratory cells below leave `train` untouched.
df = train.copy()

In [None]:
# Per-column count of null cells and the same count as a percentage of all rows.
missing_values = df.isnull().sum()
missing_percentages = (missing_values / len(df)) * 100
# One row per column of `df`; displayed as the cell output.
missing_info = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage': missing_percentages
})
missing_info

In [None]:
# Plot missing values and percentages
plt.figure(figsize=(12, 15))

# Bar plot for missing percentages
plt.subplot(2,1, 2)
missing_info['Percentage'].plot(kind='bar', color='salmon')
plt.title('Missing Percentages')
plt.xlabel('Columns')
plt.ylabel('Percentage (%)')
plt.xticks(rotation=90)

plt.tight_layout()
plt.savefig('missing.png')
plt.show()

In [None]:
from wordcloud import WordCloud

# Join every non-null 'skills' entry into one space-separated corpus.
skills_text = ' '.join(df['skills'].dropna().astype(str))

# Build an 800x400 word cloud on a white background from that corpus.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(skills_text)

# Draw the cloud without axes, then save it to disk and display it.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud for Skills')
fig.tight_layout()
fig.savefig('wordcloud.png')
plt.show()

In [None]:
# Preview the first five rows of the raw training data.
train.head(5)

In [None]:
# Accumulator for the names of engineered numeric feature columns; later
# cells append to it and eventually rebuild it from the per-stage lists.
numeric_cols = []

# Date Parsing

In [None]:
# Start from a fresh copy of the training data and preview three rows.
df = train.copy() 
df.head(3)

In [None]:
# Keep only the two date columns to inspect their raw format.
df = df[['start_dates', 'end_dates']]
df.head(10)

In [None]:
# Markers meaning "this position has not ended yet".
_OPEN_ENDED_DATES = frozenset({'current', 'present', 'ongoing', 'till date', '∞', 'n/a'})

# Month names and common abbreviations -> month number.
_MONTH_NUMBERS = {
    'jan': 1, 'january': 1,
    'feb': 2, 'february': 2,
    'mar': 3, 'march': 3,
    'apr': 4, 'april': 4,
    'may': 5,
    'jun': 6, 'june': 6,
    'jul': 7, 'july': 7,
    'aug': 8, 'august': 8,
    'sep': 9, 'sept': 9, 'september': 9,
    'oct': 10, 'october': 10,
    'nov': 11, 'november': 11,
    'dec': 12, 'december': 12,
}

# Seasons are approximated by the first month of the (northern) season.
_SEASON_MONTHS = {'spring': 3, 'summer': 6, 'fall': 9, 'autumn': 9, 'winter': 12}

_NUMERIC_DATE = re.compile(r'^(\d{1,2})/(\d{4})$')                       # MM/YYYY
_NAMED_DATE = re.compile(r'^([a-z]+)\.?\s+(?:\d{1,2},\s*)?(\d{4})$')     # Month [DD,] YYYY
_YEAR_ONLY = re.compile(r'^(\d{4})$')                                    # YYYY


def parse_date(date_str: str, current=(6, 2022)):
    """Parse a loosely formatted date string into a ``(month, year)`` tuple.

    Recognised forms (case-insensitive, surrounding whitespace ignored):

    * ``MM/YYYY`` or ``M/YYYY``
    * ``Month YYYY`` / ``Mon. YYYY`` / ``Month DD, YYYY``
    * ``Season YYYY`` (spring, summer, fall/autumn, winter), mapped to the
      first month of the season. The original code listed a season pattern
      but never produced a result for it.
    * ``YYYY`` alone, reported as January of that year (years before 1900
      are rejected)
    * open-ended markers such as ``current`` or ``present``, reported as
      ``current``

    Args:
        date_str: The text to parse. Non-string values yield ``None``.
        current: ``(month, year)`` returned for open-ended markers. Defaults
            to ``(6, 2022)``, the value previously hard-coded here.

    Returns:
        tuple[int, int] | None: ``(month, year)``, or ``None`` when the text
        is not a string, ends in ``xx`` (masked date), has an out-of-range
        month, or matches no known format.
    """
    if not isinstance(date_str, str):
        return None

    text = date_str.strip().lower()

    if text in _OPEN_ENDED_DATES:
        return current

    # Masked dates such as "20XX" carry no usable information.
    if text.endswith('xx'):
        return None

    numeric = _NUMERIC_DATE.match(text)
    if numeric:
        month, year = int(numeric.group(1)), int(numeric.group(2))
        return (month, year) if 1 <= month <= 12 else None

    year_only = _YEAR_ONLY.match(text)
    if year_only:
        year = int(year_only.group(1))
        return None if year < 1900 else (1, year)

    named = _NAMED_DATE.match(text)
    if named:
        name, year = named.group(1), int(named.group(2))
        month = _MONTH_NUMBERS.get(name, _SEASON_MONTHS.get(name))
        if month is not None:
            return (month, year)

    return None

In [None]:
# Names of the raw date columns consumed by the experience-duration features.
date_cols = ['start_dates', 'end_dates']

In [None]:
def calculate_experience_duration(row):
    """Sum the length, in months, of every dated position in one row.

    ``row['start_dates']`` and ``row['end_dates']`` are expected to be lists
    (or string representations of lists) of date strings that pair up
    position by position. Each pair is parsed with ``parse_date``; pairs that
    cannot be parsed, or whose end is not after the start, contribute nothing.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with ``start_dates``
            and ``end_dates`` entries.

    Returns:
        int: Total months across all valid pairs; ``0`` when either entry is
        missing, malformed, or not a list.
    """
    def _as_list(value):
        # Cells arrive as text; parse them as Python *literals* only.
        # ast.literal_eval never executes code, unlike eval().
        if isinstance(value, str):
            try:
                return ast.literal_eval(value)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return None
        return value

    start_dates = _as_list(row['start_dates'])
    end_dates = _as_list(row['end_dates'])

    if not (isinstance(start_dates, list) and isinstance(end_dates, list)):
        return 0

    total_months = 0
    for start_str, end_str in zip(start_dates, end_dates):
        start_tuple = parse_date(start_str)
        end_tuple = parse_date(end_str)
        if not (start_tuple and end_tuple):
            continue

        start_month, start_year = start_tuple
        end_month, end_year = end_tuple

        months = (end_year - start_year) * 12 + (end_month - start_month)
        # Ignore zero-length and reversed intervals.
        if months > 0:
            total_months += months

    return total_months

# Apply the per-row duration calculation to a whole DataFrame that has
# 'start_dates' and 'end_dates' columns.
def process_experience_data(df):
    """Return a copy of ``df`` with experience-duration columns added.

    Adds ``duration_months`` (row-wise result of
    ``calculate_experience_duration``) and ``duration_years`` (months / 12).
    The input frame is left unmodified.
    """
    total_months = df.apply(calculate_experience_duration, axis=1)
    return df.assign(
        duration_months=total_months,
        duration_years=total_months / 12,
    )

In [None]:
# Numeric features derived from the start/end date columns.
date_numeric_cols = ['duration_months', 'duration_years']
# Register them once; re-running this cell must not append duplicates.
numeric_cols += [col for col in date_numeric_cols if col not in numeric_cols]

In [None]:
# Sanity-check the duration features on the training data.
df = train.copy()
# Full-range slice: selects every row (no subsetting happens here).
df = df[:]
# Keep only the raw date columns and the derived duration columns for display.
df = process_experience_data(df)[date_cols + date_numeric_cols]
df.head(3)

# Age, Experience Requirements Parsing

In [None]:
def parse_requirements(df):
    """Derive numeric min/max bounds from the textual requirement columns.

    Reads ``experiencere_requirement`` and ``age_requirement`` and returns a
    copy of ``df`` (the input is not modified) with these float columns added:

    - ``experience_min_years`` / ``experience_max_years``
    - ``experience_min_months`` / ``experience_max_months`` (years * 12)
    - ``age_min_years`` / ``age_max_years``

    Recognised phrasings: ``"at least X"`` (minimum only), ``"at most X"``
    (maximum only, age column only), and ``"X to Y"`` (both bounds). Missing
    values and text without enough numbers yield ``NaN`` bounds instead of
    raising.
    """
    import re

    number_pattern = re.compile(r'\d+')
    unknown = (np.nan, np.nan)

    def _parse_bounds(text, allow_at_most):
        # Returns (minimum, maximum); NaN marks an absent or unparseable bound.
        if pd.isna(text):
            return unknown

        text = str(text).lower().strip()
        values = [float(v) for v in number_pattern.findall(text)]

        if "at least" in text:
            return (values[0], np.nan) if values else unknown
        if allow_at_most and "at most" in text:
            return (np.nan, values[0]) if values else unknown
        # "to" also occurs inside ordinary words, so require two numbers.
        if "to" in text and len(values) >= 2:
            return values[0], values[1]
        return unknown

    def _bound_columns(column, allow_at_most):
        # Explicit float dtype: an all-missing column must stay numeric so the
        # arithmetic below (and any later mean-imputation) keeps working.
        bounds = [_parse_bounds(value, allow_at_most) for value in result[column]]
        lower = pd.Series([lo for lo, _ in bounds], index=result.index, dtype=float)
        upper = pd.Series([hi for _, hi in bounds], index=result.index, dtype=float)
        return lower, upper

    result = df.copy()

    exp_min, exp_max = _bound_columns('experiencere_requirement', allow_at_most=False)
    result['experience_min_years'] = exp_min
    result['experience_max_years'] = exp_max
    result['experience_min_months'] = exp_min * 12
    result['experience_max_months'] = exp_max * 12

    age_min, age_max = _bound_columns('age_requirement', allow_at_most=True)
    result['age_min_years'] = age_min
    result['age_max_years'] = age_max

    return result

In [None]:
# Numeric features derived from the experience/age requirement text.
req_numeric_cols = ['experience_min_years', 'experience_max_years', 'experience_min_months', 'experience_max_months', 'age_min_years', 'age_max_years']
# Register them once; re-running this cell must not append duplicates.
numeric_cols += [col for col in req_numeric_cols if col not in numeric_cols]

In [None]:
# Sanity-check the requirement parser on the first three training rows.
df = train.copy()
df = df[:3]
# Show only the derived requirement columns.
df = parse_requirements(df)[req_numeric_cols]
df

# Passing Year Parsing

In [None]:
def extract_max_passing_year(df, base_year=2000):
    """Add ``passing_year_max``: the latest passing year, offset by ``base_year``.

    Parameters:
    df (pandas.DataFrame): Input DataFrame with a 'passing_years' column whose
        cells may be numbers, lists/tuples/arrays of years, or strings such
        as ``"['2018', '2020']"``.
    base_year (int): Value subtracted from the extracted year. Defaults to
        2000, the offset previously hard-coded here.

    Returns:
    pandas.DataFrame: Copy of ``df`` with a float 'passing_year_max' column;
        rows without a usable year hold NaN.
    """
    def parse_years(value):
        try:
            # Containers first: pd.isna() on a list returns an array, which
            # cannot be used as a truth value.
            if isinstance(value, (list, tuple, np.ndarray)):
                years = [int(year) for year in value if str(year).isdigit()]
                return max(years) if years else None

            if pd.isna(value):
                return None

            # Plain numeric cell.
            if isinstance(value, (int, float)):
                return int(value)

            # String holding a single year or a list of years: drop quotes and
            # brackets, then keep the comma-separated all-digit tokens.
            if isinstance(value, str):
                cleaned = value.replace("'", "").replace('"', "").replace("[", "").replace("]", "")
                tokens = (part.strip() for part in cleaned.split(','))
                years = [int(token) for token in tokens if token.isdigit()]
                return max(years) if years else None

        except (ValueError, TypeError):
            return None

        return None

    # Work on a copy so the caller's frame is left untouched.
    result_df = df.copy()

    # Cast to float before subtracting: when no row yields a year the parsed
    # column holds only None (object dtype) and arithmetic on it would raise.
    parsed = result_df['passing_years'].apply(parse_years).astype(float)
    result_df['passing_year_max'] = parsed - base_year

    return result_df

In [None]:
# Numeric feature produced by extract_max_passing_year.
year_numeric_cols = ['passing_year_max']

In [None]:
# Sanity-check the passing-year extraction on the first three training rows.
df = train.copy()
df = df[:3]
# Show only the derived passing-year column.
df = extract_max_passing_year(df)[year_numeric_cols]
df

# Total Numeric Columns

In [None]:
# Final list of engineered numeric features, rebuilt from the per-stage lists
# (this replaces whatever earlier cells accumulated in `numeric_cols`).
numeric_cols = date_numeric_cols + req_numeric_cols + year_numeric_cols

In [None]:
def prepare_numeric_cols(df):
    """Run all numeric feature extractors and mean-impute the results.

    Applies, in order, ``process_experience_data``, ``parse_requirements`` and
    ``extract_max_passing_year``, then fills missing values in the
    ``numeric_cols`` columns with each column's mean over the frame being
    processed.
    """
    enriched = extract_max_passing_year(
        parse_requirements(
            process_experience_data(df)
        )
    )
    column_means = enriched[numeric_cols].mean()
    enriched[numeric_cols] = enriched[numeric_cols].fillna(column_means)
    return enriched

In [None]:
# Build every numeric feature for the training data and preview three rows.
df = train.copy() 
df = prepare_numeric_cols(df)
df.head(3)[numeric_cols]

# Embedders

In [None]:
# Default `max_features` for GroupHybridEmbedder (passed to its TF-IDF, count
# and hashing vectorizers).
MAX_FEATURES = 224
# Default `vector_size` for GroupHybridEmbedder (Word2Vec embedding dimension).
VECTOR_SIZE = 100

In [None]:
# Text columns that are turned into embedding features.
columns_to_embed = [
    'educationaL_requirements',
    # This column name begins with a byte-order mark (U+FEFF). It is written
    # as an explicit escape so the otherwise invisible character is not lost
    # when the cell is edited or copied.
    '\ufeffjob_position_name',
    'responsibilities',
    'skills_required',
    'degree_names',
    'major_field_of_studies',
    'positions',
    'related_skils_in_job',
    'skills',
    'career_objective',
    'professional_company_names',
    'experiencere_requirement',
]

# Group Embedders

In [None]:
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np
import pandas as pd
import ast

class GroupHybridEmbedder:
    """Fit text embedders per column group and turn a column into features.

    Columns are organised into groups (``column_groups``). For each group,
    ``fit`` trains one Word2Vec model, one TF-IDF vectorizer, one count
    vectorizer (used for SVD and BM25) and one stateless hashing vectorizer on
    the combined text of all the group's columns. ``transform`` then encodes
    a single column with the embedders of the group it belongs to.

    Args:
        max_features: Vocabulary / feature cap for the TF-IDF, count and
            hashing vectorizers.
        vector_size: Word2Vec embedding dimension.
        window: Word2Vec context window.
        min_count: Word2Vec minimum token frequency.
        k1: BM25 term-frequency saturation parameter.
        b: BM25 length-normalisation parameter.
        random_state: Seed for the TruncatedSVD models so repeated fits give
            the same components.
    """

    # Columns holding plain free text. Every other column is treated as a
    # (stringified) Python list. The job-position column name begins with a
    # byte-order mark (U+FEFF), written as an escape so it stays visible.
    PLAIN_TEXT_COLUMNS = (
        'experiencere_requirement',
        'career_objective',
        '\ufeffjob_position_name',
    )

    # Embedding blocks in the order they are concatenated by transform().
    EMBEDDING_TYPES = ('w2v', 'tfidf', 'hash', 'svd', 'bm25', 'pool')

    def __init__(self, max_features=MAX_FEATURES, vector_size=VECTOR_SIZE, window=5, min_count=1, k1=1.5, b=0.75,
                 random_state=42):
        self.max_features = max_features
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.k1 = k1
        self.b = b
        self.random_state = random_state

        # Fitted artefacts, keyed by group name.
        self.w2v_models = {}
        self.tfidf_vectors = {}
        self.hash_vectors = {}
        self.count_vectors = {}
        self.svd_models = {}
        self.avdl = {}        # average document length (count-vector tokens)
        self.bm25_idf = {}    # BM25 inverse document frequencies from the fit corpus
        self.is_fitted = {}

        # Define column groups
        self.column_groups = {
            'skills': [
                'skills_required',
                'related_skils_in_job',
                'skills',
                'responsibilities'
            ],
            'education': [
                'educationaL_requirements',
                'degree_names',
                'major_field_of_studies'
            ],
            'position': [
                '\ufeffjob_position_name',
                'positions',
                'career_objective'
            ],
            'experience': [
                'professional_company_names',
                'experiencere_requirement'
            ]
        }

        # Reverse mapping from column to group
        self.column_to_group = {
            col: group for group, cols in self.column_groups.items()
            for col in cols
        }

    def preprocess_text(self, text):
        """Lower-case free text and blank out quotes and bullet markers."""
        if pd.isna(text):
            return ''
        text = str(text).lower().strip()
        text = text.replace('"', ' ').replace("'", ' ')
        text = text.replace('- ', ' ').replace('• ', ' ')
        return text

    def preprocess_for_w2v(self, text):
        """Tokenise text for Word2Vec: normalise, then split on whitespace."""
        return self.preprocess_text(text).split()

    def preprocess_list(self, text):
        """Flatten a list (or its string representation) into one string.

        Truthy items are joined with single spaces. Values that are missing
        give ``''``; strings that are not a valid list literal, and other
        non-list values, are returned as ``str(text)``.
        """
        # Real containers are handled before pd.isna(), which would return an
        # array (not a boolean) for them.
        if isinstance(text, (list, tuple)):
            return ' '.join(str(item) for item in text if item)
        if pd.isna(text):
            return ''
        if isinstance(text, str):
            try:
                items = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a Python literal: keep the raw text.
                return text
            if isinstance(items, list):
                return ' '.join(str(item) for item in items if item)
        return str(text)

    def _prepare_texts(self, values, column):
        """Normalise the raw cells of ``column`` according to its kind."""
        prepare = self.preprocess_text if column in self.PLAIN_TEXT_COLUMNS else self.preprocess_list
        return [prepare(value) for value in values]

    def _combine_group_texts(self, df, group_columns):
        """Combine texts from all columns in a group into a single corpus"""
        all_texts = []
        for column in group_columns:
            all_texts.extend(self._prepare_texts(df[column], column))
        return all_texts

    def fit(self, df):
        """Fit embedders for each group using combined texts from all columns in the group.

        A failure while fitting one group is reported on stdout and the group
        is left unfitted (``transform`` raises for its columns); the remaining
        groups are still fitted.

        Returns:
            GroupHybridEmbedder: ``self``.
        """
        for group, columns in self.column_groups.items():
            print(f"Fitting group: {group}")
            try:
                processed_texts = self._combine_group_texts(df, columns)
                w2v_texts = [self.preprocess_for_w2v(t) for t in processed_texts]

                self.w2v_models[group] = Word2Vec(
                    sentences=w2v_texts,
                    vector_size=self.vector_size,
                    window=self.window,
                    min_count=self.min_count,
                    workers=4
                )

                self.tfidf_vectors[group] = TfidfVectorizer(
                    max_features=self.max_features,
                    ngram_range=(1, 3),
                    token_pattern=r'(?u)\b\w[\w-]*\w\b',
                    min_df=1
                )
                self.tfidf_vectors[group].fit(processed_texts)

                # Stateless: hashing needs no fitting.
                self.hash_vectors[group] = HashingVectorizer(
                    n_features=self.max_features,
                    ngram_range=(1, 2),
                    alternate_sign=False
                )

                self.count_vectors[group] = CountVectorizer(
                    max_features=self.max_features,
                    ngram_range=(1, 2)
                )
                count_matrix = self.count_vectors[group].fit_transform(processed_texts)

                # TruncatedSVD needs fewer components than features.
                n_features = count_matrix.shape[1]
                n_components = min(50, n_features - 1)

                if n_components > 0:
                    self.svd_models[group] = TruncatedSVD(
                        n_components=n_components,
                        random_state=self.random_state
                    )
                    self.svd_models[group].fit(count_matrix)
                else:
                    self.svd_models[group] = None

                self.avdl[group] = count_matrix.sum(1).mean()

                # BM25 IDF belongs to the fitted corpus. Computing it here
                # (instead of from whichever batch is transformed) gives the
                # same document the same features in train and test.
                n_docs = count_matrix.shape[0]
                doc_freqs = np.asarray((count_matrix > 0).sum(axis=0)).ravel()
                self.bm25_idf[group] = np.log((n_docs - doc_freqs + 0.5) / (doc_freqs + 0.5))

                self.is_fitted[group] = True

            except Exception as e:
                print(f"Error fitting group {group}: {str(e)}")
                continue
        return self

    def transform(self, texts, column_name, embedding_types=None):
        """
        Transform texts using selected embedding types

        Args:
            texts: Input texts to transform
            column_name: Name of the column being transformed
            embedding_types: List of embedding types to use. Options:
                - 'w2v': Word2Vec embeddings
                - 'tfidf': TF-IDF features
                - 'hash': Hash features
                - 'svd': SVD features
                - 'bm25': BM25 features
                - 'pool': Pooled features (row mean and max of the TF-IDF
                  matrix; available with or without 'tfidf')
                If None, uses all embedding types

        Returns:
            numpy.ndarray: 2-D array with one row per input text; the selected
            blocks are concatenated column-wise in the order listed above. If
            no block is produced the array has shape ``(len(texts), 0)``.

        Raises:
            KeyError: If ``column_name`` does not belong to any group.
            ValueError: If the column's group has not been fitted.
        """
        group = self.column_to_group[column_name]
        if not self.is_fitted.get(group):
            raise ValueError(f"Models for group {group} must be fitted first")

        embedding_types = embedding_types or list(self.EMBEDDING_TYPES)

        processed_texts = self._prepare_texts(texts, column_name)

        embeddings_list = []

        if 'w2v' in embedding_types:
            word_vectors_model = self.w2v_models[group].wv
            w2v_embeddings = []
            for words in (self.preprocess_for_w2v(t) for t in processed_texts):
                word_vectors = [word_vectors_model[word] for word in words if word in word_vectors_model]
                # Texts without any known token map to the zero vector.
                embedding = np.mean(word_vectors, axis=0) if word_vectors else np.zeros(self.vector_size)
                w2v_embeddings.append(embedding)
            embeddings_list.append(np.array(w2v_embeddings))

        # The TF-IDF matrix feeds both the 'tfidf' and the 'pool' blocks.
        tfidf_matrix = None
        if 'tfidf' in embedding_types or 'pool' in embedding_types:
            tfidf_matrix = self.tfidf_vectors[group].transform(processed_texts).toarray()

        if 'tfidf' in embedding_types:
            embeddings_list.append(tfidf_matrix)

        if 'hash' in embedding_types:
            hash_matrix = self.hash_vectors[group].transform(processed_texts).toarray()
            embeddings_list.append(hash_matrix)

        if 'svd' in embedding_types:
            count_matrix = self.count_vectors[group].transform(processed_texts)
            if self.svd_models[group] is not None:
                svd_matrix = self.svd_models[group].transform(count_matrix)
                embeddings_list.append(svd_matrix)

        if 'bm25' in embedding_types:
            try:
                count_matrix = self.count_vectors[group].transform(processed_texts)
                dl = np.asarray(count_matrix.sum(axis=1)).ravel()   # document lengths
                tf = count_matrix.toarray()
                numerator = tf * (self.k1 + 1)
                denominator = tf + self.k1 * (1 - self.b + self.b * dl[:, np.newaxis] / self.avdl[group])
                bm25_matrix = (numerator / denominator) * self.bm25_idf[group]
                embeddings_list.append(bm25_matrix)
            except Exception as e:
                print(f"Warning: BM25 calculation failed for group {group}: {str(e)}")

        if 'pool' in embedding_types:
            mean_pool = np.mean(tfidf_matrix, axis=1)
            max_pool = np.max(tfidf_matrix, axis=1)
            embeddings_list.append(np.column_stack([mean_pool, max_pool]))

        if not embeddings_list:
            # Keep the result 2-D so callers can always read shape[1].
            return np.empty((len(processed_texts), 0))
        return np.hstack(embeddings_list)

# Helper functions
def fit_group_embedders(train_df):
    """Create a GroupHybridEmbedder with default settings, fit it on ``train_df`` and return it."""
    return GroupHybridEmbedder().fit(train_df)

def get_group_embeddings_df(df, embedder, column, embedding_types=None):
    """Embed ``df[column]`` and return the features as a DataFrame.

    The result shares ``df``'s index and has one column per embedding
    dimension, named ``{column}_embeddings{i}``.
    """
    matrix = embedder.transform(df[column], column, embedding_types)
    feature_names = [f'{column}_embeddings{i}' for i in range(matrix.shape[1])]
    return pd.DataFrame(matrix, index=df.index, columns=feature_names)

In [None]:
# Fit the group embedders on the training data only.
# Earlier variant (per-column embedders), kept for reference:
# fitted_embedder = fit_embedders(train)
fitted_embedder = fit_group_embedders(train)
# Alternative that would also fit on the test data (not used):
# fitted_embedder = fit_group_embedders(train, test)

In [None]:
def preprocess(df, embedding_types=None):
    """Build the full feature frame: numeric features plus text embeddings.

    Args:
        df: Raw input frame (train or test).
        embedding_types: Embedding blocks to compute for every text column,
            forwarded to ``get_group_embeddings_df``; ``None`` selects all.

    Returns:
        pandas.DataFrame: ``prepare_numeric_cols(df)`` with the embedding
        columns of every entry in ``columns_to_embed`` appended, in that order.
    """
    df = prepare_numeric_cols(df)

    # Collect all embedding frames first and concatenate once; concatenating
    # inside the loop would re-copy the ever-wider frame on each iteration.
    embedding_frames = [
        get_group_embeddings_df(df, fitted_embedder, column, embedding_types)
        for column in columns_to_embed
    ]
    return pd.concat([df, *embedding_frames], axis=1)

In [None]:
# embedding_types = ["w2v"]

# train_df = preprocess(train, embedding_types)
# test_df = preprocess(test, embedding_types)

In [None]:
# train_df.columns

In [None]:
# Number of cross-validation folds.
# NOTE(review): `params['global']['kfold_n_splits']` further down repeats this
# value — confirm which of the two is meant to be the source of truth.
KFOLD_N_SPLITS = 5

# NN Architecture 

In [None]:
from typing import List, Union, Optional

def generate_architecture(input_dim: int, num_layers: int, initial_width: Optional[int] = None) -> List[int]:
    """Return the hidden-layer widths for a network with ``num_layers`` hidden layers.

    The last (up to) three layers always use the fixed widths 32, 16, 8. The
    layers before them start at ``initial_width`` and are halved at regular
    intervals, never dropping below a width of 1.

    Args:
        input_dim: Number of input features; only used to choose a default
            ``initial_width``.
        num_layers: Total number of hidden layers. Values of 0 or less give
            an empty list.
        initial_width: Width of the first hidden layer. When ``None`` it is
            the next power of two >= ``input_dim``, rounded up to a multiple
            of 32 and capped at 512.

    Returns:
        List[int]: Hidden-layer widths, first layer first.
    """
    if initial_width is None:
        # Next power of two >= input_dim ...
        initial_width = 2 ** (input_dim - 1).bit_length()
        # ... rounded up to a multiple of 32 and capped at 512.
        initial_width = ((initial_width + 31) // 32) * 32
        initial_width = min(512, initial_width)

    # Everything except the three fixed tail layers forms the "body".
    body_layers = max(0, num_layers - 3)
    halving_frequency = max(1, body_layers // 3)  # halve the width every n body layers

    hidden_dims = []
    current_dim = initial_width
    for i in range(body_layers):
        if i > 0 and i % halving_frequency == 0:
            # Floor at 1: repeated halving must never yield a zero-width layer.
            current_dim = max(1, current_dim // 2)
        hidden_dims.append(current_dim)

    # Fixed tail: [32, 16, 8], truncated from the front for very shallow nets.
    if num_layers >= 1:
        hidden_dims.extend([32, 16, 8][-min(num_layers, 3):])

    return hidden_dims

class FlexibleNN(nn.Module):
    """Fully connected regressor whose hidden widths come from ``generate_architecture``.

    Every hidden layer is Linear -> optional BatchNorm -> activation ->
    optional Dropout, optionally summed with a separate linear projection of
    the layer's input (the residual path). The final layer is a plain Linear
    with a single output.
    """

    def __init__(
        self,
        input_dim: int,
        num_layers: int,
        initial_width: Optional[int] = None,
        activation: str = 'relu',
        dropout_rate: float = 0.5,
        use_batch_norm: bool = True,
        use_residual: bool = True
    ):
        super().__init__()

        hidden_dims = generate_architecture(input_dim, num_layers, initial_width)

        self.use_batch_norm = use_batch_norm
        self.use_residual = use_residual

        # Look the activation up in a table of factories.
        activation_factories = {
            'relu': nn.ReLU,
            'leaky_relu': lambda: nn.LeakyReLU(negative_slope=0.01),
            'gelu': nn.GELU,
        }
        if activation not in activation_factories:
            raise ValueError(f"Unsupported activation: {activation}")
        self.activation = activation_factories[activation]()

        self.layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList()
        self.residual_layers = nn.ModuleList()

        # Consecutive width pairs: input -> hidden... -> single output.
        widths = [input_dim, *hidden_dims, 1]
        output_index = len(widths) - 2
        for idx, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            self.layers.append(nn.Linear(fan_in, fan_out))

            # The output layer gets neither normalisation nor a residual path.
            if idx == output_index:
                continue
            if use_batch_norm:
                self.batch_norms.append(nn.BatchNorm1d(fan_out))
            if use_residual:
                self.residual_layers.append(nn.Linear(fan_in, fan_out))

        self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else None

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through all hidden layers and the output layer."""
        output_index = len(self.layers) - 1
        for idx, layer in enumerate(self.layers):
            if idx == output_index:
                x = layer(x)
                continue

            # Project the layer input so it can be added to the layer output.
            shortcut = self.residual_layers[idx](x) if self.use_residual else None

            hidden = layer(x)
            if self.use_batch_norm:
                hidden = self.batch_norms[idx](hidden)
            hidden = self.activation(hidden)
            if self.dropout is not None:
                hidden = self.dropout(hidden)

            x = hidden + shortcut if self.use_residual else hidden

        return x

# Example usage:
# small_net = FlexibleNN(input_dim=10, num_layers=6)  # Similar to original NNModel
# medium_net = FlexibleNN(input_dim=10, num_layers=7, initial_width=512)  # Similar to BNNModel
# large_net = FlexibleNN(input_dim=10, num_layers=12, initial_width=512)  # Similar to BigNNModel

# Custom network with automatic width determination:
# custom_net = FlexibleNN(
#     input_dim=20,
#     num_layers=5,  # Will automatically determine appropriate layer sizes
#     activation='gelu',
#     dropout_rate=0.3
# )

In [None]:
# Name fragments identifying the embedding columns that are used as features.
cols = [
    'educationaL_requirements_embeddings',
    # The job-position column name begins with a byte-order mark (U+FEFF).
    # It is written as an explicit escape so the otherwise invisible character
    # is not lost when the cell is edited or copied.
    '\ufeffjob_position_name_embeddings',
    'responsibilities_embeddings',
    'skills_required_embeddings',
    'degree_names_embeddings',
    'major_field_of_studies_embeddings',
    'positions_embeddings',
    'related_skils_in_job_embeddings',
    'skills_embeddings',
    'career_objective_embeddings',
    # 'professional_company_names_embeddings',
    # 'experiencere_requirement_embeddings'
]
# Feature list; left empty here because the selection code below is disabled.
feature_cols = []
# for col in train_df.columns:
#     ok = False
#     for x in cols:
#         if x in col:
#             ok = True
#     if ok:
#         feature_cols.append(col)

# feature_cols += numeric_cols

In [None]:
# Number of model input features currently selected.
len(feature_cols)

In [None]:
# Hyper-parameters for every supported model type, keyed by model name.
# GPU training is requested only when a CUDA device is visible (checked via
# torch); otherwise LightGBM and XGBoost fall back to the CPU instead of
# failing on a CPU-only runtime.
_GPU_AVAILABLE = torch.cuda.is_available()

params = {
    'global': {
        'kfold_n_splits': 5
    },
    'randomforest': {
        'n_estimators': 100,           # Number of trees in the forest
        'max_depth': None,             # Maximum depth of the trees (None for unlimited)
        'min_samples_split': 2,        # Minimum samples required to split an internal node
        'min_samples_leaf': 1,         # Minimum samples required to be at a leaf node
        'max_features': 'sqrt',        # Number of features to consider when looking for the best split
        'bootstrap': True,             # Whether bootstrap samples are used when building trees
        'oob_score': True,             # Whether to use out-of-bag samples to estimate the generalization score
        'warm_start': False,           # Whether to reuse the solution of the previous call to fit
        'criterion': 'squared_error'   # The function to measure the quality of a split
    },
    'lightgbm': {
        'objective': 'regression_l2',
        'metric': 'l2',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'max_depth': 9,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'device': 'gpu' if _GPU_AVAILABLE else 'cpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
        'gpu_use_dp': True
    },
    'xgboost': {
        'objective': 'reg:squarederror',
        'learning_rate': 0.01,
        'max_depth': 10,
        'alpha': 0.1,
        'lambda': 0.1,
        'colsample_bytree': 0.9,
        'tree_method': 'hist',  # Use the 'hist' method for histogram-based training
        'device': 'cuda' if _GPU_AVAILABLE else 'cpu',  # Use GPU with CUDA when present
        # 'predictor': 'gpu_predictor',  # Use GPU for prediction
        'subsample': 0.8,  # Use 80% of the data to train each tree
        'gamma': 0.1,  # Minimum loss reduction to make a further partition
        'n_estimators': 1000,  # Number of boosting rounds
        'scale_pos_weight': 1,  # For imbalanced classes
        'min_child_weight': 1,  # Minimum sum of instance weight in a child
        'max_bin': 256,  # Number of bins for histogram-based methods
        'booster': 'gbtree',  # Use tree-based model
        'max_leaves': 31,  # Maximum number of leaves in a tree
        'num_parallel_tree': 1,  # Number of trees to grow in parallel
        # 'verbose': 1
    },
    'catboost': {
        'iterations': 100,
        'learning_rate': 0.05,
        'depth': 9,
        'l2_leaf_reg': 3,
        'random_seed': 42,
        'loss_function': 'RMSE'
    },
    'neuralnetwork': {
        'num_layers': 7,
        'dropout_rate': 0.5,
        'use_batch_norm': True,
        'use_residual': True,
        'learning_rate': 0.001,
        'epochs': 1000,
        'batch_size': 32
    }
}

In [None]:
def train_model(model_type='lightgbm', comment=""):
    """Cross-validate one model family, average its test predictions, and save a submission.

    The function reads the notebook-level globals ``params``, ``train_df``,
    ``test_df`` and ``feature_cols``. For every fold it fits a fresh model on
    ``train_df[feature_cols]`` against the ``'matched_score'`` column, scores it
    with mean squared error on the held-out part, and adds that fold's test
    predictions (divided by the number of folds) to a running average.

    Args:
        model_type: One of ``'randomforest'``, ``'lightgbm'``, ``'xgboost'``,
            ``'catboost'`` or ``'neuralnetwork'``.
        comment: Free text embedded in the submission file name.

    Returns:
        The mean of the per-fold validation MSE values.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.

    Side effects:
        Prints progress and writes
        ``submission_<comment>_<timestamp>_<model_type>_<mean MSE>.csv`` with the
        columns ``ID`` and ``matched_score`` to the working directory.

    Notes:
        The neural network is trained full-batch (one optimizer step per epoch
        on the whole training fold); ``params['neuralnetwork']['batch_size']``
        is not read here. Validation runs every 100 epochs and the weights with
        the lowest validation loss seen at those checkpoints are restored.
    """
    supported_models = ('randomforest', 'lightgbm', 'xgboost', 'catboost', 'neuralnetwork')
    if model_type not in supported_models:
        # Fail before any training instead of hitting an unbound `val_preds` later.
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported_models}"
        )

    # Check if GPU is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    kf = KFold(n_splits=params['global']['kfold_n_splits'], shuffle=True, random_state=42)
    cv_scores = []
    test_preds = np.zeros(len(test_df))

    for fold, (train_idx, val_idx) in enumerate(kf.split(train_df)):
        print(f"Training fold {fold + 1}...")
        X_train, X_val = train_df.iloc[train_idx][feature_cols], train_df.iloc[val_idx][feature_cols]
        y_train, y_val = train_df.iloc[train_idx]['matched_score'], train_df.iloc[val_idx]['matched_score']

        if model_type == 'randomforest':
            model = RandomForestRegressor(
                **params['randomforest'],
                n_jobs=-1,  # Use all available cores
                random_state=42
            )
            model.fit(X_train, y_train)
            val_preds = model.predict(X_val)
        elif model_type == 'lightgbm':
            train_data = lgbm.Dataset(X_train, label=y_train)
            val_data = lgbm.Dataset(X_val, label=y_val)
            model = lgbm.train(
                params=params['lightgbm'],
                train_set=train_data,
                num_boost_round=1000,
                valid_sets=[train_data, val_data],
                callbacks=[lgbm.early_stopping(stopping_rounds=50)]
            )
            val_preds = model.predict(X_val)
        elif model_type == 'xgboost':
            model = xgb.XGBRegressor(**params['xgboost'],
                                    early_stopping_rounds=50,
                                    enable_categorical=False
                                    )
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
            val_preds = model.predict(X_val)
        elif model_type == 'catboost':
            model = cb.CatBoostRegressor(
                **params['catboost'],
                early_stopping_rounds=20,
                verbose=100
            )
            model.fit(
                X_train, y_train,
                eval_set=(X_val, y_val),
                use_best_model=True,
                verbose=False
            )
            val_preds = model.predict(X_val)
        elif model_type == 'neuralnetwork':
            nn_params = params['neuralnetwork']
            n_epochs = nn_params['epochs']

            # Prepare data for PyTorch
            X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(device)
            X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32).to(device)
            y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).to(device)
            y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).to(device)

            input_dim = X_train.shape[1]

            model = FlexibleNN(input_dim,
                               num_layers=nn_params['num_layers'],
                               dropout_rate=nn_params['dropout_rate'],
                               use_batch_norm=nn_params['use_batch_norm'],
                               use_residual=nn_params['use_residual']
                              )
            model.to(device)

            criterion = nn.MSELoss()
            optimizer = optim.Adam(model.parameters(), lr=nn_params['learning_rate'])

            # Training loop
            model.train()
            best_val_loss = float('inf')
            # Reset per fold so a fold can never restore another fold's weights.
            best_model_state = None
            for epoch in range(n_epochs):
                optimizer.zero_grad()
                outputs = model(X_train_tensor)
                loss = criterion(outputs.squeeze(), y_train_tensor)
                loss.backward()
                optimizer.step()

                if (epoch+1)%100 == 0:
                    # Validation
                    model.eval()
                    with torch.no_grad():
                        val_outputs = model(X_val_tensor)
                        val_loss = criterion(val_outputs.squeeze(), y_val_tensor)

                    print(f"Epoch [{epoch + 1}/{n_epochs}], "
                          f"Train Loss: {loss.item():.4f}, Validation Loss: {val_loss.item():.4f}")
                    if val_loss.item() < best_val_loss:
                        best_val_loss = val_loss.item()
                        # state_dict() hands back references to the live tensors, which the
                        # optimizer keeps updating in place; clone them so this really is a
                        # frozen snapshot of the best checkpoint.
                        best_model_state = {
                            name: tensor.detach().clone()
                            for name, tensor in model.state_dict().items()
                        }
                    model.train()

            print(f"Best Loss: {best_val_loss}")
            # No snapshot exists when no validation checkpoint improved on +inf
            # (fewer than 100 epochs, or NaN losses); keep the final weights then.
            if best_model_state is not None:
                model.load_state_dict(best_model_state)
            # Validation
            model.eval()
            with torch.no_grad():
                val_preds = model(X_val_tensor).cpu().numpy().squeeze()

        fold_score = mean_squared_error(y_val, val_preds)
        cv_scores.append(fold_score)

        # For Neural Network, predictions should be made by passing data through the model
        if model_type == 'neuralnetwork':
            test_tensor = torch.tensor(test_df[feature_cols].values, dtype=torch.float32).to(device)
            # The model is still in eval mode here; no_grad avoids building an autograd graph.
            with torch.no_grad():
                test_preds += model(test_tensor).cpu().numpy().squeeze() / kf.n_splits
        else:
            test_preds += model.predict(test_df[feature_cols]) / kf.n_splits

        # Running mean/std over the folds completed so far.
        print(f"MSE: {np.mean(cv_scores):.6f} ± {np.std(cv_scores):.6f}")
    filename=f"submission_{comment}_{get_formatted_time_date()}_{model_type}_{np.mean(cv_scores):.6f}.csv"
    pd.DataFrame({
        'ID': test_df['ID'],
        'matched_score': test_preds
    }).to_csv(filename, index=False)
    print(f"Submission saved to {filename}")
    return np.mean(cv_scores)

In [None]:
# Embedding configurations to compare: five single embeddings, one three-way
# combination, and finally every type together.
all_types = ['w2v', 'tfidf', 'hash', 'svd', 'bm25', 'pool']
embedding_types_list = [
    ["w2v"],
    ["tfidf"],
    ["hash"],
    ["svd"],
    ["bm25"],
    ["w2v", "tfidf", "hash"],
    all_types,
]

for embedding_types in embedding_types_list:
    # train_model() reads train_df / test_df / feature_cols as globals, so they
    # are rebuilt here for each configuration before it is called.
    train_df = preprocess(train, embedding_types)
    test_df = preprocess(test, embedding_types)

    # Keep every column whose name contains at least one entry of `cols`,
    # then append the numeric columns.
    feature_cols = [
        column
        for column in train_df.columns
        if any(marker in column for marker in cols)
    ]
    feature_cols.extend(numeric_cols)

    print(len(feature_cols))

    run_label = "_".join(embedding_types)
    train_model('neuralnetwork', comment=run_label)

In [None]:
# for model_name in ['randomforest', 'lightgbm', 'xgboost', 'catboost', 'neuralnetwork']:
#     train_model(model_name)
    
# train_model('randomforest')
# train_model('lightgbm')
# train_model('xgboost')
# train_model('catboost')
# train_model('neuralnetwork')

In [None]:
# from copy import deepcopy

# def parameter_sweep():
#     # Initialize results list
#     results = []
    
#     # Base parameters (current defaults)
#     base_params = {
#         'num_layers': 7,
#         'dropout_rate': 0.5,
#         'use_batch_norm': True,
#         'use_residual': True,
#         'learning_rate': 0.001,
#         'epochs': 1000,
#         'batch_size': 32
#     }
    
#     # Parameter ranges to test
#     param_ranges = {
#         'num_layers': range(3, 15),
#         'dropout_rate': np.arange(0, 1.0, 0.1),
#         'use_batch_norm': [True, False],
#         'use_residual': [True, False]
#     }

#     # param_ranges = {
#     #     # 'num_layers': range(5, 6),
#     #     # 'dropout_rate': np.arange(0, 0.9, 0.2),
#     #     'use_batch_norm': [True, False],
#     #     'use_residual': [True, False]
#     # }
    
#     # Test each parameter independently
#     for param_name, param_values in param_ranges.items():
#         for value in param_values:
#             # Create a copy of base parameters
#             current_params = deepcopy(base_params)
            
#             # Update the current parameter
#             current_params[param_name] = value
            
#             # Update the global params dictionary
#             params['neuralnetwork'] = current_params
            
#             print(f"\nTesting {param_name}: {value}")
            
#             # Train model and get validation score
#             cv_score = train_model(model_type='neuralnetwork')
            
#             # Store results
#             result = {
#                 'parameter': param_name,
#                 'value': str(value),  # Convert to string for consistent CSV storage
#                 'cv_score': cv_score,
#                 # 'config': json.dumps(current_params)
#             }
#             results.append(result)
            
#             # Save intermediate results
#             # pd.DataFrame(results).to_csv('parameter_sweep_results.csv', index=False)
            
#     return results

# # Run parameter sweep
# results = parameter_sweep()
# res_df = pd.DataFrame(results)
# res_df

In [None]:
# res_df.to_csv('parameter_sweep_results.csv', index=False)