In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_curve, auc, confusion_matrix
)
from sklearn.svm import OneClassSVM
from sklearn.manifold import TSNE
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier
)
from xgboost import XGBClassifier  
#import xgboost
from sklearn.linear_model import LogisticRegression
import os
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")


In [None]:
class PREPROCESSING:
    '''

    Preprocessing pipeline for the train, item and test datasets.

    Each stage is a method that stores its result on the instance, so every
    intermediate frame can be inspected after `process` has run.
    All statistics that are "learned" (label codes, imputation values, scaler
    parameters, selected features) are fitted on the train set and then applied
    to the test set, so both sets end up in the same feature space.

    '''

    # Statistic used to impute each item column that is missing after the left join
    IMPUTATION_STRATEGY = {
        'star': 'mode',
        'num_rating': 'median',
        'avg_rating': 'median',
        'type_id': 'mode',
        'city_id': 'mode',
    }

    # Columns that are frequency encoded by default
    DEFAULT_ID_COLUMNS = ['user_id','search_id','item_id','city_id','type_id','platform','item_position']

    def __init__(self,train_data_path, item_data_path,test_data_path):
        '''

        Load the three datasets from CSV files.

        train_data_path : path of the train csv
        item_data_path  : path of the csv with additional item information
        test_data_path  : path of the test csv

        '''
        self.df_train = pd.read_csv(train_data_path)
        self.df_item = pd.read_csv(item_data_path)
        self.df_test = pd.read_csv(test_data_path)

    def handle_duplicates_complete(self):
        '''

        Drop fully duplicated rows from the train, item and test datasets

        '''
        self.df_train_duplicate_handled = self.df_train.drop_duplicates()
        self.df_item_duplicate_handled = self.df_item.drop_duplicates()
        self.df_test_duplicate_handled = self.df_test.drop_duplicates()

    def check_missing_value(self):
        '''

        Count the missing values per column in each de-duplicated dataset

        '''
        self.df_train_missing_values = self.df_train_duplicate_handled.isnull().sum()
        self.df_item_missing_values = self.df_item_duplicate_handled.isnull().sum()
        self.df_test_missing_values = self.df_test_duplicate_handled.isnull().sum()

    def transform_date(self):
        '''

        Transform object to datetime type for 'ymd'

        '''
        self.df_train_date_transformed = self.df_train_duplicate_handled.copy()
        self.df_train_date_transformed['ymd'] = pd.to_datetime(self.df_train_duplicate_handled['ymd'])
        self.df_test_date_transformed = self.df_test_duplicate_handled.copy()
        self.df_test_date_transformed['ymd'] = pd.to_datetime(self.df_test_duplicate_handled['ymd'])

    def encode_categorical_features(self):
        '''

        Label encode 'platform' because it has 13 unique values
        and
        map 'device' with 0 or 1 because it has just 2 unique values 'DESKTOP' and 'MOBILE'

        The label encoder is fitted on the train set only; the test set is mapped
        with the same codes so a platform has the same code in both sets.
        Platforms that only appear in the test set are encoded as -1.

        '''
        label_encoder = LabelEncoder()
        self.df_train_encoded = self.df_train_date_transformed.copy()
        self.df_test_encoded = self.df_test_date_transformed.copy()
        self.df_train_encoded['platform'] = label_encoder.fit_transform(self.df_train_encoded['platform'])
        self.platform_encoder = label_encoder
        # LabelEncoder.transform raises on unseen labels, so map through the learned classes instead
        platform_codes = {label: code for code, label in enumerate(label_encoder.classes_)}
        self.df_test_encoded['platform'] = (
            self.df_test_encoded['platform'].map(platform_codes).fillna(-1).astype(int)
        )
        binary_mapping ={'MOBILE':0,'DESKTOP':1}
        self.df_train_encoded['device'] =self.df_train_encoded['device'].map(binary_mapping)
        self.df_test_encoded['device']=self.df_test_encoded['device'].map(binary_mapping)

    def _merge_with_items(self, df, how):
        '''

        Join `df` with the de-duplicated item data on 'item_id' and add 'day_of_week'

        '''
        merged = pd.merge(df, self.df_item_duplicate_handled, on='item_id', how=how)
        merged['ymd'] = pd.to_datetime(merged['ymd'], errors='coerce')
        merged['day_of_week'] = merged['ymd'].dt.dayofweek
        return merged

    def merge_data(self):
        '''

        Merge Train data with additional information provided by item data with inner join to have complete set and
        left join to include all item_id in train and test set.

        '''
        # Known items alone in train (inner join, no missing item information)
        self.df_merged_with_known_item_info = self._merge_with_items(self.df_train_encoded, how='inner')
        # All items in train (left join, unknown items get missing item information)
        self.df_merged_train = self._merge_with_items(self.df_train_encoded, how='left')
        # All items in test
        self.df_merged_test = self._merge_with_items(self.df_test_encoded, how='left')

    def check_feature_importance(self):
        '''

        Check feature importance using Random Forest with known item info in train set

        '''
        self.df_features = self.df_merged_with_known_item_info.drop(columns=['is_clicked_item','ymd'])
        self.df_truth = self.df_merged_with_known_item_info['is_clicked_item']
        rf = RandomForestClassifier(n_estimators=100, random_state = 42)
        rf.fit(self.df_features,self.df_truth)
        self.importance = rf.feature_importances_

    def missing_values_after_merging(self):
        '''

        Collect the item_id of the train rows that have no item information ('star' is missing)

        '''
        no_item_info = self.df_merged_train['star'].isnull()
        self.missing_items = self.df_merged_train.loc[no_item_info, 'item_id']

    def handle_missing_values(self):
        '''

        Handle the missing values after merging the information regarding item

        Mode is used for 'star', 'type_id' and 'city_id', median for 'num_rating' and 'avg_rating'.
        The fill values are computed on the train set and applied to both the train
        and the test set. They are kept as `train_<column>_<statistic>` attributes;
        the `test_<column>_<statistic>` attributes hold the value applied to the test set.

        '''
        self.df_merged_train_missing_value_handled = self.df_merged_train.copy()
        self.df_merged_test_missing_value_handled = self.df_merged_test.copy()
        for column, statistic in self.IMPUTATION_STRATEGY.items():
            train_values = self.df_merged_train[column]
            if statistic == 'mode':
                modes = train_values.mode()
                # mode() is empty when the whole column is missing; leave the gaps in that case
                fill_value = modes.iloc[0] if not modes.empty else np.nan
            else:
                fill_value = train_values.median()
            setattr(self, f'train_{column}_{statistic}', fill_value)
            setattr(self, f'test_{column}_{statistic}', fill_value)
            self.df_merged_train_missing_value_handled[column] = (
                self.df_merged_train_missing_value_handled[column].fillna(fill_value)
            )
            self.df_merged_test_missing_value_handled[column] = (
                self.df_merged_test_missing_value_handled[column].fillna(fill_value)
            )

    def scale_numerical_features(self):
        '''

        Scale numerical features using standardscaler

        The scaler is fitted on the train set and only applied (transform) to the test set.

        '''
        self.num_columns= ['price','num_rating','avg_rating']
        # Train set
        self.df_scaled = self.df_merged_train_missing_value_handled.copy()
        self.df_num_data= self.df_merged_train_missing_value_handled[self.num_columns]
        self.scaler = StandardScaler()
        self.scaled_num_data = self.scaler.fit_transform(self.df_num_data)
        # Assign the array directly so the values are written by position, not re-aligned on the index
        self.df_scaled[self.num_columns] = self.scaled_num_data

        # Test set
        self.df_test_scaled = self.df_merged_test_missing_value_handled.copy()
        self.df_test_num_data= self.df_merged_test_missing_value_handled[self.num_columns]
        self.scaled_test_num_data = self.scaler.transform(self.df_test_num_data)
        self.df_test_scaled[self.num_columns] = self.scaled_test_num_data

    @staticmethod
    def _frequency_encode_frame(df, columns):
        '''

        Return a copy of df[columns] where every value is replaced by its relative frequency in df

        '''
        encoded = df[columns].copy()
        for col in columns:
            freq = encoded[col].value_counts() / len(encoded)
            encoded[col] = encoded[col].map(freq).astype('float64')
        return encoded

    def frequency_encode(self,id_columns=None,df=None):
        '''

        Encode all id's with frequency encoding because of high cardinality

        Also encode 'platform' and 'item_position' which are having high cardinality

        id_columns : columns to encode, defaults to DEFAULT_ID_COLUMNS
        df         : not used, only kept so that existing calls with two arguments keep working

        Frequencies are computed separately inside the train set and inside the test set.

        '''
        self.id_columns = list(self.DEFAULT_ID_COLUMNS if id_columns is None else id_columns)
        # Train set
        self.df_id_data = self._frequency_encode_frame(self.df_scaled, self.id_columns)
        # Replace whole columns so the dtype can change from int to float
        self.df_scaled[self.id_columns] = self.df_id_data
        # Test set
        self.df_test_id_data = self._frequency_encode_frame(self.df_test_scaled, self.id_columns)
        self.df_test_scaled[self.id_columns] = self.df_test_id_data

    def scale_star(self):
        '''

        'star' feature is following a right skewed distribution, so it is better to scale it using log transformation

        '''
        # Train set
        self.df_scaled['star'] = (np.log(self.df_scaled['star']+1) -1)
        # Test set
        self.df_test_scaled['star'] = (np.log(self.df_test_scaled['star']+1)-1)

    def scale_day_of_week(self):
        '''

        scale day of week using cyclical encoding

        The sine and cosine are shifted and halved so both new columns lie in [0, 1].

        '''
        period = 7
        for frame in (self.df_scaled, self.df_test_scaled):
            angle = 2 * np.pi * frame['day_of_week'] / period
            frame['day_of_week_sin'] = (np.sin(angle) + 1) / 2
            frame['day_of_week_cos'] = (np.cos(angle) + 1) / 2

    def remove_correlated(self,threshold):
        '''

        Remove the correlated columns

        threshold : a column is dropped when its absolute pearson correlation with
                    any column that comes before it is larger than this value

        '''
        # Train set
        self.df_scaled_features = self.df_scaled.drop(columns=['ymd','is_clicked_item','day_of_week'])
        self.df_label = self.df_scaled.loc[:,['is_clicked_item']]
        self.df_corr = self.df_scaled_features.corr(method='pearson', min_periods=1)
        # Hide the diagonal and the lower triangle so each pair of columns is looked at once
        lower_triangle = np.tril(np.ones(self.df_corr.shape, dtype=bool))
        upper_corr = self.df_corr.mask(lower_triangle).abs()
        self.df_not_correlated = ~(upper_corr > threshold).any()
        self.un_corr_idx = self.df_not_correlated[self.df_not_correlated].index
        # copy() so that adding the label column does not write into a slice of df_scaled_features
        self.df_uncorr = self.df_scaled_features[self.un_corr_idx].copy()
        self.df_uncorr['is_clicked_item']=self.df_label['is_clicked_item']

        # Test
        self.df_test_scaled_features = self.df_test_scaled.copy()
        self.df_test_uncorr = self.df_test_scaled_features[self.un_corr_idx]

    def drop_feature_with_less_importance(self , feature_imp_threshold):
        '''

        Drop the feature with less importance

        feature_imp_threshold : features with an ExtraTrees importance below this value are dropped

        '''
        # Train set
        self.X = self.df_uncorr.drop(columns=['is_clicked_item'])
        self.y = self.df_uncorr[['is_clicked_item']]
        # Fixed seed so the selected features are the same on every run
        clf = ExtraTreesClassifier(n_estimators=50, random_state=42)
        clf = clf.fit(self.X, self.y.values.ravel())
        self.feature_imp = clf.feature_importances_
        self.model = SelectFromModel(clf, prefit=True, threshold= feature_imp_threshold)
        self.X_new = self.model.transform(self.X)

    def optimal_feature(self):
        """
        Retain the features with importance
        """
        # Train set
        self.selected_feature_indices = self.model.get_support()
        self.selected_feature_names = self.X.columns[self.selected_feature_indices]
        # Keep the index of X so the features stay aligned with df_label
        self.df_independent_feature = pd.DataFrame(
            data=self.X_new, columns=self.selected_feature_names, index=self.X.index
        )

        # Test set
        self.df_test_independent_feature = self.df_test_uncorr.loc[:,self.selected_feature_names]

    def process(self):
        '''

        Process all the methods inside the class

        Returns the selected train features, the train labels and the selected test features.

        '''
        self.handle_duplicates_complete()
        self.check_missing_value()
        self.transform_date()
        self.encode_categorical_features()
        self.merge_data()
        self.check_feature_importance()
        self.missing_values_after_merging()
        self.handle_missing_values()
        self.scale_numerical_features()
        self.frequency_encode()
        self.scale_star()
        self.scale_day_of_week()
        self.remove_correlated(threshold = 0.9)
        self.drop_feature_with_less_importance(feature_imp_threshold=0.005)
        self.optimal_feature()
        return self.df_independent_feature, self.df_label, self.df_test_independent_feature

In [None]:
# Build the preprocessing pipeline from the three CSV files and run every stage.
PP = PREPROCESSING(
    train_data_path='train.csv',
    item_data_path='item.csv',
    test_data_path='test.csv',
)
(df_independent_feature, df_label, df_test_independent_feature) = PP.process()

### 0. All the datasets are loaded 

In [None]:
# First rows of the raw train data as loaded from the CSV file
PP.df_train.head()


In [None]:
# First rows of the raw item data as loaded from the CSV file
PP.df_item.head()

### 1. No duplicates are found in the given dataset

### 2. Missing values are checked in all the datasets

#### a.Missing values in Train Data

In [None]:
# Bar chart of the per-column missing-value counts in the train set
missing_values_train = PP.df_train_missing_values
fig, ax = plt.subplots(figsize=(10, 6))
missing_values_train.plot(kind='bar', color='blue', ax=ax)
ax.set_xlabel('Columns')
ax.set_ylabel('Number of Missing Values in train set')
ax.set_title('Missing Values in Each Column in train set')
plt.show()

#### b.Missing values in Item Data

In [None]:
# Bar chart of the per-column missing-value counts in the item data
missing_values_item = PP.df_item_missing_values
fig, ax = plt.subplots(figsize=(10, 6))
missing_values_item.plot(kind='bar', color='blue', ax=ax)
ax.set_xlabel('Columns')
ax.set_ylabel('Number of Missing Values in item data')
ax.set_title('Missing Values in Each Column in additional information about item')
plt.show()

#### c. Missing values in Test set

In [None]:
# Bar chart of the per-column missing-value counts in the test set
missing_values_test = PP.df_test_missing_values
fig, ax = plt.subplots(figsize=(10, 6))
missing_values_test.plot(kind='bar', color='blue', ax=ax)
ax.set_xlabel('Columns')
ax.set_ylabel('Number of Missing Values in test set')
ax.set_title('Missing Values in Each Column in test set')
plt.show()

### 3. Transformed 'ymd' column to datetime format

### 4. Encoded categorical features in the train set and the test set. 'platform' is encoded using label encoding and 'device' is encoded using a binary mapping.

### 5. Merged item information with the train set using a left join and added an additional column called 'day_of_week', which can be important when booking accommodation. Not every item_id listed in the train set is present in item.csv, so the left join results in some missing values.

### 6. Check the importance of features using an inner join of train.csv and item.csv, which has no missing values. This helps us check whether all columns are important enough to be considered.

In [None]:
# Pair every feature used by the random forest with its importance, most important first
importance_by_feature = {
    'Feature': PP.df_features.columns,
    'Importance': PP.importance,
}
feature_importance_df = pd.DataFrame(importance_by_feature)
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_df)

### The results show that it is better to consider all columns at the beginning

### 7.Check for missing values which resulted after merging with item.csv

In [None]:
# Bar chart of the per-column missing-value counts after the left join with the item data
missing_values = PP.df_merged_train.isnull().sum()
fig, ax = plt.subplots(figsize=(10, 6))
missing_values.plot(kind='bar', color='blue', ax=ax)
ax.set_xlabel('Columns')
ax.set_ylabel('Number of Missing Values')
ax.set_title('Missing Values in Each Column')
plt.show()

#### The figure shows that around 20% of the data in 'avg_rating', 'star', 'num_rating', 'type_id' and 'city_id' is missing. It is not a good idea to drop columns with missing values since that can result in information loss. So it is better to handle the missing values.

### 8. Handle missing values
### 'avg_rating' and 'num_rating' are continuous values, so it is better to impute missing values in these columns using the median.
### 'star' is an ordinal value so it is better to use mode value for imputing missing values.
### 'type_id' and 'city_id' are unique identifiers so we have to check how the data is distributed in those two cases and decide on which method to use

In [None]:
# Count plot of 'type_id' over the train rows whose item information is known
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='type_id', data=PP.df_merged_with_known_item_info, ax=ax)
ax.set_title('Distribution of Type ID')
ax.set_xlabel('Type ID')
ax.set_ylabel('Frequency')
plt.show()




### 'type_id' clearly have a dominant mode so it is better to impute missing values using mode


In [None]:
# Count plot of 'city_id' over the train rows whose item information is known
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='city_id', data=PP.df_merged_with_known_item_info, ax=ax)
ax.set_title('Distribution of City ID')
ax.set_xlabel('City ID')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Number of train rows per 'city_id' after the missing values were imputed
PP.df_merged_train_missing_value_handled['city_id'].value_counts()

### In the case of 'city_id', even though it does not have a dominant mode (it is a unique identifier with around 6846 unique values), it is better to go with the mode value in this case as well

#### 'type_id' clearly has mode 2; since it is a unique identifier, it is better to fill the missing values in this column with the mode

### 9. Check Imbalance in 'is_clicked_item'

In [None]:
# Number of train rows for each value of the label 'is_clicked_item'
count_by_clicked_item = PP.df_merged_train_missing_value_handled.groupby(['is_clicked_item']).size()
count_by_clicked_item

In [None]:
# Pie chart with the share of each label value
ax = count_by_clicked_item.plot(kind='pie', autopct='%1.1f%%')
ax.set_title('Percentage of Class Imbalance')
plt.show()

### The data is clearly imbalanced. This has to be considered while training the model. Sampling methods could also be used, but they might result in information loss or lead to overfitting, so it is better to take care of the imbalance while training.

### 

### 10 Scale Data

#### Check the cardinality of each feature

In [None]:

# Number of distinct values per column (cardinality)
PP.df_merged_train_missing_value_handled.nunique()

#### Check data type

In [None]:
# Data type of every column
PP.df_merged_train_missing_value_handled.dtypes

#### Methods used to scale features

### a. 'price', 'avg_rating','num_rating' are only completely numerical features so standardscaler can be used to scale these features


### b. 'user_id', 'search_id', 'item_id', 'type_id' and 'city_id' are unique identifiers. 'platform' and 'item_position' are also high-cardinality features, so all these features can be frequency encoded rather than scaled.

### c. 'star' clearly is an ordinal feature, first visualize how star rating is distributed

In [None]:
# Histogram of the star ratings with one bin per star value
bins = [0, 1, 2, 3, 4, 5, 6]
star_values = PP.df_merged_train_missing_value_handled['star']

fig, ax = plt.subplots()
ax.hist(star_values, bins=bins, edgecolor='black')
ax.set_xlabel('Star Ratings')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Star Ratings')
plt.show()


### It is right skewed, so it is better to do a log transformation here to compress the range

### d. 'search_type' and 'device' are manageable features with low cardinality, so it is better to use one-hot encoding, which prevents the model from assuming any ordinal relationship; frequency encoding can also be used.

### e. 'day_of_week' is scaled using cyclical encoding in order to capture its cyclical nature

In [None]:
# First rows of the train data after all scaling and encoding steps
PP.df_scaled.head()

### 11. Feature selection

In [None]:
# One figure per column with three panels: box plot, box plot with the single
# points on top, and histogram with density curve.
# NOTE(review): the values plotted are taken from PP.df_corr, i.e. the correlation
# coefficients of each column, not the feature values themselves - confirm this is intended.
for column in PP.df_corr.columns:
    column_values = PP.df_corr[column]
    fig, (ax_values, ax_outliers, ax_skew) = plt.subplots(1, 3, figsize=(10, 4))

    # Plotting the column values
    sns.boxplot(column_values, ax=ax_values)
    ax_values.set_title(column)

    # Plotting the outliers
    sns.boxplot(column_values, ax=ax_outliers)
    sns.stripplot(column_values, color='red', marker='o', alpha=0.5, ax=ax_outliers)
    ax_outliers.set_title("Outliers")

    # Plotting the skewness
    sns.histplot(column_values, kde=True, ax=ax_skew)
    ax_skew.set_title("Skewness")
    ax_skew.set_xlabel(column)

    fig.tight_layout()
    plt.show()

### 11.2 Check Correlation between features

#### Separate all scaled features( remove 'ymd','day_of_week') and labels ('is_clicked_item')

In [None]:
# Feature columns left after removing 'ymd', 'day_of_week' and the label
PP.df_scaled_features.columns

In [None]:
# Column(s) of the label frame
PP.df_label.columns

### Correlated features

In [None]:
# PP.df_corr already is the pearson correlation matrix of the scaled features,
# so it is plotted as is (calling .corr() on it again would show the correlation
# between the columns of the correlation matrix instead).
corr_matrix = PP.df_corr
ax = sns.heatmap(corr_matrix, annot=False, cmap='coolwarm')
ax.set_title('Correlation Matrix')
plt.show()

### Correlation between features removed

In [None]:
# Correlation matrix of the features that are kept (last column is the label and is left out)
retained_features = PP.df_uncorr.iloc[:, :-1]
corr_matrix = retained_features.corr()
ax = sns.heatmap(corr_matrix, annot=False, cmap='coolwarm')
ax.set_title('Correlation Matrix')
plt.show()

### 12. Check correlation with ground truth here 'is_clicked_item'

In [None]:
# One 2x2 heatmap per retained feature showing its correlation with the label
features = PP.df_uncorr.columns
columns = [name for name in features if name != 'is_clicked_item']
for feature in columns:
    pair_corr = PP.df_uncorr[[feature, 'is_clicked_item']].corr()
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(pair_corr, annot=True, cmap='coolwarm', ax=ax)
    ax.set_title(f'Correlation Chart for {feature} and click')
    plt.show()

### 13. Drop features which are less important

In [None]:
# Horizontal bar chart of the ExtraTrees importances, least important at the bottom.
# The names are read from PP.X, the frame the importances were fitted on, so this
# cell does not depend on a variable left over from another cell.
features = list(PP.X.columns)
importances = PP.feature_imp
indices = np.argsort(importances)
positions = range(len(indices))

fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title('Feature Importances')
ax.barh(positions, importances[indices], color='b', align='center')
ax.set_yticks(positions)
ax.set_yticklabels([features[i] for i in indices])
ax.set_xlabel('Relative Importance')
plt.show()

### 14. Select Optimal features 

In [None]:
# First rows of the selected train features
PP.df_independent_feature.head()

### With this Preprocessing is completed and the class returns df_independent_feature, df_label and df_test_independent_feature

In [None]:
# Label frame returned by PP.process()
df_label

## Model Selection

In [None]:

class Model_Selection:
    """
    Outlier removal, model selection by grid search and evaluation of the best model.

    The data is split 80/20 into a training part and a hold-out part with a fixed
    seed; every method that needs the split uses the same one.
    """

    def __init__(self, df_independent_feature, df_label):
        """
        Initialize with independent features and labels.

        df_independent_feature : DataFrame with the selected features
        df_label               : DataFrame with the column 'is_clicked_item'
        """
        self.df_independent_feature = df_independent_feature
        self.df_label = df_label

    def _split(self):
        """
        Return X_train, X_test, y_train, y_test of the outlier-free data (80/20, fixed seed).
        """
        return train_test_split(
            self.df_independent_no_out,
            self.df_label_no_out,
            test_size=0.2,
            random_state=42
        )

    def outlier_detection_ocsvm(self):
        """
        Detect outliers using One-Class SVM and remove them from the dataset.

        The features are first reduced to two dimensions with t-SNE; rows that the
        SVM predicts as 1 (inlier) are kept in both the features and the labels.
        """
        self.x_reduced = TSNE(n_components=2, random_state=0).fit_transform(self.df_independent_feature)
        svm = OneClassSVM(nu=0.005, gamma=1e-04)
        svm.fit(self.x_reduced)
        self.x_predicted = svm.predict(self.x_reduced)

        # Remove outliers
        inliers = self.x_predicted == 1
        self.df_independent_no_out = self.df_independent_feature[inliers]
        self.df_label_no_out = self.df_label[inliers]

    def grid_search(self):
        """
        Perform grid search to find the best model and hyperparameters.

        Every candidate is tuned with 5-fold cross-validation on the training part,
        scored by ROC AUC. The result is stored per model in `model_results`; note
        that "train_accuracy" holds the mean cross-validated ROC AUC and
        "test_accuracy" the ROC AUC on the hold-out part (the key names are kept
        for compatibility). The ROC curve points are stored in `roc_curves`.
        """
        X_train, X_test, y_train, y_test = self._split()
        y_train = y_train['is_clicked_item'].astype(int)
        y_test_true = y_test['is_clicked_item'].values.flatten()

        # Ratio negatives / positives, used by XGBoost to compensate the class imbalance
        n_negative = int((y_train == 0).sum())
        n_positive = int((y_train == 1).sum())
        scale_pos_weight = n_negative / n_positive if n_positive else 1.0

        # Define parameter grids for models (seeded so the search is reproducible)
        param_grids = {
            'Random Forest': {
                'model': RandomForestClassifier(random_state=42),
                'params': {
                    'min_samples_split': [10, 20, 30],
                    'n_estimators': [100, 200],
                    'max_depth': [4, 5, 6, 7],
                    'class_weight': ['balanced']
                }
            },
            'Gradient Boosting': {
                'model': GradientBoostingClassifier(random_state=42),
                'params': {
                    'n_estimators': [100, 200],
                    'learning_rate': [0.01, 0.1],
                    'max_depth': [3, 4, 5],
                    'min_samples_split': [2, 5]
                }
            },
            'AdaBoost': {
                'model': AdaBoostClassifier(random_state=42),
                'params': {
                    'n_estimators': [50, 100],
                    'learning_rate': [1.0, 0.5]
                }
            },
            'XGBoost': {
                'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
                'params': {
                    'n_estimators': [50, 100, 150, 200, 250, 500],
                    'max_depth': [2, 3, 4, 6, 7, 8],
                    'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],
                    'gamma': [0, 5, 10, 20, 50, 100],
                    'scale_pos_weight': [scale_pos_weight]
                }
            }
        }

        self.model_results = {}
        self.roc_curves = {}

        for model_name, config in param_grids.items():
            print(f"Performing grid search for {model_name}")
            grid_search = GridSearchCV(config['model'], config['params'], cv=5, scoring='roc_auc')
            grid_search.fit(X_train, y_train)

            best_model = grid_search.best_estimator_
            y_pred_proba = best_model.predict_proba(X_test)[:, 1]

            fpr, tpr, _ = roc_curve(y_test_true, y_pred_proba)
            test_accuracy = auc(fpr, tpr)

            # Store results
            self.model_results[model_name] = {
                "best_model": best_model,
                "params": grid_search.best_params_,
                "test_accuracy": test_accuracy,
                "train_accuracy": grid_search.best_score_  # Average score from cross-validation
            }
            self.roc_curves[model_name] = {'fpr': fpr, 'tpr': tpr}

    def calculate_metrics(self, y_true, y_pred):
        """
        Calculate evaluation metrics.

        y_true, y_pred : array-likes with the true and the predicted class labels
                         (arrays, Series or single-column DataFrames)

        Returns accuracy, precision, recall and F1 score, in this order.
        """
        # np.asarray(...).ravel() also works for pandas objects, which have no .flatten()
        y_true = np.asarray(y_true).ravel()
        y_pred = np.asarray(y_pred).ravel()
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        return accuracy, precision, recall, f1

    def best_model_score(self):
        """
        Evaluate the best model after grid search.

        The model with the highest hold-out ROC AUC is fitted again on the training
        part and evaluated on the hold-out part. The metrics are stored in
        `metrics_df`; the hold-out labels (`y_true`) and the predicted class
        probabilities (`y_pred_prob`, one column per class) are stored for
        `thresholding` and `perform_threshold`.
        """
        if not getattr(self, 'model_results', None):
            raise ValueError("No model results available, run grid_search() first")

        X_train, X_test, y_train, y_test = self._split()

        best_model_name = max(self.model_results.keys(), key=lambda k: self.model_results[k]['test_accuracy'])
        best_model = self.model_results[best_model_name]['best_model']

        # Fit the best model on training data
        best_model.fit(X_train.values, y_train.values.flatten())

        # Predictions on test set
        y_pred_test = best_model.predict(X_test.values)

        # Keep what the threshold methods need
        self.y_true = y_test
        self.y_pred_prob = best_model.predict_proba(X_test.values)

        # Calculate metrics
        accuracy, precision, recall, f1 = self.calculate_metrics(y_test['is_clicked_item'].values.flatten(), y_pred_test)

        # Store metrics in DataFrame
        self.metrics_df = pd.DataFrame({
            'Metrics': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
            'Values': [accuracy, precision, recall, f1]
        })

    def thresholding(self):
        """
        Find optimal threshold value based on evaluation metrics.

        Computes the F1 score for every threshold from 0.1 up to (not including) 0.5
        in steps of 0.01 and stores the result in `threshold_df`.
        Requires `best_model_score` to be run first.
        """
        y_true = self.y_true.values.flatten()
        y_pred_prob = self.y_pred_prob[:, 1]  # Second column is the probability of the positive class

        thresholds = []
        f1_scores = []
        for threshold in np.arange(0.1, 0.5, 0.01):
            y_pred = (y_pred_prob >= threshold).astype(int)
            thresholds.append(threshold)
            f1_scores.append(f1_score(y_true, y_pred))

        # Store results in DataFrame
        self.threshold_df = pd.DataFrame({'Threshold': thresholds,'F1 Score': f1_scores})

    def perform_threshold(self):
        """
        Apply a determined threshold and calculate new evaluation metrics.

        Requires `best_model_score` to be run first. The result is stored in
        `thresholded_metrics_df`.
        """
        threshold_value = 0.4

        # Perform thresholding on predictions
        y_true = self.y_true.values.flatten()
        y_pred_thresholded = (self.y_pred_prob[:, 1] >= threshold_value).astype(int)

        # Calculate metrics after applying threshold
        metrics_values = {
            'Precision': precision_score(y_true, y_pred_thresholded),
            'Recall': recall_score(y_true, y_pred_thresholded),
            'F1 Score': f1_score(y_true, y_pred_thresholded),
            'Accuracy': accuracy_score(y_true, y_pred_thresholded)
        }

        # Store results in DataFrame
        self.thresholded_metrics_df = pd.DataFrame.from_dict(metrics_values , orient='index', columns=['Score'])

    def process(self):
        """
        Execute the full process of outlier detection and model selection.
        """
        self.outlier_detection_ocsvm()
        self.grid_search()
        self.best_model_score()
        #self.thresholding()
        #self.perform_threshold()


In [None]:
# Remove outliers, run the grid search for every model and evaluate the best one
MS = Model_Selection(df_independent_feature, df_label)
MS.process()

### 15. Outlier detection and removal outliers 
### The method here uses t-SNE to reduce the dimensionality of the independent features to two dimensions. It then fits a One-Class SVM model to predict outliers. Finally, it removes the outliers from both the independent features and the labels.

### 16. Grid search to find the best model and its hyperparameters
### Data splitting:- splits the dataset into train and validation set
### Parameter grids:- define parameter grids for different models (Random Forest, Gradient Boosting, Adaboost, XGBoost)
### Grid Search:- For each model, it performs a grid search using cross-validation to find the best hyperparameters based on the ROC AUC score
### Result:- It saves the best model, parameters, test accuracy, and ROC curve for each model.

In [None]:
 X_train, X_val, y_train, y_val = train_test_split(
            MS.df_independent_no_out,
            MS.df_label_no_out,
            test_size=0.2,
            random_state=42
        )

In [None]:
# Best estimator, best parameters and scores of every model from the grid search
MS.model_results

In [None]:
# Pick the model whose 'test_accuracy' entry in the grid-search results is the highest
best_model_name = max(
    MS.model_results,
    key=lambda name: MS.model_results[name]['test_accuracy'],
)

best_model = MS.model_results[best_model_name]['best_model']
best_model

### 17. Evaluation Metrics
### Evaluation metrics used are accuracy, precision, recall and F1 score.
### Here evaluation is done making use of true labels in validation set and predicted labels. 

In [None]:
# Fit the best model on the training part (plain arrays: 2-D features, 1-D labels)
best_model.fit(X_train.to_numpy(), y_train.to_numpy().ravel())

In [None]:
# Predicted class labels for the validation part
y_pred_val = best_model.predict(X_val.to_numpy())

In [None]:
# Predicted probability of the positive class (second column of predict_proba) on the validation part
y_pred_prob =best_model.predict_proba(X_val)[:,1]
y_pred_prob

### 18. Best Model Evaluation
### Evaluates the best model obtained from grid search on a validation set
### Identify the best model by selecting the one with the highest score on the held-out split (stored as 'test_accuracy', which is the ROC AUC)
### Then it makes predictions on the validation set and calculate evaluation metrics
### save metrics as a dataframe 

In [None]:
# Accuracy, precision, recall and F1 score of the best model
MS.metrics_df

In [None]:
# Displays the same metrics table once more
MS.metrics_df

In [None]:
# MS.roc_curves is a dict {model name: {'fpr': ..., 'tpr': ...}} filled by the grid
# search, so it cannot be called; draw one ROC curve per model instead.
fig, ax = plt.subplots(figsize=(8, 6))
for model_name, curve in MS.roc_curves.items():
    curve_auc = auc(curve['fpr'], curve['tpr'])
    ax.plot(curve['fpr'], curve['tpr'], label=f'{model_name} (AUC = {curve_auc:.3f})')
# Diagonal = performance of a random classifier
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves')
ax.legend(loc='lower right')
plt.show()

### 19. Thresholding 
### Finds the optimal threshold value for classification based on the F1 score
### Iterate over possible threshold values
### Calculate the F1 score for each threshold
### store results in dataframe

In [None]:
# Scan thresholds from 0.01 up to (not including) 0.2 and record F1, precision and
# recall of the thresholded validation predictions.
thresholds = []
f1_scores = []
precision_scores = []
recall_scores = []
# 1-D label array; y_val is a single-column DataFrame
y_true = np.asarray(y_val).ravel()
for threshold in np.arange(0.01, 0.2, 0.01):
    y_pred = (y_pred_prob >= threshold).astype(int)
    thresholds.append(threshold)
    f1_scores.append(f1_score(y_true, y_pred))
    precision_scores.append(precision_score(y_true, y_pred))
    recall_scores.append(recall_score(y_true, y_pred))
threshold_df = pd.DataFrame({
    'Threshold': thresholds,
    'F1': f1_scores,
    'Precision': precision_scores,
    'Recall': recall_scores,
})

In [None]:
# Line plot of every score column against the threshold
threshold_df.plot(x="Threshold")

In [None]:
# Threshold with the highest F1 score in the scan
optimal_threshold = thresholds[np.argmax(f1_scores)]
optimal_threshold

### 20 Prediction in test data

### Make test data similar to training data format to make predictions

In [None]:
# Reduce the test features to two dimensions with t-SNE
test_data_reduced = TSNE(n_components=2, random_state=0).fit_transform(df_test_independent_feature)
# A new One-Class SVM is fitted on the test embedding itself (it is not the one fitted on the train data)
svm = OneClassSVM(nu=0.005, gamma=1e-04)
svm.fit(test_data_reduced)
# Predict outliers in the test embedding with the One-Class SVM fitted above
# NOTE(review): test_data_predicted is not used by any later cell in this notebook - confirm whether this cell is still needed
test_data_predicted = svm.predict(test_data_reduced)


### Make predictions on test data and predict the probability of click

In [None]:
# Probability of the positive class for every test row (second column of predict_proba)
test_class_proba = best_model.predict_proba(df_test_independent_feature)
y_test_pred_prob = test_class_proba[:, 1]
# Rows at or above the optimal threshold are classified as 1
y_test_pred_class = (y_test_pred_prob >= optimal_threshold).astype(int)

# Combine results into a DataFrame
prediction_columns = {
    'Predicted_Probability': y_test_pred_prob,
    'Predicted_Class': y_test_pred_class
}
predictions = pd.DataFrame(prediction_columns)

In [None]:
# Predicted click probabilities for the test set
y_test_pred_prob

### Save the result as csv file with columns 'user_id', 'search_id','item_id', and 'prob'

In [None]:
# Columns 1 to 3 of the raw test data identify a row; copy() so that the new column
# is added to an independent frame and not to a slice of PP.df_test.
df_test_prediction = PP.df_test.iloc[:, 1:4].copy()
assert len(df_test_prediction) == len(y_test_pred_prob), \
    "number of predictions differs from the number of rows in the raw test data"
df_test_prediction['prob'] = y_test_pred_prob
# Show a preview instead of dumping the whole CSV text into the cell output
df_test_prediction.head()

In [None]:
# Write next to the notebook instead of a machine-specific absolute path;
# index=False so the file only contains the identifier columns and 'prob'.
df_test_prediction.to_csv('prediction_result.csv', index=False)

In [None]:
# Final prediction frame
df_test_prediction