# Step 1: Load the Data

In [1]:
import pandas as pd

# Read the segment-mapping records; the relative path is resolved against the
# directory the notebook is run from.
DATA_PATH = 'data/segment_mapping_data.csv'
data = pd.read_csv(DATA_PATH)

# Step 2: Explore the Data

In [2]:
# Plain-text overview of the frame: shape, first rows, summary statistics,
# dtypes, per-column null counts and the number of duplicated rows.
# A dashed rule is printed between consecutive sections.
RULE = '-' * 50

sections = (
    (None, data.shape),
    (None, data.head()),
    (None, data.describe()),
    (None, data.dtypes),
    ('Checking for missing values:', data.isnull().sum()),
    ('Checking for duplicates:', data.duplicated().sum()),
)

for position, (caption, summary) in enumerate(sections):
    if position:
        print(RULE)
    if caption is not None:
        print(caption)
    print(summary)

(12479, 9)
--------------------------------------------------
        normalized_data                                        raw_channel  \
0         Tour Operator             4U TRAVEL SRL -HOTELTURIST S.P.A.-INAW   
1      Transient Direct  Accademia Toscana Del Benessere-Accademia Tosc...   
2  Groups Tour Operator  Action Advanced Communication-Action Advanced ...   
3      Transient Direct            Adria Congrex Srl-Adria Congrex Srl-IND   
4                  MICE           Adria Congrex Srl-Adria Congrex Srl-MIAD   

   raw_rate         name  chain_name  average_rooms  average_revenue  \
0  ITIR-HBD  TH Tirrenia  TH Resorts       1.000000        113.55500   
1    TIR-BB  TH Tirrenia  TH Resorts       1.875000        133.86125   
2    TIR-BB  TH Tirrenia  TH Resorts      13.333333       1196.94000   
3    TIR-BB  TH Tirrenia  TH Resorts      12.000000        823.80000   
4    TIR-BB  TH Tirrenia  TH Resorts      18.000000       1404.00000   

   average_guests  alos  
0       26

# Step 3: Data Preprocessing
## Separate features from target

In [3]:
# 'normalized_data' is the label column; every other column is kept as a feature.
target_column = 'normalized_data'
y = data[target_column]
X = data.drop(columns=[target_column])

## Standardizing target classes
Some classes in the target are really the same class, but because their strings differ slightly (trailing whitespace or letter case), they would be treated as different classes. Compare 'GROUPS SPORT' and 'GROUPS SPORT ', or 'Complimentary' and 'COMPLIMENTARY':

In [4]:
# Show the distinct raw label strings so near-duplicate spellings are visible.
distinct_labels = y.unique()
print(distinct_labels)

['Tour Operator' 'Transient Direct' 'Groups Tour Operator' 'MICE' 'Groups'
 'Transient' 'TransientV' 'DayUse' 'Corporate Dynamic' 'Negotiated'
 'Groups ' 'Not Mapped' 'FIT' 'Tour Operator ' 'Transient '
 'Groups Direct' 'TRANSIENT' 'NEGOTIATED' 'RYANAIR'
 'CORPORATE CONV. DIRETTO' 'FIX - TBD' 'GROUPS' 'HOUSE USE' 'EASYJET'
 'Complimentary' 'Staff' 'CORPORATE' 'CORPORATE_DIRECT ' 'COMPLIMENTARY'
 'TBD' 'SPECIAL' 'Transient DIrect' 'GROUPS SPORT' 'GROUPS SPORT ']


Let's trim and lowercase the target classes to standardize them:

In [5]:
# Remove surrounding whitespace, then fold to lower case, so spelling variants
# of the same label collapse into a single string.
trimmed_labels = y.str.strip()
y = trimmed_labels.str.lower()
print(y.unique())

['tour operator' 'transient direct' 'groups tour operator' 'mice' 'groups'
 'transient' 'transientv' 'dayuse' 'corporate dynamic' 'negotiated'
 'not mapped' 'fit' 'groups direct' 'ryanair' 'corporate conv. diretto'
 'fix - tbd' 'house use' 'easyjet' 'complimentary' 'staff' 'corporate'
 'corporate_direct' 'tbd' 'special' 'groups sport']


## Handling missing values

In [6]:
# Replace missing entries in these three text columns with an explicit
# 'Unknown' placeholder ('name' and the numeric columns are left untouched here).
for column in ('raw_channel', 'raw_rate', 'chain_name'):
    X[column] = X[column].fillna('Unknown')

# Display the dataset after filling missing values
remaining_nulls = X.isnull().sum()
print(remaining_nulls)

raw_channel        0
raw_rate           0
name               0
chain_name         0
average_rooms      0
average_revenue    0
average_guests     0
alos               0
dtype: int64


## Handling features based on their types

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Column groups, split by how each group is prepared for the model
numerical_features = ['average_rooms', 'average_revenue', 'average_guests', 'alos']
categorical_features = ['raw_channel', 'raw_rate', 'name', 'chain_name']

# Numeric columns are standardized
numerical_transformer = StandardScaler()

# Categorical columns are one-hot encoded; handle_unknown='ignore' means a
# category not seen during fit does not raise at transform time
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Apply each transformer to its own column group
column_steps = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

## Encode target variable

In [8]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the cleaned target, then apply it.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

n_classes = len(label_encoder.classes_)
print('Amount of classes:', n_classes)

Amount of classes: 25


## Split the data into training and test sets

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is repeatable. No `stratify` argument is passed.
split_parts = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Step 4: Train a Model

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Chain the column preprocessing with a seeded random forest so both are fitted
# together on the training rows only.
forest = RandomForestClassifier(random_state=42)
pipeline = Pipeline(steps=[('preprocessing', preprocessor), ('classifier', forest)])

# Left as the cell's last expression so the fitted pipeline is displayed.
pipeline.fit(X_train, y_train)

# Step 5: Evaluation

In [11]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

# Get predictions with confidence probabilities.
# Each row holds one probability per class known to the fitted classifier; the
# column order follows pipeline.classes_.
train_probabilities = pipeline.predict_proba(X_train)
test_probabilities = pipeline.predict_proba(X_test)

# Convert probabilities to predictions.
# argmax returns a COLUMN position, not an encoded label. The two only coincide
# when every encoded class occurs in y_train, so map positions through classes_.
model_classes = pipeline.classes_
train_predictions = model_classes[np.argmax(train_probabilities, axis=1)]
test_predictions = model_classes[np.argmax(test_probabilities, axis=1)]

# Confidence of a prediction = the highest class probability in its row
test_confidences = test_probabilities.max(axis=1)

# Evaluation Metrics
train_accuracy = accuracy_score(y_train, train_predictions)
print(f'Training Accuracy: {train_accuracy * 100:.2f}%')
test_accuracy = accuracy_score(y_test, test_predictions)
print(f'Testing Accuracy: {test_accuracy * 100:.2f}%')

# The report is restricted to classes present in y_test. The display names are
# looked up from those same labels so that every row carries the name of its
# own class (passing all encoder classes here would shift the names whenever a
# class is absent from y_test).
report_labels = np.unique(y_test)
report_names = label_encoder.inverse_transform(report_labels)
print("\nClassification Report:\n", classification_report(y_test, test_predictions, labels=report_labels, target_names=report_names, zero_division=0))

# Example of how to use the confidence probabilities.
# Only the first few rows are printed to keep the cell output readable;
# raise N_EXAMPLES to see more.
N_EXAMPLES = 10
predicted_names = label_encoder.inverse_transform(test_predictions[:N_EXAMPLES])
for predicted_name, confidence in zip(predicted_names, test_confidences[:N_EXAMPLES]):
    print(f'Prediction: {predicted_name}, Confidence: {confidence:.2f}')

# Applying a confidence threshold (e.g., 0.75)
confidence_threshold = 0.75
high_confidence_predictions = [(pred, conf) for pred, conf in zip(test_predictions, test_confidences) if conf >= confidence_threshold]
low_confidence_predictions = [(pred, conf) for pred, conf in zip(test_predictions, test_confidences) if conf < confidence_threshold]

print(f"\nNumber of high-confidence predictions: {len(high_confidence_predictions)}")
print(f"Number of low-confidence predictions: {len(low_confidence_predictions)}")

Training Accuracy: 100.00%
Testing Accuracy: 90.02%

Classification Report:
                          precision    recall  f1-score   support

          complimentary       0.00      0.00      0.00         2
              corporate       0.00      0.00      0.00         1
corporate conv. diretto       1.00      0.50      0.67         2
      corporate dynamic       0.00      0.00      0.00         1
       corporate_direct       1.00      0.75      0.86        16
                 dayuse       0.80      0.55      0.65        64
                easyjet       0.00      0.00      0.00         1
                    fit       1.00      1.00      1.00         3
              fix - tbd       0.71      0.74      0.73       156
                 groups       0.82      0.63      0.71        51
          groups direct       0.91      0.90      0.91       483
           groups sport       0.93      0.97      0.95      1038
   groups tour operator       0.92      0.92      0.92       595
            



Best accuracies:

Training Accuracy: 100.00%

Testing Accuracy: 90.02%