# Step 1: Load the Data

In [60]:
import pandas as pd

data = pd.read_csv('data/channel_mapping_data.csv')

# Step 2: Explore the Data

In [61]:
print(data.shape)
print('-'*50)
print(data.head())
print('-'*50)
print(data.describe())
print('-'*50)
print(data.dtypes)
print('-'*50)
print('Checking for missing values:')
print(data.isnull().sum())
print('-'*50)
print('Checking for duplicates:')
print(data.duplicated().sum())

(10006, 8)
--------------------------------------------------
  normalized_data                                           raw_data  \
0    Hotel Turist             4U TRAVEL SRL -HOTELTURIST S.P.A.-INAW   
1         Agenzie  Accademia Toscana Del Benessere-Accademia Tosc...   
2  Agenzie Gruppi  Action Advanced Communication-Action Advanced ...   
3         Agenzie            Adria Congrex Srl-Adria Congrex Srl-IND   
4    Agenzie MICE           Adria Congrex Srl-Adria Congrex Srl-MIAD   

          name  chain_name  average_rooms  average_revenue  average_guests  \
0  TH Tirrenia  TH Resorts       1.000000        113.55500       26.000000   
1  TH Tirrenia  TH Resorts       1.875000        133.86125       35.000000   
2  TH Tirrenia  TH Resorts      13.333333       1196.94000      133.333333   
3  TH Tirrenia  TH Resorts      12.000000        823.80000      120.000000   
4  TH Tirrenia  TH Resorts      18.000000       1404.00000      195.000000   

   alos  
0  7.00  
1  1.25  
2  2.0

# Step 3: Data Preprocessing
## Separate features from target

In [62]:
X = data.drop('normalized_data', axis=1)
y = data['normalized_data']

## Standardizing target classes
We have some classes in the target that are the same, but because they slightly differ as strings, will be treated as different classes. Take a look at 'Booking.com' and 'BOOKING.COM', or 'DIRECT ' and 'DIRECT':

In [63]:
print(y.unique())

['Hotel Turist' 'Agenzie' 'Agenzie Gruppi' 'Agenzie MICE' 'Gruppi Agenzie'
 'Gruppi Diretti' 'Booking.com' 'Expedia' 'G2 Travel' 'HotelBeds'
 'Jet2 Holidays' 'Diretto' 'TH Resorts' 'UVET' 'DIRECT CORPORATE' 'TUI'
 'Direct' 'Groups & MICE' 'Airbnb' 'Dayuse' 'HRS' 'Agoda' 'Hotelbeds'
 'BW_DIRECT CORPORATE' 'Direct Website' 'Direct Offline' 'Trip.com'
 'Aci Blueteam' 'Sacchetti Vacanze' 'Operatori locali' 'Gattinoni'
 'Medical Services Genova' "Confindustria Valle d'Aosta" 'Tour Operator'
 'Medical Service Genova' 'Apload' 'Groups' 'Produzioni Cinematografiche'
 'Gruppi diretti' 'BW_DIRECT CONNECT' 'DIRECT' 'Lux Vide' 'Tramp Limited'
 'Agenzia' 'Corporate' 'Non disponibile' 'Sito Web' 'Metasearch' 'Walk in'
 'Fonds Voorzitter' 'Direct Groups' 'Pirelli' 'AGENCY/TO' 'CISALPINA'
 'ITALCAMEL' 'BOOKING.COM' 'DIRECT_CORPORATE' 'HOTELBEDS' 'CTRIP'
 'BW_WHOLESALER' 'EXPEDIA' 'BW_WHOLESALERS' 'WHOLESALERS DYNAMIC'
 'CORPORATE_BW' 'NEGOTIATED' 'DAYUSE' 'CORPORATE' 'TBD' 'HOUSE USE'
 'PARK&FLY' 'TO 

Let's trim and lowercase the target classes to standardize them:

In [64]:
y = y.str.strip().str.lower()
print(y.unique())

['hotel turist' 'agenzie' 'agenzie gruppi' 'agenzie mice' 'gruppi agenzie'
 'gruppi diretti' 'booking.com' 'expedia' 'g2 travel' 'hotelbeds'
 'jet2 holidays' 'diretto' 'th resorts' 'uvet' 'direct corporate' 'tui'
 'direct' 'groups & mice' 'airbnb' 'dayuse' 'hrs' 'agoda'
 'bw_direct corporate' 'direct website' 'direct offline' 'trip.com'
 'aci blueteam' 'sacchetti vacanze' 'operatori locali' 'gattinoni'
 'medical services genova' "confindustria valle d'aosta" 'tour operator'
 'medical service genova' 'apload' 'groups' 'produzioni cinematografiche'
 'bw_direct connect' 'lux vide' 'tramp limited' 'agenzia' 'corporate'
 'non disponibile' 'sito web' 'metasearch' 'walk in' 'fonds voorzitter'
 'direct groups' 'pirelli' 'agency/to' 'cisalpina' 'italcamel'
 'direct_corporate' 'ctrip' 'bw_wholesaler' 'bw_wholesalers'
 'wholesalers dynamic' 'corporate_bw' 'negotiated' 'tbd' 'house use'
 'park&fly' 'to italia' 'agenzia estera' 'agenzia italia' 'ota'
 'alicarl s.n.c. di scandagliato' 'to estera' 't

## Handling missing values

In [65]:
X['chain_name'] = X['chain_name'].fillna('Unknown')
X['raw_data'] = X['raw_data'].fillna('Unknown')

# Display the dataset after filling missing values
print(X.isnull().sum())

raw_data           0
name               0
chain_name         0
average_rooms      0
average_revenue    0
average_guests     0
alos               0
dtype: int64


## Handling features bases on their types

In [66]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Define columns
categorical_features = ['raw_data', 'name', 'chain_name']
numerical_features = ['average_rooms', 'average_revenue', 'average_guests', 'alos']

# One-Hot Encoding for categorical features
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Scaling for numerical features
numerical_transformer = StandardScaler()

# Combine all transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ])

## Encode target variable

In [67]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print('Amount of classes:', len(label_encoder.classes_))

Amount of classes: 94


## Split the data into training and validation sets

In [68]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Step 4: Train a Model

In [69]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

pipeline.fit(X_train, y_train)

# Step 5: Evaluation

In [70]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

# Get predictions with confidence probabilities
train_probabilities = pipeline.predict_proba(X_train)
test_probabilities = pipeline.predict_proba(X_test)

# Convert probabilities to predictions
train_predictions = np.argmax(train_probabilities, axis=1)
test_predictions = np.argmax(test_probabilities, axis=1)

# Evaluation Metrics
train_accuracy = accuracy_score(y_train, train_predictions)
print(f'Training Accuracy: {train_accuracy * 100:.2f}%')
test_accuracy = accuracy_score(y_test, test_predictions)
print(f'Testing Accuracy: {test_accuracy * 100:.2f}%')
print("\nClassification Report:\n", classification_report(y_test, test_predictions, target_names=label_encoder.classes_, labels=np.unique(y_test), zero_division=0))

# Example of how to use the confidence probabilities
for i in range(len(test_predictions)):
    predicted_class = test_predictions[i]
    confidence = test_probabilities[i][predicted_class]
    print(f'Prediction: {label_encoder.inverse_transform([predicted_class])[0]}, Confidence: {confidence:.2f}')

# Applying a confidence threshold (e.g., 0.75)
confidence_threshold = 0.75
high_confidence_predictions = [(pred, conf) for pred, conf in zip(test_predictions, test_probabilities.max(axis=1)) if conf >= confidence_threshold]
low_confidence_predictions = [(pred, conf) for pred, conf in zip(test_predictions, test_probabilities.max(axis=1)) if conf < confidence_threshold]

print(f"\nNumber of high-confidence predictions: {len(high_confidence_predictions)}")
print(f"Number of low-confidence predictions: {len(low_confidence_predictions)}")

Training Accuracy: 25.67%
Testing Accuracy: 10.84%

Classification Report:
                                 precision    recall  f1-score   support

                  aci blueteam       0.00      0.00      0.00         2
                     agency/to       0.00      0.00      0.00         2
                       agenzia       0.00      0.00      0.00         2
                agenzia estera       0.62      0.72      0.67        18
                agenzia estero       0.39      0.55      0.46        33
                agenzia italia       0.40      0.25      0.31         8
                       agenzie       1.00      0.20      0.33         5
                agenzie gruppi       1.00      0.50      0.67         2
                  agenzie mice       0.00      0.00      0.00         1
                         agoda       0.29      0.07      0.12        54
                        airbnb       0.00      0.00      0.00         2
alicarl s.n.c. di scandagliato       0.33      0.41      0.



Best accuracies:

Training Accuracy: 25.67%

Testing Accuracy: 10.84%