In [None]:
# pip install gensim
# pip install scikit-learn
# pip install torch torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html

In [3]:
import os
import datetime
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec


# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so that later `.to(device)` calls do not fail on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Disabled alternative device selection (GPU if available, else CPU, with
# progress messages). It is wrapped in a string literal, so none of it is
# executed; as the last expression of the cell the string is merely echoed as
# the cell's output.
""" #CUDA
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU for computation.")
    # Get the number of available GPUs
    num_gpus = torch.cuda.device_count()
    print("Number of available GPUs:", num_gpus)
else:
    device = torch.device("cpu")
    print("Using CPU.") 
"""

' #CUDA\nif torch.cuda.is_available():\n    device = torch.device("cuda")\n    print("Using GPU for computation.")\n    # Get the number of available GPUs\n    num_gpus = torch.cuda.device_count()\n    print("Number of available GPUs:", num_gpus)\nelse:\n    device = torch.device("cpu")\n    print("Using CPU.") \n'

In [5]:
# Load the preprocessed dataset into a pandas DataFrame.
# The file is decoded as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
df = pd.read_csv("data/t2p1-customer_booking_mapped.csv", encoding="ISO-8859-1")
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   num_passengers         50000 non-null  int64  
 1   sales_channel          50000 non-null  object 
 2   trip_type              50000 non-null  object 
 3   purchase_lead          50000 non-null  int64  
 4   length_of_stay         50000 non-null  int64  
 5   flight_hour            50000 non-null  int64  
 6   flight_day             50000 non-null  int64  
 7   route                  50000 non-null  object 
 8   booking_origin         50000 non-null  object 
 9   wants_extra_baggage    50000 non-null  int64  
 10  wants_preferred_seat   50000 non-null  int64  
 11  wants_in_flight_meals  50000 non-null  int64  
 12  flight_duration        50000 non-null  float64
 13  booking_complete       50000 non-null  int64  
dtypes: float64(1), int64(9), object(4)
memory usage: 5.3+ 

In [None]:
# Weekday-name -> integer mapping for 'flight_day'. According to the original
# note this step lives in the Task2-p1-initial-EDA notebook; the df.info()
# output of the loaded file already lists 'flight_day' as int64.
# The code below is wrapped in a string literal, so it is NOT executed here and
# is kept for reference only.
''' 
mapping = {
    "Mon": 1,
    "Tue": 2,
    "Wed": 3,
    "Thu": 4,
    "Fri": 5,
    "Sat": 6,
    "Sun": 7,
}

df["flight_day"] = df["flight_day"].map(mapping)
'''

# Pipeline Creation
## One-Hot Encoding

In [3]:
def transform_sales_channel(df):
    """Append one-hot indicator columns for the 'sales_channel' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a 'sales_channel' column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every original column (including 'sales_channel'
        itself) followed by one 'sales_channel_HOT_<value>' column per distinct
        value. The input frame is not modified.
    """
    channel_values = df['sales_channel']
    indicator_columns = pd.get_dummies(channel_values, prefix='sales_channel_HOT')
    frames = [df, indicator_columns]
    return pd.concat(frames, axis=1)

def transform_trip_type(df):
    """Append one-hot indicator columns for the 'trip_type' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a 'trip_type' column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every original column (including 'trip_type'
        itself) followed by one 'trip_type_HOT_<value>' column per distinct
        value. The input frame is not modified.
    """
    trip_values = df['trip_type']
    indicator_columns = pd.get_dummies(trip_values, prefix='trip_type_HOT')
    frames = [df, indicator_columns]
    return pd.concat(frames, axis=1)


# One-hot encode each categorical column that is still present and then discard
# the original text column. The membership checks let the cell be re-run after
# the text columns have already been replaced.
if 'sales_channel' in df:
    df = transform_sales_channel(df).drop(columns='sales_channel')
if 'trip_type' in df:
    df = transform_trip_type(df).drop(columns='trip_type')


## Word Embeddings

In [4]:
# Split 'route' into its two three-character halves and collect the value lists
# that the Word2Vec models are trained on. The guard skips all of this when the
# part columns already exist, i.e. when the cell was run before on this frame.
if 'route_part1' not in df and 'route_part2' not in df:
    # First and last three characters of 'route'
    # (a value such as 'ABCXYZ' yields 'ABC' and 'XYZ').
    df['route_part1'] = df['route'].str[:3]
    df['route_part2'] = df['route'].str[-3:]

    # Distinct codes seen in each half, then both lists concatenated.
    unique_values_part1 = df['route_part1'].unique().tolist()
    unique_values_part2 = df['route_part2'].unique().tolist()
    combined_unique_values = unique_values_part1 + unique_values_part2

    # De-duplicate across both halves. The result is sorted because iterating a
    # set of strings yields a different order from one interpreter run to the
    # next, which would make the training input of the Word2Vec model
    # irreproducible. key=str keeps the sort safe should a non-string value
    # (e.g. NaN) ever be present.
    unique_route_ICAO = sorted(set(combined_unique_values), key=str)

    # Distinct full route strings, in order of first appearance.
    unique_route = df['route'].unique().tolist()

    # Distinct booking origins, in order of first appearance.
    unique_booking_origin = df['booking_origin'].unique().tolist()

In [5]:
# Reuse the three saved Word2Vec models when every model file is already on
# disk; if any of them is missing, train all three again and save them.
# NOTE(review): Word2Vec.load unpickles the file, so only load model files that
# were produced by this notebook or otherwise come from a trusted source.
if all(map(os.path.isfile, (
    "Word2Vec_model_route",
    "Word2Vec_model_route_ICAO",
    "Word2Vec_model_booking_origin",
))):
    model_route = Word2Vec.load("Word2Vec_model_route")
    model_route_ICAO = Word2Vec.load("Word2Vec_model_route_ICAO")
    model_booking_origin = Word2Vec.load("Word2Vec_model_booking_origin")
else:
    # Every model is trained on a single "sentence": the list of unique values.
    # vector_size=1 gives each word a one-element vector, min_count=1 keeps
    # words that occur only once, and sg=0 selects CBOW rather than skip-gram.

    # Full route strings
    model_route = Word2Vec(
        sentences=[unique_route],
        vector_size=1,
        window=5,
        min_count=1,
        sg=0,
    )
    model_route.save("Word2Vec_model_route")

    # Three-character route halves
    model_route_ICAO = Word2Vec(
        sentences=[unique_route_ICAO],
        vector_size=1,
        window=5,
        min_count=1,
        sg=0,
    )
    model_route_ICAO.save("Word2Vec_model_route_ICAO")

    # Booking origins
    model_booking_origin = Word2Vec(
        sentences=[unique_booking_origin],
        vector_size=1,
        window=5,
        min_count=1,
        sg=0,
    )
    model_booking_origin.save("Word2Vec_model_booking_origin")


def get_word_embedding_vector(model, word):
    """Look up the full embedding vector of ``word``.

    Parameters
    ----------
    model : object exposing a ``wv`` mapping (e.g. a trained Word2Vec model)
    word : key to look up in ``model.wv``

    Returns
    -------
    The value stored in ``model.wv`` for ``word``, or the integer 0 as a
    sentinel when the word is unknown to the model.
    """
    vectors = model.wv
    return vectors[word] if word in vectors else 0
    
def get_word_embedding_float(model, word):
    """Look up the first component of the embedding vector of ``word``.

    Parameters
    ----------
    model : object exposing a ``wv`` mapping (e.g. a trained Word2Vec model)
    word : key to look up in ``model.wv``

    Returns
    -------
    Element 0 of the vector stored for ``word``, or the integer 0 as a
    sentinel when the word is unknown to the model.
    """
    vectors = model.wv
    if word not in vectors:
        return 0
    return vectors[word][0]

In [6]:
def transform_string_to_ints(df):
    """Replace the text columns of ``df`` with scalar Word2Vec embeddings, in place.

    Despite the historical name, the resulting values are floats (the first
    component of each word's embedding), not integers.

    Columns and the module-level model used for each:
      * 'booking_origin' -> model_booking_origin
      * 'route_part2'    -> model_route_ICAO
      * 'route_part1'    -> model_route_ICAO
      * 'route'          -> model_route

    A column that is already numeric is left untouched. Without this check a
    second call would look up the previously converted floats instead of the
    original strings, and get_word_embedding_float would overwrite every value
    it cannot find with its 0 fallback.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns listed above; modified in place.

    Returns
    -------
    None
    """
    column_models = (
        ('booking_origin', model_booking_origin),
        ('route_part2', model_route_ICAO),
        ('route_part1', model_route_ICAO),
        ('route', model_route),
    )
    for column, model in column_models:
        if pd.api.types.is_numeric_dtype(df[column]):
            # Already converted by an earlier call; converting again would
            # destroy the values.
            continue
        # Bind the model as a default argument so each lambda keeps its own model.
        df[column] = df[column].apply(
            lambda word, model=model: get_word_embedding_float(model, word)
        )


# Convert the text columns of df in place. The function assigns the new columns
# directly on the frame it receives and has no return value, so df is not
# reassigned here.
transform_string_to_ints(df)

In [7]:
# Move the target column 'booking_complete' to the last position: keep a
# reference to it, drop it from the frame, then append it again at the end.
booking_complete_column = df['booking_complete']
df = df.drop(columns='booking_complete').assign(booking_complete=booking_complete_column)

# Show the resulting schema.
df.info()

# Save the dataset as CSV (comma-separated, without the index column).
df.to_csv('data/t2p2-bool-numerical.csv', sep=",", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   num_passengers              50000 non-null  int64  
 1   purchase_lead               50000 non-null  int64  
 2   length_of_stay              50000 non-null  int64  
 3   flight_hour                 50000 non-null  int64  
 4   flight_day                  50000 non-null  int64  
 5   route                       50000 non-null  float32
 6   booking_origin              50000 non-null  float32
 7   wants_extra_baggage         50000 non-null  int64  
 8   wants_preferred_seat        50000 non-null  int64  
 9   wants_in_flight_meals       50000 non-null  int64  
 10  flight_duration             50000 non-null  float64
 11  sales_channel_HOT_Internet  50000 non-null  bool   
 12  sales_channel_HOT_Mobile    50000 non-null  bool   
 13  trip_type_HOT_CircleTrip    500