In [24]:
import pandas as pd
from datetime import datetime
import re
from sklearn.utils import shuffle

In [25]:
def extract_numeric_value(value_string):
    """Extract the numeric amount embedded in a price string.

    Commas (thousands separators) and whitespace are removed, then the first
    run of digits, optionally followed by a decimal part, is parsed.  For
    example ``"AED 1,234.50"`` yields ``1234.5``.

    Parameters
    ----------
    value_string : str
        Text containing a number, e.g. ``"AED 3,861"``.

    Returns
    -------
    float
        The parsed amount.

    Raises
    ------
    ValueError
        If ``value_string`` contains no digits.
    """
    # Drop thousands separators and spaces so "1,234" and "1 234" read as 1234.
    compact = re.sub(r'[,\s]', '', value_string)

    # Keep the decimal point: concatenating bare digit runs would turn
    # "1234.50" into 123450.
    match = re.search(r'\d+(?:\.\d+)?', compact)
    if match is None:
        raise ValueError(f"no numeric value found in {value_string!r}")
    return float(match.group())

def convert_aed_to_mad(amount_in_aed, rate=2.73):
    """Convert a price string expressed in AED into a MAD amount.

    Parameters
    ----------
    amount_in_aed : str
        Price text; its numeric part is read with ``extract_numeric_value``.
    rate : float, optional
        Multiplier applied to the AED amount.  Defaults to 2.73, the fixed
        rate this notebook has always used (it is not fetched live).

    Returns
    -------
    float
        ``amount * rate``.
    """
    return extract_numeric_value(amount_in_aed) * rate
def convert_price(df):
    """Convert the ``Price`` column of ``df`` from AED strings to MAD floats.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Price`` column of strings accepted by
        ``convert_aed_to_mad``.
    """
    # Assign the whole column back.  Writing to the row objects yielded by
    # ``iterrows`` is not guaranteed to reach the frame, because pandas may
    # hand out copies of the rows.
    df['Price'] = df['Price'].apply(convert_aed_to_mad)

In [26]:
def clean_arrival_time(arrival_time):
    """Return the part of ``arrival_time`` that precedes the first ``'+'``.

    Values without a ``'+'`` are returned unchanged.
    """
    if '+' not in arrival_time:
        return arrival_time

    head = arrival_time.split('+')[0]
    return head

In [27]:
def convert_to_minutes(duration):
    """Convert a duration such as ``'3h 30m'`` into a number of minutes.

    The string is split on single spaces.  The first token is read as hours
    when it contains ``'h'``; the last token is read as minutes when it
    contains ``'m'``.  A missing part counts as zero.
    """
    tokens = duration.split(' ')
    first_token, last_token = tokens[0], tokens[-1]

    hour_count = int(first_token.replace('h', '')) if 'h' in first_token else 0
    minute_count = int(last_token.replace('m', '')) if 'm' in last_token else 0

    return 60 * hour_count + minute_count

In [28]:
def convert_stops_to_numeric(df, column_name):
    """Replace the labels in ``df[column_name]`` with integer codes, in place.

    Each distinct value gets a code in order of first appearance: the first
    distinct value found becomes 0, the next one 1, and so on.

    NOTE(review): the codes follow row order, not the meaning of the label.
    They only equal the real number of stops if the labels happen to first
    appear in ascending order -- confirm against the raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    column_name : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the column overwritten.
    """
    codes = {label: code for code, label in enumerate(df[column_name].unique())}

    # ``map`` builds the integer column directly.  ``replace`` on an object
    # column only yields integers through silent downcasting of the result,
    # which pandas has deprecated.
    df[column_name] = df[column_name].map(codes)

    return df


In [29]:
def date_to_season(date_str):
    """Return the season name for a ``YYYY-MM-DD`` date string.

    March-May maps to 'Spring', June-August to 'Summer', September-November
    to 'Autumn', and December-February to 'Winter'.
    """
    month = datetime.strptime(date_str, '%Y-%m-%d').month

    # (month % 12) // 3 groups the months in threes starting from December:
    # 12, 1, 2 -> 0; 3-5 -> 1; 6-8 -> 2; 9-11 -> 3.
    season_names = ('Winter', 'Spring', 'Summer', 'Autumn')
    return season_names[(month % 12) // 3]
def df_to_season(df):
    """Add a ``Season`` column to ``df`` computed from its ``Date`` column.

    ``date_to_season`` is applied to every ``Date`` value in row order and
    the results are stored in ``df['Season']``.  The frame is modified in
    place and nothing is returned.
    """
    df['Season'] = [date_to_season(date_value) for date_value in df['Date']]

In [30]:
def categorize_time(hour):
    """Map an ``'HH:MM'`` string to a named part of the day.

    Both fields must be integers; only the hour decides the label:
    04-06 'Early Morning', 07-11 'Morning', 12-16 'Afternoon',
    17-19 'Evening', 20-23 'Night', anything else 'Late Night'.
    """
    # Both parts are converted so a malformed minute field still raises.
    hour_value, _minute_value = (int(field) for field in hour.split(':'))

    # (first hour included, first hour excluded, label)
    day_parts = (
        (4, 7, "Early Morning"),
        (7, 12, "Morning"),
        (12, 17, "Afternoon"),
        (17, 20, "Evening"),
        (20, 24, "Night"),
    )
    for start, stop, label in day_parts:
        if start <= hour_value < stop:
            return label
    return "Late Night"

# Data cleaning

In [31]:
# Forward slashes work on every OS and avoid the invalid "\d" escape that the
# backslash form of this path contained.
data = pd.read_csv('../data/airlines_dataset.csv')

In [32]:
# Count the missing values in each column
data.isna().sum()

Airline          0
Source           0
Destination      0
Duration         0
stops            0
class            0
depature time    0
arrival time     0
Price            0
Date             0
dtype: int64

#### Converting price to numeric values 

In [33]:
# Convert the raw 'Price' strings from AED to MAD.  The call's return value is
# not used; `data` is expected to be updated in place.  The helpers do string
# parsing, so this cell cannot be re-run on already-converted prices.
convert_price(data)

#### Converting duration to minutes

In [34]:
# Overwrite the duration strings with minute counts.  convert_to_minutes
# splits strings, so re-running this cell on the converted column fails.
data['Duration'] = data['Duration'].apply(convert_to_minutes)


#### Converting stops to numeric values

In [35]:
# Encode the 'stops' labels as integer codes.  The helper modifies the frame it
# receives and returns that same frame.
data = convert_stops_to_numeric(data, 'stops')

# Feature engineering

#### Converting date to season

In [36]:
# Add a 'Season' column derived from 'Date'.  This has to run while 'Date'
# still holds 'YYYY-MM-DD' strings, i.e. before the pd.to_datetime conversion
# further down.
df_to_season(data)

#### Categorizing arrival and departure time 

In [37]:
# Arrival times may carry a '+...' suffix; strip it before bucketing.
data['arrival time'] = data['arrival time'].apply(clean_arrival_time)

# Replace both clock-time columns with their part-of-day label.
for time_column in ('arrival time', 'depature time'):
    data[time_column] = data[time_column].apply(categorize_time)

#### Converting to the appropriate column type

In [38]:
# Store dates as datetimes and prices as 64-bit floats
data['Date'] = pd.to_datetime(data['Date'])
data['Price'] = data['Price'].astype('float64')


In [39]:
# Show the column types after the conversions above
data.dtypes

Airline                  object
Source                   object
Destination              object
Duration                  int64
stops                     int64
class                    object
depature time            object
arrival time             object
Price                   float64
Date             datetime64[ns]
Season                   object
dtype: object

# Preprocessed data

In [43]:
# Shuffle the rows with a fixed seed.  Sorting by the row index first restores
# the order the file was loaded in, so re-running this cell yields the same
# permutation instead of shuffling the already-shuffled frame again.
# (Assumes the index is still the default one from read_csv.)
data = shuffle(data.sort_index(), random_state=42)
data

Unnamed: 0,Airline,Source,Destination,Duration,stops,class,depature time,arrival time,Price,Date,Season
258,Turkish Airlines,PAR,IST,210,0,business,Evening,Late Night,10540.53,2023-12-09,Winter
1198,Turkish Airlines,ROM,CMN,575,1,economy,Early Morning,Afternoon,2948.40,2024-03-17,Spring
622,Turkish Airlines,PAR,IST,210,0,business,Evening,Late Night,8506.68,2024-01-12,Winter
1210,easyJet,ROM,PAR,130,0,economy,Morning,Afternoon,619.71,2024-03-17,Spring
1613,Transavia France,IST,PAR,230,0,economy,Morning,Morning,1168.44,2024-04-18,Spring
...,...,...,...,...,...,...,...,...,...,...,...
47,Royal Air Maroc,PAR,CMN,185,0,economy,Night,Night,2402.40,2023-12-09,Winter
2829,Lufthansa,ROM,CMN,400,1,business,Evening,Late Night,4641.00,2024-07-04,Summer
4114,easyJet,ROM,PAR,130,0,economy,Afternoon,Evening,616.98,2024-11-05,Autumn
1416,Royal Air Maroc,IST,CMN,350,0,business,Evening,Night,14927.64,2024-03-17,Spring


In [44]:
# Write the preprocessed frame to disk; index=False leaves the row labels out
# of the file.
data.to_csv("../data/preprocessed_data.csv", index=False)