In [2]:
#loading necessary libraries and data
import plotly.express as px
import pandas as pd

#LOAD THE DATASET
df = pd.read_csv('travel_details.csv')
print("Dataset and Libraries Loaded")

Dataset and Libraries Loaded


In [3]:
#overviewing the data
df.head()

Unnamed: 0,Trip ID,Destination,Start date,End date,Duration (days),Traveler name,Traveler age,Traveler gender,Traveler nationality,Accommodation type,Accommodation cost,Transportation type,Transportation cost
0,1,"London, UK",05-01-2023,05-08-2023,7.0,John Smith,35.0,Male,American,Hotel,1200,Flight,600
1,2,"Phuket, Thailand",6/15/2023,6/20/2023,5.0,Jane Doe,28.0,Female,Canadian,Resort,800,Flight,500
2,3,"Bali, Indonesia",07-01-2023,07-08-2023,7.0,David Lee,45.0,Male,Korean,Villa,1000,Flight,700
3,4,"New York, USA",8/15/2023,8/29/2023,14.0,Sarah Johnson,29.0,Female,British,Hotel,2000,Flight,1000
4,5,"Tokyo, Japan",09-10-2023,9/17/2023,7.0,Kim Nguyen,26.0,Female,Vietnamese,Airbnb,700,Train,200


In [4]:
df.shape

(139, 13)

In [5]:
df.describe()

Unnamed: 0,Trip ID,Duration (days),Traveler age
count,139.0,137.0,137.0
mean,70.0,7.605839,33.175182
std,40.269923,1.601276,7.145441
min,1.0,5.0,20.0
25%,35.5,7.0,28.0
50%,70.0,7.0,31.0
75%,104.5,8.0,38.0
max,139.0,14.0,60.0


In [6]:
df.columns

Index(['Trip ID', 'Destination', 'Start date', 'End date', 'Duration (days)',
       'Traveler name', 'Traveler age', 'Traveler gender',
       'Traveler nationality', 'Accommodation type', 'Accommodation cost',
       'Transportation type', 'Transportation cost'],
      dtype='object')

In [7]:
#DATA CLEANING
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139 entries, 0 to 138
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Trip ID               139 non-null    int64  
 1   Destination           137 non-null    object 
 2   Start date            137 non-null    object 
 3   End date              137 non-null    object 
 4   Duration (days)       137 non-null    float64
 5   Traveler name         137 non-null    object 
 6   Traveler age          137 non-null    float64
 7   Traveler gender       137 non-null    object 
 8   Traveler nationality  137 non-null    object 
 9   Accommodation type    137 non-null    object 
 10  Accommodation cost    137 non-null    object 
 11  Transportation type   136 non-null    object 
 12  Transportation cost   136 non-null    object 
dtypes: float64(2), int64(1), object(10)
memory usage: 14.2+ KB


In [8]:
#We can see that the end date is in object data type. we are gonna fix that so that we can use it in plots without any issues.
df['End date']=pd.to_datetime(df['End date'])

print("Changed date into appropriate format")

Changed date into appropriate format


In [9]:
#CHECKING NULL VALUES
df.isna().sum()

Trip ID                 0
Destination             2
Start date              2
End date                2
Duration (days)         2
Traveler name           2
Traveler age            2
Traveler gender         2
Traveler nationality    2
Accommodation type      2
Accommodation cost      2
Transportation type     3
Transportation cost     3
dtype: int64

In [10]:
#There is only 2 or 3 missing data in the entire dataset so i'm dropping them. It will not the skew the data
df=df.dropna()

# Get the count of non-null values for each column
non_null_counts = df.count()

# Check if all columns have the same count of non-null values
if non_null_counts.nunique() == 1:
    print("Total null values:", df.isnull().sum().sum())
else:
    print("Columns have different counts of non-null values.")


Total null values: 0


In [11]:
list(df['Accommodation cost'].head(20))

['1200',
 '800',
 '1000',
 '2000',
 '700',
 '1500',
 '500',
 '900',
 '1200',
 '2500',
 '1000',
 '800',
 '3000',
 '1400',
 '600',
 '900',
 '$900 ',
 '$1,500 ',
 '$1,200 ',
 '$1,200 ']

In [12]:
#Here we can see that the cost column is very badly fornatted. This holds tru for both transportation as well as accomodation columns. Thus i'm gonna use regular expression to clean the column and connvert them into numbers
import re

# define a regular expression pattern to match numeric values
pattern = re.compile(r'\d+(,\d+)*\.?\d*')

# apply the regular expression pattern to the column and convert the resulting strings to numeric data type
df['Accommodation cost'] = df['Accommodation cost'].apply(lambda x: float(pattern.search(x).group().replace(',', '')) 
                                                          if pattern.search(x) else None)

list(df['Accommodation cost'].head(20))

[1200.0,
 800.0,
 1000.0,
 2000.0,
 700.0,
 1500.0,
 500.0,
 900.0,
 1200.0,
 2500.0,
 1000.0,
 800.0,
 3000.0,
 1400.0,
 600.0,
 900.0,
 900.0,
 1500.0,
 1200.0,
 1200.0]

In [13]:
# apply the regular expression pattern to the column and convert the resulting strings to numeric data type
df['Transportation cost'] = df['Transportation cost'].apply(lambda x: float(pattern.search(x).group().replace(',', '')) 
                                                            if pattern.search(x) else None)

list(df['Transportation cost'].head(20))

[600.0,
 500.0,
 700.0,
 1000.0,
 200.0,
 800.0,
 1200.0,
 600.0,
 200.0,
 800.0,
 500.0,
 100.0,
 1200.0,
 700.0,
 400.0,
 150.0,
 400.0,
 700.0,
 150.0,
 800.0]

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 0 to 138
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Trip ID               136 non-null    int64         
 1   Destination           136 non-null    object        
 2   Start date            136 non-null    object        
 3   End date              136 non-null    datetime64[ns]
 4   Duration (days)       136 non-null    float64       
 5   Traveler name         136 non-null    object        
 6   Traveler age          136 non-null    float64       
 7   Traveler gender       136 non-null    object        
 8   Traveler nationality  136 non-null    object        
 9   Accommodation type    136 non-null    object        
 10  Accommodation cost    136 non-null    float64       
 11  Transportation type   136 non-null    object        
 12  Transportation cost   136 non-null    float64       
dtypes: datetime64[ns](1)

In [15]:
#Now everthing seems alright lets explore the insights using plots in python
#EDA THROUGH VISUALIZATION
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

# Count the number of trips per destination
trips_per_destination = df['Destination'].value_counts()

# Create a bar chart
fig = px.bar(x=trips_per_destination.index, y=trips_per_destination.values,
             labels={'x': 'Destination', 'y': 'Number of Trips'},
             title='Number of Trips per Destination')
fig.show()

In [16]:
# Count the number of trips per accommodation type
trips_per_accommodation_type = df['Accommodation type'].value_counts()

# Create a pie chart
fig = px.pie(values=trips_per_accommodation_type.values, names=trips_per_accommodation_type.index,
             title='Accommodation Type Distribution')
fig.show()

In [17]:
# Create a scatter plot
fig = px.scatter(df, x='Duration (days)', y='Traveler age', color='Traveler gender',
                 title='Trip Duration vs Traveler Age')
fig.show()

In [18]:
# Convert start date to datetime
df['Start date'] = pd.to_datetime(df['Start date'])

# Group the trips by month
trips_per_month = df.groupby(df['Start date'].dt.strftime('%Y-%m'))['Trip ID'].count()

# Create a line chart
fig = px.line(x=trips_per_month.index, y=trips_per_month.values,
              labels={'x': 'Month', 'y': 'Number of Trips'},
              title='Number of Trips per Month')
fig.show()

In [19]:
import plotly.graph_objects as go

nationalities = df['Traveler nationality'].value_counts()

fig = go.Figure(data=[go.Bar(x=nationalities.index, y=nationalities.values)])

fig.update_layout(title='Traveler Nationalities')

fig.show()

In [20]:
fig = go.Figure()

for accommodation_type in df['Accommodation type'].unique():
    fig.add_trace(go.Box(y=df[df['Accommodation type']==accommodation_type]['Transportation cost'],
                         name=accommodation_type))

fig.update_layout(title='Transportation Costs by Accommodation Type',
                  yaxis_title='Transportation cost')

fig.show()

In [21]:
gender_counts = df['Traveler gender'].value_counts()

fig = go.Figure(data=[go.Pie(labels=gender_counts.index, values=gender_counts.values)])

fig.update_layout(title='Number of Trips per Gender')

fig.show()

In [22]:
fig = px.bar(df, x='Traveler nationality', y='Duration (days)', color='Traveler gender', barmode='stack')

fig.update_layout(title='Trip Duration by Traveler Gender and Nationality')

fig.show()


In [23]:
import numpy as np

heatmap_df = df.pivot_table(index='Destination', columns='Duration (days)', values='Accommodation cost', aggfunc=np.mean)

fig = go.Figure(data=go.Heatmap(z=heatmap_df.values, x=heatmap_df.columns, y=heatmap_df.index, colorscale='Viridis'))

fig.update_layout(title='Trip Duration and Accommodation Cost by Destination',
                  xaxis_title='Duration (days)',
                  yaxis_title='Destination')

fig.show()

In [24]:
fig = px.scatter(df, x='Duration (days)', y='Accommodation cost', color='Traveler gender',
                 size='Transportation cost', hover_data=['Destination', 'Traveler name'])

fig.update_layout(title='Trip Duration, Accommodation Cost, and Transportation Cost by Traveler Gender',
                  xaxis_title='Duration (days)',
                  yaxis_title='Accommodation cost')

fig.show()

In [25]:
#MACHINE LEARNNG
from sklearn.preprocessing import OrdinalEncoder
scaler=OrdinalEncoder()
names=df.columns
d=scaler.fit_transform(df)

newdf = pd.DataFrame(d,columns=names)
newdf.head()

Unnamed: 0,Trip ID,Destination,Start date,End date,Duration (days),Traveler name,Traveler age,Traveler gender,Traveler nationality,Accommodation type,Accommodation cost,Transportation type,Transportation cost
0,0.0,30.0,54.0,59.0,2.0,48.0,13.0,1.0,0.0,3.0,13.0,5.0,13.0
1,1.0,42.0,59.0,64.0,0.0,42.0,6.0,0.0,7.0,4.0,9.0,5.0,12.0
2,2.0,6.0,62.0,68.0,2.0,14.0,22.0,1.0,23.0,7.0,11.0,5.0,14.0
3,3.0,36.0,66.0,76.0,8.0,96.0,7.0,0.0,4.0,3.0,18.0,5.0,17.0
4,4.0,57.0,70.0,79.0,2.0,55.0,4.0,0.0,40.0,0.0,8.0,8.0,7.0


In [27]:
#Importing necessary libraries from their corresponding modules

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score,recall_score
from sklearn.metrics import f1_score

In [32]:
X,y=newdf.drop(["Transportation cost"],axis=1),newdf["Transportation cost"]

In [33]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.33,random_state=42)

In [35]:
print("shape of X_train:",X_train.shape)
print("shape of X_test:",X_test.shape)

shape of X_train: (91, 12)
shape of X_test: (45, 12)


In [36]:
#LOGISTIC REGRESSION
LRC = LogisticRegression(solver="liblinear",max_iter=5000)
LRC.fit(X_train,y_train)
y_pred=LRC.predict(X_test)
print(classification_report(y_test,y_pred))

LRCAcc=accuracy_score(y_pred,y_pred)
print('Logistic Regression accuracy is :{:.2f}%'.format(LRCAcc*100))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       0.00      0.00      0.00         1
         3.0       0.00      0.00      0.00         1
         4.0       0.00      0.00      0.00         3
         5.0       0.00      0.00      0.00         0
         6.0       0.00      0.00      0.00         5
         7.0       0.20      0.17      0.18         6
         8.0       0.00      0.00      0.00         1
         9.0       0.14      1.00      0.25         1
        11.0       0.00      0.00      0.00         2
        12.0       0.50      0.17      0.25         6
        13.0       0.00      0.00      0.00         4
        14.0       0.20      0.20      0.20         5
        15.0       0.00      0.00      0.00         2
        16.0       0.00      0.00      0.00         2
        17.0       0.00      0.00      0.00         1
        18.0       0.00      0.00      0.00         1
        19.0       0.00    


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



In [37]:
#KNearest Neighbors Classifier
knn=KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,y_train)
pred=knn.predict(X_test)
print(classification_report(y_test,pred))
knnAcc=accuracy_score(pred,y_test)
print('knn accuracy is :{:.2f}%'.format(knnAcc*100))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       0.0
         1.0       0.00      0.00      0.00       1.0
         2.0       0.00      0.00      0.00       0.0
         3.0       0.00      0.00      0.00       1.0
         4.0       0.00      0.00      0.00       3.0
         6.0       0.00      0.00      0.00       5.0
         7.0       0.00      0.00      0.00       6.0
         8.0       0.00      0.00      0.00       1.0
         9.0       0.00      0.00      0.00       1.0
        10.0       0.00      0.00      0.00       0.0
        11.0       0.00      0.00      0.00       2.0
        12.0       0.00      0.00      0.00       6.0
        13.0       0.00      0.00      0.00       4.0
        14.0       0.00      0.00      0.00       5.0
        15.0       0.00      0.00      0.00       2.0
        16.0       0.00      0.00      0.00       2.0
        17.0       0.00      0.00      0.00       1.0
        18.0       0.00    


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.



In [38]:
#RANDOM FOREST CLASSIFIER
RFC=RandomForestClassifier()
RFC.fit(X_train, y_train)
ypred = RFC.predict(X_test)
print(classification_report(y_test,ypred))

RFCAcc = accuracy_score(ypred,y_test)
print('RandomForest accuracy is :{:.2f}%'.format(RFCAcc*100))

              precision    recall  f1-score   support

         1.0       0.50      1.00      0.67         1
         3.0       0.00      0.00      0.00         1
         4.0       0.20      0.33      0.25         3
         6.0       0.67      0.40      0.50         5
         7.0       0.00      0.00      0.00         6
         8.0       0.00      0.00      0.00         1
         9.0       0.14      1.00      0.25         1
        11.0       0.00      0.00      0.00         2
        12.0       0.00      0.00      0.00         6
        13.0       0.00      0.00      0.00         4
        14.0       0.00      0.00      0.00         5
        15.0       0.07      0.50      0.12         2
        16.0       0.00      0.00      0.00         2
        17.0       0.00      0.00      0.00         1
        18.0       0.00      0.00      0.00         1
        19.0       0.00      0.00      0.00         2
        20.0       0.00      0.00      0.00         0
        21.0       0.00    


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.

