In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('travel.csv')
data

Unnamed: 0,Trip ID,Destination,Start date,End date,Duration (days),Traveler name,Traveler age,Traveler gender,Traveler nationality,Accommodation type,Accommodation cost,Transportation type,Transportation cost
0,1,"London, UK",5/1/2023,5/8/2023,7.0,John Smith,35.0,Male,American,Hotel,1200,Flight,600
1,2,"Phuket, Thailand",6/15/2023,6/20/2023,5.0,Jane Doe,28.0,Female,Canadian,Resort,800,Flight,500
2,3,"Bali, Indonesia",7/1/2023,7/8/2023,7.0,David Lee,45.0,Male,Korean,Villa,1000,Flight,700
3,4,"New York, USA",8/15/2023,8/29/2023,14.0,Sarah Johnson,29.0,Female,British,Hotel,2000,Flight,1000
4,5,"Tokyo, Japan",9/10/2023,9/17/2023,7.0,Kim Nguyen,26.0,Female,Vietnamese,Airbnb,700,Train,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,135,"Rio de Janeiro, Brazil",8/1/2023,8/10/2023,9.0,Jose Perez,37.0,Male,Brazilian,Hostel,2500,Car,2000
135,136,"Vancouver, Canada",8/15/2023,8/21/2023,6.0,Emma Wilson,29.0,Female,Canadian,Hotel,5000,Airplane,3000
136,137,"Bangkok, Thailand",9/1/2023,9/8/2023,7.0,Ryan Chen,34.0,Male,Chinese,Hostel,2000,Train,1000
137,138,"Barcelona, Spain",9/15/2023,9/22/2023,7.0,Sofia Rodriguez,25.0,Female,Spanish,Airbnb,6000,Airplane,2500


In [3]:
#Remove non-relevant columns
data = data.drop(columns=['Trip ID', 'Start date', 'End date', 'Traveler name'])
data = data.dropna() #remove null columns
data = data.reset_index()
data

Unnamed: 0,index,Destination,Duration (days),Traveler age,Traveler gender,Traveler nationality,Accommodation type,Accommodation cost,Transportation type,Transportation cost
0,0,"London, UK",7.0,35.0,Male,American,Hotel,1200,Flight,600
1,1,"Phuket, Thailand",5.0,28.0,Female,Canadian,Resort,800,Flight,500
2,2,"Bali, Indonesia",7.0,45.0,Male,Korean,Villa,1000,Flight,700
3,3,"New York, USA",14.0,29.0,Female,British,Hotel,2000,Flight,1000
4,4,"Tokyo, Japan",7.0,26.0,Female,Vietnamese,Airbnb,700,Train,200
...,...,...,...,...,...,...,...,...,...,...
131,134,"Rio de Janeiro, Brazil",9.0,37.0,Male,Brazilian,Hostel,2500,Car,2000
132,135,"Vancouver, Canada",6.0,29.0,Female,Canadian,Hotel,5000,Airplane,3000
133,136,"Bangkok, Thailand",7.0,34.0,Male,Chinese,Hostel,2000,Train,1000
134,137,"Barcelona, Spain",7.0,25.0,Female,Spanish,Airbnb,6000,Airplane,2500


In [4]:
print(data['Transportation cost'].unique())
print(data['Accommodation cost'].unique())

['600' '500' '700' '1000' '200' '800' '1200' '100' '400' '150' '$400 '
 '$700 ' '$150 ' '$800 ' '$100 ' '$600 ' '$80 ' '$500 ' '$300 ' '$50 '
 '$120 ' '$75 ' '900' '50' '$200 ' '$250 ' '$20 ' '300' '800 USD'
 '200 USD' '500 USD' '700 USD' '300 USD' '600 USD' '400 USD' '1000 USD'
 '100 USD' '350 USD' '150 USD' '$1,200 ' '$900 ' '$1,500 ' '$1,000 ' '250'
 '2500' '1500' '2000' '3000']
['1200' '800' '1000' '2000' '700' '1500' '500' '900' '2500' '3000' '1400'
 '600' '$900 ' '$1,500 ' '$1,200 ' '$600 ' '$1,000 ' '$700 ' '$400 '
 '$1,400 ' '$2,000 ' '$1,100 ' '$800 ' '$200 ' '$150 ' '$500 ' '$180 '
 '$350 ' '2200' '400' '1100' '300' '1300' '1800' '500 USD' '1000 USD'
 '800 USD' '1200 USD' '400 USD' '600 USD' '900 USD' '1500 USD' '700 USD'
 '300 USD' '200 USD' '$300 ' '200' '5000' '7000' '6000' '4000' '8000']


In [5]:
#Removing non-integer characters from our numbers
data['Transportation cost'] = data['Transportation cost'].replace(",", "", regex=True)
data['Transportation cost'] = data['Transportation cost'].replace({'\$': ''}, regex=True)
data['Transportation cost'] = data['Transportation cost'].replace("USD", "", regex=True)
data['Accommodation cost'] = data['Accommodation cost'].replace(",", "", regex=True)
data['Accommodation cost'] = data['Accommodation cost'].replace({'\$': ''}, regex=True)
data['Accommodation cost'] = data['Accommodation cost'].replace("USD", "", regex=True)

#Converting data into integer form
data['Transportation cost'] = pd.to_numeric(data['Transportation cost'])
data['Accommodation cost'] = pd.to_numeric(data['Accommodation cost'])
print(data['Transportation cost'].unique())
print(data['Accommodation cost'].unique())

#Create the outcome variable column
data['Transportation and Accommodation cost'] = data['Transportation cost'] + data['Accommodation cost']
data = data.drop(columns=['Transportation cost', 'Accommodation cost', 'index'])
data

[ 600  500  700 1000  200  800 1200  100  400  150   80  300   50  120
   75  900  250   20  350 1500 2500 2000 3000]
[1200  800 1000 2000  700 1500  500  900 2500 3000 1400  600  400 1100
  200  150  180  350 2200  300 1300 1800 5000 7000 6000 4000 8000]


Unnamed: 0,Destination,Duration (days),Traveler age,Traveler gender,Traveler nationality,Accommodation type,Transportation type,Transportation and Accommodation cost
0,"London, UK",7.0,35.0,Male,American,Hotel,Flight,1800
1,"Phuket, Thailand",5.0,28.0,Female,Canadian,Resort,Flight,1300
2,"Bali, Indonesia",7.0,45.0,Male,Korean,Villa,Flight,1700
3,"New York, USA",14.0,29.0,Female,British,Hotel,Flight,3000
4,"Tokyo, Japan",7.0,26.0,Female,Vietnamese,Airbnb,Train,900
...,...,...,...,...,...,...,...,...
131,"Rio de Janeiro, Brazil",9.0,37.0,Male,Brazilian,Hostel,Car,4500
132,"Vancouver, Canada",6.0,29.0,Female,Canadian,Hotel,Airplane,8000
133,"Bangkok, Thailand",7.0,34.0,Male,Chinese,Hostel,Train,3000
134,"Barcelona, Spain",7.0,25.0,Female,Spanish,Airbnb,Airplane,8500


In [6]:
#Clean Destination data 
data['Destination'] = data['Destination'].str.lower()
data['Traveler gender'] = data['Traveler gender'].str.lower()
data['Traveler nationality'] = data['Traveler nationality'].str.lower()
data['Accommodation type'] = data['Accommodation type'].str.lower()
data['Transportation type'] = data['Transportation type'].str.lower()

#Sort distination into countries and cities
countries = []
cities = []
unsorted = []
for i in range(len(data['Destination'])):
    cities_countries = data['Destination'][i].split(",")
    #sort instances with both city and country as destination
    if len(cities_countries) == 2:
        if cities_countries[0] not in cities:
            cities.append(cities_countries[0])
        if cities_countries[1] not in countries:
            countries.append(cities_countries[1])
    else:
        unsorted.append(cities_countries)


print("All countries: ", countries)
print("All cities: ", cities)

All countries:  [' uk', ' thailand', ' indonesia', ' usa', ' japan', ' france', ' australia', ' brazil', ' netherlands', ' united arab emirates', ' mexico', ' spain', ' hawaii', ' germany', ' morocco', ' scotland', ' canada', ' aus', ' south korea', ' italy', ' sa', ' thai', ' greece', ' south africa', ' new zealand']
All cities:  ['london', 'phuket', 'bali', 'new york', 'tokyo', 'paris', 'sydney', 'rio de janeiro', 'amsterdam', 'dubai', 'cancun', 'barcelona', 'honolulu', 'berlin', 'marrakech', 'edinburgh', 'new york city', 'bangkok', 'vancouver', 'seoul', 'los angeles', 'rome', 'cape town', 'athens', 'auckland']


In [7]:
#Sort such that each city corresponds to a country(index wise)
cities.remove("new york city")
countries.pop(24)
countries[16] = " thailand"
countries[17] = " canada"
countries[19] = " usa"
countries[20] = " italy"
countries[21] = " south africa"
countries[22] = "  greece"
countries[23] = " new zealand"
print("All countries: ", countries)
print("All cities: ", cities)

All countries:  [' uk', ' thailand', ' indonesia', ' usa', ' japan', ' france', ' australia', ' brazil', ' netherlands', ' united arab emirates', ' mexico', ' spain', ' hawaii', ' germany', ' morocco', ' scotland', ' thailand', ' canada', ' south korea', ' usa', ' italy', ' south africa', '  greece', ' new zealand']
All cities:  ['london', 'phuket', 'bali', 'new york', 'tokyo', 'paris', 'sydney', 'rio de janeiro', 'amsterdam', 'dubai', 'cancun', 'barcelona', 'honolulu', 'berlin', 'marrakech', 'edinburgh', 'bangkok', 'vancouver', 'seoul', 'los angeles', 'rome', 'cape town', 'athens', 'auckland']


In [8]:
#Replace destination with only the country
for i in range(len(data['Destination'])):
    cities_countries = data['Destination'][i].split(",")
    if len(cities_countries) == 2:
        data['Destination'][i] = cities_countries[1]
    else:
        if data['Destination'][i] in cities:
            country = cities.index(data['Destination'][i])
            data['Destination'][i] = countries[country]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Destination'][i] = cities_countries[1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Destination'][i] = cities_countries[1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Destination'][i] = cities_countries[1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Destination'][i] = cities_c

In [9]:
#Clean traveler nationality data
data['Traveler nationality'] = data['Traveler nationality'].replace("south korea", "south korean")
data['Traveler nationality'] = data['Traveler nationality'].replace("taiwan", "taiwanese")
data['Traveler nationality'] = data['Traveler nationality'].replace("canada", "canadian")
data['Traveler nationality'] = data['Traveler nationality'].replace("usa", "american")
data['Traveler nationality'] = data['Traveler nationality'].replace("cambodia", "cambodian")
data['Traveler nationality'] = data['Traveler nationality'].replace("united arab emirates", "emirati")
data['Traveler nationality'] = data['Traveler nationality'].replace("greece", "greek")
data['Traveler nationality'] = data['Traveler nationality'].replace("italy", "italian")
data['Traveler nationality'] = data['Traveler nationality'].replace("singapore", "singaporean")
data['Traveler nationality'] = data['Traveler nationality'].replace("united kingdom", "british")
data['Traveler nationality'] = data['Traveler nationality'].replace("hong kong", "hong kongers")
data['Traveler nationality'] = data['Traveler nationality'].replace("japan", "japanese")
data['Traveler nationality'] = data['Traveler nationality'].replace("spain", "spanish")
data['Traveler nationality'] = data['Traveler nationality'].replace("china", "chinese")
data['Traveler nationality'] = data['Traveler nationality'].replace("uk", "british")
data['Traveler nationality'] = data['Traveler nationality'].replace("germany", "german")
data['Traveler nationality'].value_counts()

Traveler nationality
american         25
british          14
korean           13
canadian         11
spanish           8
chinese           8
australian        8
south korean      6
italian           5
brazilian         4
indian            4
japanese          3
taiwanese         3
vietnamese        3
emirati           3
dutch             2
german            2
mexican           2
south african     2
cambodian         1
hong kongers      1
greek             1
singaporean       1
scottish          1
brazil            1
indonesian        1
french            1
moroccan          1
new zealander     1
Name: count, dtype: int64

In [10]:
#Clean transportation type data
data['Transportation type'] = data['Transportation type'].replace("flight", "plane")
data['Transportation type'] = data['Transportation type'].replace("car rental", "car")
data['Transportation type'] = data['Transportation type'].replace("airplane", "plane")
data['Transportation type'] = data['Transportation type'].replace("subway", "train")
data['Transportation type'].value_counts()

Transportation type
plane    75
train    38
car      16
bus       6
ferry     1
Name: count, dtype: int64

In [11]:
#Using frequency encoding
destination_freq = data['Destination'].value_counts(normalize=True)
data['Destination'] = data['Destination'].map(destination_freq)
nationality_freq = data['Traveler nationality'].value_counts(normalize=True)
data['Traveler nationality'] = data['Traveler nationality'].map(nationality_freq)
gender_freq = data['Traveler gender'].value_counts(normalize=True)
data['Traveler gender'] = data['Traveler gender'].map(gender_freq)

#Using label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(data['Transportation type'])
data['Transportation type'] = le.transform(data['Transportation type'])
le.fit(data['Accommodation type'])
data['Accommodation type'] = le.transform(data['Accommodation type'])

In [12]:
data

Unnamed: 0,Destination,Duration (days),Traveler age,Traveler gender,Traveler nationality,Accommodation type,Transportation type,Transportation and Accommodation cost
0,0.051471,7.0,35.0,0.485294,0.183824,3,3,1800
1,0.066176,5.0,28.0,0.514706,0.080882,4,3,1300
2,0.088235,7.0,45.0,0.485294,0.095588,7,3,1700
3,0.080882,14.0,29.0,0.514706,0.102941,3,3,3000
4,0.088235,7.0,26.0,0.514706,0.022059,0,4,900
...,...,...,...,...,...,...,...,...
131,0.036765,9.0,37.0,0.485294,0.029412,2,1,4500
132,0.014706,6.0,29.0,0.514706,0.080882,3,3,8000
133,0.066176,7.0,34.0,0.485294,0.058824,2,4,3000
134,0.044118,7.0,25.0,0.514706,0.058824,0,3,8500


In [13]:
y = data['Transportation and Accommodation cost'] #Outcome variable
X = data.drop(columns=['Transportation and Accommodation cost']) #Features

#Train model with all data
full_model = LinearRegression().fit(X, y)

#Train model and assesing performance with Chi_Sqaure test
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=89)
reg = LinearRegression().fit(X_train, y_train)
reg_predict = reg.predict(X_test).round()

#Calculating Mean Square Error value
y_test = y_test.to_numpy()
SE = 0
for i in range(len(y_test)):
    SE += (y_test[i] - reg_predict[i])**2

MSE = SE/len(y_test)
print(MSE)
X_test

855499.7857142857


Unnamed: 0,Destination,Duration (days),Traveler age,Traveler gender,Traveler nationality,Accommodation type,Transportation type
89,0.058824,11.0,30.0,0.514706,0.036765,2,3
111,0.007353,7.0,27.0,0.485294,0.007353,0,2
104,0.088235,9.0,28.0,0.514706,0.044118,0,4
18,0.051471,6.0,35.0,0.514706,0.102941,3,4
14,0.007353,7.0,26.0,0.514706,0.007353,5,3
41,0.029412,7.0,45.0,0.485294,0.183824,4,3
38,0.102941,7.0,25.0,0.514706,0.183824,3,3
83,0.088235,11.0,42.0,0.514706,0.058824,4,3
96,0.080882,6.0,50.0,0.485294,0.058824,0,1
11,0.044118,7.0,36.0,0.485294,0.058824,0,4
