In [40]:
pip install pandas numpy

Note: you may need to restart the kernel to use updated packages.


In [41]:
import pandas as pd

df1 = pd.read_csv("tourism_dataset.csv")
df2 = pd.read_csv("travel_cost.csv")

In [42]:
print("[Tourism Columns]")
print(df1.columns)
print("------------------------------------------------------------------------------------------------------------------------------------")
print("[Travel Cost Columns]")
print(df2.columns)

[Tourism Columns]
Index(['Location', 'Country', 'Category', 'Visitors', 'Rating', 'Revenue',
       'Accommodation_Available'],
      dtype='object')
------------------------------------------------------------------------------------------------------------------------------------
[Travel Cost Columns]
Index(['Trip ID', 'Destination', 'Start date', 'End date', 'Duration (days)',
       'Traveler name', 'Traveler age', 'Traveler gender',
       'Traveler nationality', 'Accommodation type', 'Accommodation cost',
       'Transportation type', 'Transportation cost'],
      dtype='object')


In [43]:
print("[Tourism Dataset Head]")
print(df1.head)
print("------------------------------------------------------------------------------------------------------------------------------------")
print("[Travel Cost Dataset Head]")
print(df2.head)

[Tourism Dataset Head]
<bound method NDFrame.head of         Location Country    Category  Visitors  Rating    Revenue  \
0     kuBZRkVsAR   India      Nature    948853    1.32   84388.38   
1     aHKUXhjzTo     USA  Historical    813627    2.01  802625.60   
2     dlrdYtJFTA  Brazil      Nature    508673    1.42  338777.11   
3     DxmlzdGkHK  Brazil  Historical    623329    1.09  295183.60   
4     WJCCQlepnz  France    Cultural    124867    1.43  547893.24   
...          ...     ...         ...       ...     ...        ...   
5984  xAzwnVKAqz     USA       Urban    828137    1.97  132848.78   
5985  IfKotyaJFC  France      Nature    276317    3.53  325183.96   
5986  bPyubCWGgA   Egypt       Beach    809198    3.37  927336.50   
5987  kkWIucpBnu   Egypt    Cultural    808303    2.52  115791.43   
5988  gHXUrdticm  France    Cultural     40939    4.65  957026.85   

     Accommodation_Available  
0                        Yes  
1                         No  
2                        

In [44]:
null_rows = df2[df2["Destination"].isna()]
print(null_rows)

     Trip ID Destination Start date End date  Duration (days) Traveler name  \
71        72         NaN        NaN      NaN              NaN           NaN   
127      128         NaN        NaN      NaN              NaN           NaN   

     Traveler age Traveler gender Traveler nationality Accommodation type  \
71            NaN             NaN                  NaN                NaN   
127           NaN             NaN                  NaN                NaN   

    Accommodation cost Transportation type Transportation cost  
71                 NaN                 NaN                 NaN  
127                NaN                 NaN                 NaN  


In [45]:
df2 = df2.dropna()

null_rows = df2[df2["Destination"].isna()]
if null_rows.empty:
    print("No null values remaining.")
else:
    print(null_rows)

No null values remaining.


In [46]:
print(df2["Destination"])

0                  London, UK
1            Phuket, Thailand
2             Bali, Indonesia
3               New York, USA
4                Tokyo, Japan
                ...          
134    Rio de Janeiro, Brazil
135         Vancouver, Canada
136         Bangkok, Thailand
137          Barcelona, Spain
138     Auckland, New Zealand
Name: Destination, Length: 136, dtype: object


In [47]:
df2["country"] = df2["Destination"].apply(
    lambda x: str(x).split(",")[-1].strip() if isinstance(x, str) and "," in x else str(x).strip()
)

In [48]:
print(df2["country"])

0               UK
1         Thailand
2        Indonesia
3              USA
4            Japan
          ...     
134         Brazil
135         Canada
136       Thailand
137          Spain
138    New Zealand
Name: country, Length: 136, dtype: object


Now that we have made the destinations more uniform, we can look at the accomodation and travel prices from travel_cost.csv.

We know that the travel_cost dataset has two separate columns listing the different costs associated with different accommodation and transportation types. For the purpose of this assignment, we can just use generalized dummy data, so we will just be combining these columns to create an 'Estimated Costs' column.

In [51]:
print(df2["Accommodation cost"])

0      1200
1       800
2      1000
3      2000
4       700
       ... 
134    2500
135    5000
136    2000
137    6000
138    7000
Name: Accommodation cost, Length: 136, dtype: object


In [58]:
print(df2["Transportation cost"])

0       600
1       500
2       700
3      1000
4       200
       ... 
134    2000
135    3000
136    1000
137    2500
138    2500
Name: Transportation cost, Length: 136, dtype: object


At first glance it may seem like these are all uniform values and we're all set to start working with the data...

In [57]:
print(df2["Accommodation cost"][17])
print(df2["Transportation cost"][75])

$1,500 
700 USD


But digging deeper into the data, you can see some of the values are listed with a "$" or followed by " USD" to specify American Dollars, as well as comma separated costs. Let's strip these to simplify the columns.

In [75]:
df2['Accommodation cost'] = df2['Accommodation cost'].replace({r'\$': '', ' USD': '', ',': ''}, regex=True).astype(float)
df2['Transportation cost'] = df2['Transportation cost'].replace({r'\$': '', ' USD': '', ',': ''}, regex=True).astype(float)

In [76]:
print(df2["Accommodation cost"][17])
print(df2["Transportation cost"][75])

1500.0
700.0


Now our accommodation and transportation costs are uniform, we can create our estimated cost column

In [None]:
df2["Estimated cost"] = df2["Accommodation cost"] + df2["Transportation cost"]
