### Tanzania Tourism Prediction

The objective of this project is to develop a machine learning model to predict what a tourist will spend when visiting Tanzania. The model can be used by different tour operators and the Tanzania Tourism Board to automatically help tourists across the world estimate their expenditure before visiting Tanzania.

#### Part 1: Data cleaning and wrangling

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.offline import plot, iplot, init_notebook_mode
import plotly.graph_objs as go
# Initialise plotly's offline notebook mode; connected=True asks plotly to load
# plotly.js from its online CDN rather than embedding it in the notebook.
init_notebook_mode(connected=True)
# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)

In [2]:
# Load the training set; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Train.csv')

In [3]:
# Display the loaded frame; pandas abbreviates the rendering to the first and last rows.
df

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost
0,tour_0,SWIZERLAND,45-64,Friends/Relatives,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,13.0,0.0,Cash,No,Friendly People,674602.5
1,tour_10,UNITED KINGDOM,25-44,,1.0,0.0,Leisure and Holidays,Cultural tourism,others,Independent,No,No,No,No,No,No,No,14.0,7.0,Cash,Yes,"Wonderful Country, Landscape, Nature",3214906.5
2,tour_1000,UNITED KINGDOM,25-44,Alone,0.0,1.0,Visiting Friends and Relatives,Cultural tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,1.0,31.0,Cash,No,Excellent Experience,3315000.0
3,tour_1002,UNITED KINGDOM,25-44,Spouse,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,Yes,Yes,No,11.0,0.0,Cash,Yes,Friendly People,7790250.0
4,tour_1004,CHINA,1-24,,1.0,0.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Independent,No,No,No,No,No,No,No,7.0,4.0,Cash,Yes,No comments,1657500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4804,tour_993,UAE,45-64,Alone,0.0,1.0,Business,Hunting tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,2.0,0.0,Credit Card,No,No comments,3315000.0
4805,tour_994,UNITED STATES OF AMERICA,25-44,Spouse,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,Yes,Yes,Yes,11.0,0.0,Cash,Yes,Friendly People,10690875.0
4806,tour_995,NETHERLANDS,1-24,,1.0,0.0,Leisure and Holidays,Wildlife tourism,others,Independent,No,No,No,No,No,No,No,3.0,7.0,Cash,Yes,Good service,2246636.7
4807,tour_997,SOUTH AFRICA,25-44,Friends/Relatives,1.0,1.0,Business,Beach tourism,"Travel, agent, tour operator",Independent,Yes,Yes,Yes,No,No,No,No,5.0,0.0,Credit Card,No,Friendly People,1160250.0


In [4]:
# Number of missing values in each column.
df.isnull().sum()

ID                          0
country                     0
age_group                   0
travel_with              1114
total_female                3
total_male                  5
purpose                     0
main_activity               0
info_source                 0
tour_arrangement            0
package_transport_int       0
package_accomodation        0
package_food                0
package_transport_tz        0
package_sightseeing         0
package_guided_tour         0
package_insurance           0
night_mainland              0
night_zanzibar              0
payment_mode                0
first_trip_tz               0
most_impressing           313
total_cost                  0
dtype: int64

### Column Analysis

In [5]:
# List the column labels of the raw frame.
df.columns

Index(['ID', 'country', 'age_group', 'travel_with', 'total_female',
       'total_male', 'purpose', 'main_activity', 'info_source',
       'tour_arrangement', 'package_transport_int', 'package_accomodation',
       'package_food', 'package_transport_tz', 'package_sightseeing',
       'package_guided_tour', 'package_insurance', 'night_mainland',
       'night_zanzibar', 'payment_mode', 'first_trip_tz', 'most_impressing',
       'total_cost'],
      dtype='object')

In [6]:
# Distinct country labels as they appear in the raw data.
df['country'].unique()

array(['SWIZERLAND', 'UNITED KINGDOM', 'CHINA', 'SOUTH AFRICA',
       'UNITED STATES OF AMERICA', 'NIGERIA', 'INDIA', 'BRAZIL', 'CANADA',
       'MALT', 'MOZAMBIQUE', 'RWANDA', 'AUSTRIA', 'MYANMAR', 'GERMANY',
       'KENYA', 'ALGERIA', 'IRELAND', 'DENMARK', 'SPAIN', 'FRANCE',
       'ITALY', 'EGYPT', 'QATAR', 'MALAWI', 'JAPAN', 'SWEDEN',
       'NETHERLANDS', 'UAE', 'UGANDA', 'AUSTRALIA', 'YEMEN',
       'NEW ZEALAND', 'BELGIUM', 'NORWAY', 'ZIMBABWE', 'ZAMBIA', 'CONGO',
       'BURGARIA', 'PAKISTAN', 'GREECE', 'MAURITIUS', 'DRC', 'OMAN',
       'PORTUGAL', 'KOREA', 'SWAZILAND', 'TUNISIA', 'KUWAIT', 'DOMINICA',
       'ISRAEL', 'FINLAND', 'CZECH REPUBLIC', 'UKRAIN', 'ETHIOPIA',
       'BURUNDI', 'SCOTLAND', 'RUSSIA', 'GHANA', 'NIGER', 'MALAYSIA',
       'COLOMBIA', 'LUXEMBOURG', 'NEPAL', 'POLAND', 'SINGAPORE',
       'LITHUANIA', 'HUNGARY', 'INDONESIA', 'TURKEY', 'TRINIDAD TOBACCO',
       'IRAQ', 'SLOVENIA', 'UNITED ARAB EMIRATES', 'COMORO', 'SRI LANKA',
       'IRAN', 'MONTENEGRO', 

In [7]:
# Distinct age brackets present in the data.
df['age_group'].unique()

array(['45-64', '25-44', '1-24', '65+'], dtype=object)

In [8]:
# How many rows have no travel_with value.
df['travel_with'].isna().sum()

1114

In [9]:
# Rows whose travel_with value is missing.
df[df['travel_with'].isna()]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost
1,tour_10,UNITED KINGDOM,25-44,,1.0,0.0,Leisure and Holidays,Cultural tourism,others,Independent,No,No,No,No,No,No,No,14.0,7.0,Cash,Yes,"Wonderful Country, Landscape, Nature",3214906.5
4,tour_1004,CHINA,1-24,,1.0,0.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Independent,No,No,No,No,No,No,No,7.0,4.0,Cash,Yes,No comments,1657500.0
5,tour_1005,UNITED KINGDOM,25-44,,0.0,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,No,No,No,Yes,Yes,No,9.0,3.0,Cash,Yes,Wildlife,120950.0
17,tour_1022,MYANMAR,25-44,,1.0,0.0,Meetings and Conference,Wildlife tourism,"Radio, TV, Web",Independent,No,No,No,No,No,No,No,5.0,0.0,Cash,Yes,Friendly People,331500.0
19,tour_1026,KENYA,25-44,,1.0,0.0,Business,Mountain climbing,"Friends, relatives",Independent,No,No,No,No,No,No,No,4.0,0.0,Cash,No,Friendly People,377520.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4783,tour_969,UNITED STATES OF AMERICA,25-44,,1.0,0.0,Meetings and Conference,Mountain climbing,"Radio, TV, Web",Independent,No,No,No,No,No,No,No,3.0,0.0,Credit Card,No,,497250.0
4788,tour_975,KENYA,25-44,,0.0,1.0,Meetings and Conference,Wildlife tourism,"Friends, relatives",Package Tour,Yes,Yes,Yes,No,No,No,No,6.0,0.0,Cash,No,Friendly People,1657500.0
4791,tour_979,GERMANY,25-44,,1.0,0.0,Scientific and Academic,Beach tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,47.0,7.0,Credit Card,Yes,"Wonderful Country, Landscape, Nature",4538660.0
4798,tour_985,SWEDEN,45-64,,1.0,0.0,Business,Wildlife tourism,others,Independent,No,No,No,No,No,No,No,3.0,0.0,Cash,No,No comments,1657500.0


In [10]:
# Distinct travel_with categories (NaN included).
df['travel_with'].unique()

array(['Friends/Relatives', nan, 'Alone', 'Spouse', 'Children',
       'Spouse and Children'], dtype=object)

In [11]:
# Cast both head-count columns to float and derive the party size.
# Bracket assignment is used because attribute-style assignment only works
# for columns that already exist and is easy to misread; the former
# `a = a = expr` double assignment was redundant.
df['total_female'] = df['total_female'].astype(float)
df['total_male'] = df['total_male'].astype(float)
# A missing count in either column propagates: Total_travelers is NaN for that row.
df['Total_travelers'] = df['total_female'] + df['total_male']

If the (total_female, total_male) counts are (0,1) or (1,0), the party consists of a single person, i.e. they are travelling alone; hence we set the missing travel_with values for these rows to 'Alone'.

In [12]:
# A party of exactly one person -- (total_female, total_male) equal to (0, 1)
# or (1, 0) -- is travelling alone, so fill the missing travel_with for it.
# Rows with zero recorded travellers are deliberately left untouched here.
# Write through df.loc[rows, column]: assigning via df.travel_with.loc[...] is
# chained assignment, which raises SettingWithCopyWarning and is not
# guaranteed to modify df.
single_traveller = (
    ((df['total_female'] == 0) & (df['total_male'] == 1))
    | ((df['total_female'] == 1) & (df['total_male'] == 0))
)
df.loc[df['travel_with'].isna() & single_traveller, 'travel_with'] = 'Alone'



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [13]:
# Count of rows still lacking travel_with after the 'Alone' fill.
# (Only the last expression of a cell is rendered, so the row listing that
# used to precede this line was computed and silently discarded.)
df['travel_with'].isna().sum()

29

I am now making the assumption that parties of 5 or more travellers (the threshold used in the cell below) are not a family but a group of friends or relatives.

In [14]:
# Parties of five or more with no travel_with value are labelled as a group
# of friends/relatives. A single df.loc[rows, column] write is used so the
# assignment lands on df itself rather than on a temporary Series.
df.loc[
    df['travel_with'].isna() & (df['Total_travelers'] >= 5),
    'travel_with',
] = 'Friends/Relatives'


In [15]:
# Parties made up of exactly one female and one male.
df[df['total_female'].eq(1) & df['total_male'].eq(1)]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers
0,tour_0,SWIZERLAND,45-64,Friends/Relatives,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,13.0,0.0,Cash,No,Friendly People,674602.5,2.0
3,tour_1002,UNITED KINGDOM,25-44,Spouse,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,Yes,Yes,No,11.0,0.0,Cash,Yes,Friendly People,7790250.0,2.0
7,tour_1008,UNITED STATES OF AMERICA,45-64,Friends/Relatives,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,Yes,Yes,Yes,10.0,3.0,Cash,Yes,Friendly People,3480750.0,2.0
10,tour_1012,BRAZIL,25-44,Spouse,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Radio, TV, Web",Independent,No,No,No,No,No,No,No,17.0,3.0,Cash,Yes,"Wonderful Country, Landscape, Nature",1117155.0,2.0
18,tour_1024,GERMANY,25-44,Children,1.0,1.0,Visiting Friends and Relatives,Cultural tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,3.0,0.0,Cash,Yes,Friendly People,2269330.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4800,tour_987,SINGAPORE,1-24,Friends/Relatives,1.0,1.0,Leisure and Holidays,Conference tourism,"Friends, relatives",Package Tour,Yes,Yes,No,Yes,No,Yes,No,14.0,2.0,Cash,Yes,Good service,9945000.0,2.0
4801,tour_989,ITALY,45-64,Friends/Relatives,1.0,1.0,Meetings and Conference,Mountain climbing,"Newspaper, magazines,brochures",Independent,No,No,No,No,No,No,No,5.0,0.0,Cash,Yes,"Wonderful Country, Landscape, Nature",2269330.0,2.0
4805,tour_994,UNITED STATES OF AMERICA,25-44,Spouse,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,Yes,Yes,Yes,11.0,0.0,Cash,Yes,Friendly People,10690875.0,2.0
4807,tour_997,SOUTH AFRICA,25-44,Friends/Relatives,1.0,1.0,Business,Beach tourism,"Travel, agent, tour operator",Independent,Yes,Yes,Yes,No,No,No,No,5.0,0.0,Credit Card,No,Friendly People,1160250.0,2.0


In [16]:
# Distribution of travel_with among parties of one female and one male.
# A title is set so the figure is understandable on its own.
px.histogram(
    df.loc[(df['total_female'] == 1) & (df['total_male'] == 1)],
    x='travel_with',
    title='travel_with for parties with total_female = 1 and total_male = 1',
)

As shown in the histogram above, most travellers with total_female=1 and total_male=1 are most likely husband and wife. I will therefore assume that travel_with is 'Spouse' for the (1,1) distribution.

In [17]:
# One female plus one male with no travel_with value is treated as a couple.
# df.loc[rows, column] replaces the chained df.travel_with.loc[...] write,
# which raises SettingWithCopyWarning and may not update df.
df.loc[
    df['travel_with'].isna()
    & (df['total_female'] == 1)
    & (df['total_male'] == 1),
    'travel_with',
] = 'Spouse'



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [18]:
# Rows that still have no travel_with value.
df[df['travel_with'].isna()]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers
681,tour_1837,UNITED STATES OF AMERICA,1-24,,0.0,4.0,Leisure and Holidays,Conference tourism,"Radio, TV, Web",Package Tour,No,Yes,Yes,Yes,No,Yes,No,10.0,0.0,Cash,Yes,No comments,4309500.0,4.0
706,tour_1869,UNITED STATES OF AMERICA,45-64,,0.0,0.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,No,Yes,Yes,7.0,0.0,Cash,Yes,No comments,21713250.0,0.0
1447,tour_2775,UNITED KINGDOM,45-64,,2.0,0.0,Meetings and Conference,Wildlife tourism,others,Independent,No,No,No,No,No,No,No,1.0,0.0,Credit Card,No,Friendly People,497250.0,2.0
1672,tour_3042,ITALY,45-64,,0.0,4.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,No,No,No,4.0,6.0,Cash,Yes,Excellent Experience,340399.5,4.0
2106,tour_3557,KENYA,45-64,,1.0,3.0,Meetings and Conference,Mountain climbing,"Travel, agent, tour operator",Independent,No,No,No,No,No,No,No,1.0,0.0,Cash,No,Good service,100000.0,4.0
2260,tour_375,UNITED STATES OF AMERICA,25-44,,1.0,2.0,Leisure and Holidays,Wildlife tourism,"Radio, TV, Web",Independent,No,No,No,No,No,No,No,7.0,6.0,Cash,Yes,Wildlife,10922925.0,3.0
2440,tour_3972,GERMANY,1-24,,2.0,2.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,Yes,Yes,No,5.0,0.0,Cash,Yes,Friendly People,23453625.0,4.0
2494,tour_4031,FRANCE,25-44,,0.0,,Leisure and Holidays,Conference tourism,"Radio, TV, Web",Package Tour,Yes,Yes,Yes,Yes,Yes,Yes,Yes,6.0,0.0,Cash,Yes,Excellent Experience,4309500.0,
2872,tour_450,SPAIN,45-64,,1.0,2.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,No,Yes,Yes,Yes,11.0,3.0,Cash,Yes,Excellent Experience,6961500.0,3.0
3063,tour_4715,SPAIN,1-24,,2.0,0.0,Volunteering,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,Yes,Yes,No,30.0,0.0,Cash,Yes,,8536125.0,2.0


In [19]:
# Peek at the trips whose purpose is 'Meetings and Conference'.
df[df['purpose'].eq('Meetings and Conference')].head()

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers
17,tour_1022,MYANMAR,25-44,Alone,1.0,0.0,Meetings and Conference,Wildlife tourism,"Radio, TV, Web",Independent,No,No,No,No,No,No,No,5.0,0.0,Cash,Yes,Friendly People,331500.0,1.0
39,tour_1047,MALAWI,25-44,Alone,0.0,1.0,Meetings and Conference,Beach tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,21.0,0.0,Credit Card,No,Friendly People,3000000.0,1.0
51,tour_1060,UGANDA,25-44,Spouse and Children,2.0,1.0,Meetings and Conference,Beach tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,12.0,5.0,Cash,No,Friendly People,1000000.0,3.0
55,tour_1065,UNITED STATES OF AMERICA,25-44,Alone,1.0,0.0,Meetings and Conference,Mountain climbing,"Friends, relatives",Independent,No,No,No,No,No,No,No,5.0,0.0,Credit Card,Yes,Excellent Experience,1491750.0,1.0
61,tour_1071,UGANDA,25-44,Children,1.0,1.0,Meetings and Conference,Mountain climbing,"Newspaper, magazines,brochures",Independent,No,No,No,No,No,No,No,5.0,0.0,Cash,Yes,Satisfies and Hope Come Back,700000.0,2.0


In [20]:
# Distribution of travel_with for meeting/conference trips with two or more
# travellers. A title is set so the figure is understandable on its own.
px.histogram(
    df.loc[
        (df['purpose'] == 'Meetings and Conference')
        & (df['Total_travelers'] >= 2)
    ],
    x='travel_with',
    title='travel_with for Meetings and Conference trips with 2+ travellers',
)

In [21]:
# Meeting/conference parties of two or more with no travel_with value are
# labelled 'Friends/Relatives'. Written as one df.loc[rows, column]
# assignment; the chained df.travel_with.loc[...] form raises
# SettingWithCopyWarning and may not update df.
df.loc[
    df['travel_with'].isna()
    & (df['purpose'] == 'Meetings and Conference')
    & (df['Total_travelers'] >= 2),
    'travel_with',
] = 'Friends/Relatives'



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [22]:
# Rows that still have no travel_with value.
df[df['travel_with'].isna()]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers
681,tour_1837,UNITED STATES OF AMERICA,1-24,,0.0,4.0,Leisure and Holidays,Conference tourism,"Radio, TV, Web",Package Tour,No,Yes,Yes,Yes,No,Yes,No,10.0,0.0,Cash,Yes,No comments,4309500.0,4.0
706,tour_1869,UNITED STATES OF AMERICA,45-64,,0.0,0.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,No,Yes,Yes,7.0,0.0,Cash,Yes,No comments,21713250.0,0.0
1672,tour_3042,ITALY,45-64,,0.0,4.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,No,No,No,4.0,6.0,Cash,Yes,Excellent Experience,340399.5,4.0
2260,tour_375,UNITED STATES OF AMERICA,25-44,,1.0,2.0,Leisure and Holidays,Wildlife tourism,"Radio, TV, Web",Independent,No,No,No,No,No,No,No,7.0,6.0,Cash,Yes,Wildlife,10922925.0,3.0
2440,tour_3972,GERMANY,1-24,,2.0,2.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,Yes,Yes,No,5.0,0.0,Cash,Yes,Friendly People,23453625.0,4.0
2494,tour_4031,FRANCE,25-44,,0.0,,Leisure and Holidays,Conference tourism,"Radio, TV, Web",Package Tour,Yes,Yes,Yes,Yes,Yes,Yes,Yes,6.0,0.0,Cash,Yes,Excellent Experience,4309500.0,
2872,tour_450,SPAIN,45-64,,1.0,2.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,No,Yes,Yes,Yes,11.0,3.0,Cash,Yes,Excellent Experience,6961500.0,3.0
3063,tour_4715,SPAIN,1-24,,2.0,0.0,Volunteering,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,Yes,Yes,No,30.0,0.0,Cash,Yes,,8536125.0,2.0
3065,tour_4717,UNITED STATES OF AMERICA,45-64,,2.0,1.0,Leisure and Holidays,Wildlife tourism,"Newspaper, magazines,brochures",Independent,No,No,No,No,No,No,No,12.0,0.0,Cash,Yes,Wildlife,7293000.0,3.0
3412,tour_5122,UNITED STATES OF AMERICA,45-64,,0.0,2.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,No,Yes,Yes,No,5.0,0.0,Cash,Yes,No comments,663000.0,2.0


In [23]:
# Business and volunteering trips with no travel_with value are labelled
# 'Friends/Relatives'. Written as one df.loc[rows, column] assignment; the
# chained df.travel_with.loc[...] form raises SettingWithCopyWarning and may
# not update df.
df.loc[
    df['travel_with'].isna() & df['purpose'].isin(['Business', 'Volunteering']),
    'travel_with',
] = 'Friends/Relatives'



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



I have just noticed that there are rows where the total number of travellers is 0. Let's investigate these rows a little further.

In [24]:
# Rows whose recorded party size is zero.
df[df['Total_travelers'].eq(0)]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers
170,tour_1205,UNITED STATES OF AMERICA,25-44,Spouse,0.0,0.0,Leisure and Holidays,Wildlife tourism,"Radio, TV, Web",Package Tour,No,Yes,Yes,Yes,No,Yes,No,5.0,4.0,Credit Card,No,Excellent Experience,1657500.0,0.0
705,tour_1867,HUNGARY,45-64,Spouse and Children,0.0,0.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,Yes,Yes,No,4.0,0.0,Cash,Yes,Wildlife,7646047.5,0.0
706,tour_1869,UNITED STATES OF AMERICA,45-64,,0.0,0.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,No,Yes,Yes,7.0,0.0,Cash,Yes,No comments,21713250.0,0.0
727,tour_1893,UNITED STATES OF AMERICA,45-64,Spouse and Children,0.0,0.0,Leisure and Holidays,Wildlife tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,20.0,0.0,Cash,Yes,Friendly People,19227000.0,0.0
1154,tour_2414,GERMANY,45-64,Friends/Relatives,0.0,0.0,Leisure and Holidays,Beach tourism,"Newspaper, magazines,brochures",Package Tour,Yes,Yes,Yes,Yes,No,No,No,0.0,33.0,Cash,Yes,Wildlife,67399101.0,0.0
1670,tour_3040,ZIMBABWE,1-24,Alone,0.0,0.0,Business,Beach tourism,"Newspaper, magazines,brochures",Independent,No,No,No,No,No,No,No,2.0,0.0,Cash,No,No comments,500000.0,0.0
2243,tour_3730,UNITED STATES OF AMERICA,65+,Spouse,0.0,0.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,No,No,No,14.0,0.0,Cash,Yes,Wildlife,16939650.0,0.0
2335,tour_3840,SPAIN,25-44,Alone,0.0,0.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,No,Yes,No,4.0,0.0,Cash,Yes,"Wonderful Country, Landscape, Nature",447525.0,0.0
2430,tour_396,AUSTRALIA,1-24,Alone,0.0,0.0,Leisure and Holidays,Hunting tourism,"Travel, agent, tour operator",Independent,No,No,No,No,No,No,No,15.0,5.0,Cash,Yes,Friendly People,11027030.0,0.0
2661,tour_425,SPAIN,45-64,Friends/Relatives,0.0,0.0,Leisure and Holidays,Wildlife tourism,"Friends, relatives",Package Tour,No,Yes,Yes,Yes,Yes,Yes,No,5.0,6.0,Cash,Yes,Friendly People,19755742.5,0.0


I can easily impute the counts for 'Spouse', since the distribution of total_female and total_male will be (1,1), but for the other categories that is difficult. 13 rows will not affect our model, so I will drop those rows.

In [25]:
# Head counts of the zero-sized parties that are labelled 'Spouse'.
zero_sized_spouse = df['Total_travelers'].eq(0) & df['travel_with'].eq('Spouse')
df.loc[zero_sized_spouse, ['total_female', 'total_male']]

Unnamed: 0,total_female,total_male
170,0.0,0.0
2243,0.0,0.0


In [26]:
# A 'Spouse' trip recorded with zero travellers is taken to be one female and
# one male. The mask depends only on Total_travelers and travel_with, so it
# is unaffected by the writes and can be built once.
zero_sized_spouse = (df['Total_travelers'] == 0) & (df['travel_with'] == 'Spouse')
for count_col in ('total_female', 'total_male'):
    df.loc[zero_sized_spouse, count_col] = 1

In [27]:
# Set both head counts to 1 for zero-sized 'Spouse' parties in a single
# two-column write. The assignment is idempotent: re-applying it to rows that
# already hold (1, 1) changes nothing.
zero_sized_spouse = df['Total_travelers'].eq(0) & df['travel_with'].eq('Spouse')
df.loc[zero_sized_spouse, ['total_female', 'total_male']] = 1

In [28]:
# Bring Total_travelers in line with the corrected (1, 1) head counts.
zero_sized_spouse = df['Total_travelers'].eq(0) & df['travel_with'].eq('Spouse')
df.loc[zero_sized_spouse, 'Total_travelers'] = 2

In [29]:
# Drop the remaining rows whose recorded party size is zero. Rows with a
# missing (NaN) Total_travelers are kept, because NaN != 0 evaluates to True.
# .copy() makes the result an independent frame; without it pandas treats df
# as a slice of the original and every later write emits
# SettingWithCopyWarning.
df = df[df['Total_travelers'] != 0].copy()

In [30]:
# Rows that still have no travel_with value.
df[df['travel_with'].isna()]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers
681,tour_1837,UNITED STATES OF AMERICA,1-24,,0.0,4.0,Leisure and Holidays,Conference tourism,"Radio, TV, Web",Package Tour,No,Yes,Yes,Yes,No,Yes,No,10.0,0.0,Cash,Yes,No comments,4309500.0,4.0
1672,tour_3042,ITALY,45-64,,0.0,4.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,No,No,No,4.0,6.0,Cash,Yes,Excellent Experience,340399.5,4.0
2260,tour_375,UNITED STATES OF AMERICA,25-44,,1.0,2.0,Leisure and Holidays,Wildlife tourism,"Radio, TV, Web",Independent,No,No,No,No,No,No,No,7.0,6.0,Cash,Yes,Wildlife,10922925.0,3.0
2440,tour_3972,GERMANY,1-24,,2.0,2.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,Yes,Yes,No,5.0,0.0,Cash,Yes,Friendly People,23453625.0,4.0
2494,tour_4031,FRANCE,25-44,,0.0,,Leisure and Holidays,Conference tourism,"Radio, TV, Web",Package Tour,Yes,Yes,Yes,Yes,Yes,Yes,Yes,6.0,0.0,Cash,Yes,Excellent Experience,4309500.0,
2872,tour_450,SPAIN,45-64,,1.0,2.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,No,Yes,Yes,Yes,11.0,3.0,Cash,Yes,Excellent Experience,6961500.0,3.0
3065,tour_4717,UNITED STATES OF AMERICA,45-64,,2.0,1.0,Leisure and Holidays,Wildlife tourism,"Newspaper, magazines,brochures",Independent,No,No,No,No,No,No,No,12.0,0.0,Cash,Yes,Wildlife,7293000.0,3.0
3412,tour_5122,UNITED STATES OF AMERICA,45-64,,0.0,2.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,No,Yes,Yes,No,5.0,0.0,Cash,Yes,No comments,663000.0,2.0
4694,tour_853,UNITED STATES OF AMERICA,45-64,,0.0,3.0,Leisure and Holidays,Conference tourism,"Newspaper, magazines,brochures",Package Tour,No,Yes,Yes,Yes,Yes,Yes,No,13.0,0.0,Cash,Yes,Good service,39780000.0,3.0


I'm now assuming that if total_female >= 1 and total_male >= 1, they are travelling with spouse and children.

In [31]:
# Mixed parties (at least one female and at least one male) that still lack
# a travel_with value are labelled as a family.
mixed_party = df['total_female'].ge(1) & df['total_male'].ge(1)
df.loc[df['travel_with'].isna() & mixed_party, 'travel_with'] = 'Spouse and Children'




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [32]:
# Rows that still have no travel_with value.
df[df['travel_with'].isna()]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers
681,tour_1837,UNITED STATES OF AMERICA,1-24,,0.0,4.0,Leisure and Holidays,Conference tourism,"Radio, TV, Web",Package Tour,No,Yes,Yes,Yes,No,Yes,No,10.0,0.0,Cash,Yes,No comments,4309500.0,4.0
1672,tour_3042,ITALY,45-64,,0.0,4.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,No,No,No,4.0,6.0,Cash,Yes,Excellent Experience,340399.5,4.0
2494,tour_4031,FRANCE,25-44,,0.0,,Leisure and Holidays,Conference tourism,"Radio, TV, Web",Package Tour,Yes,Yes,Yes,Yes,Yes,Yes,Yes,6.0,0.0,Cash,Yes,Excellent Experience,4309500.0,
3412,tour_5122,UNITED STATES OF AMERICA,45-64,,0.0,2.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,No,Yes,Yes,No,5.0,0.0,Cash,Yes,No comments,663000.0,2.0
4694,tour_853,UNITED STATES OF AMERICA,45-64,,0.0,3.0,Leisure and Holidays,Conference tourism,"Newspaper, magazines,brochures",Package Tour,No,Yes,Yes,Yes,Yes,Yes,No,13.0,0.0,Cash,Yes,Good service,39780000.0,3.0


In [33]:
# Drop index 2494: travel_with and total_male are both missing and
# total_female is 0, so nothing about the party can be recovered.
# Rebinding df to the result of drop() avoids mutating a filtered frame in
# place, which is what triggers SettingWithCopyWarning.
df = df.drop(index=2494)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [34]:
# Rows that still have no travel_with value.
df[df['travel_with'].isna()]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers
681,tour_1837,UNITED STATES OF AMERICA,1-24,,0.0,4.0,Leisure and Holidays,Conference tourism,"Radio, TV, Web",Package Tour,No,Yes,Yes,Yes,No,Yes,No,10.0,0.0,Cash,Yes,No comments,4309500.0,4.0
1672,tour_3042,ITALY,45-64,,0.0,4.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,No,No,No,4.0,6.0,Cash,Yes,Excellent Experience,340399.5,4.0
3412,tour_5122,UNITED STATES OF AMERICA,45-64,,0.0,2.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,No,Yes,Yes,No,5.0,0.0,Cash,Yes,No comments,663000.0,2.0
4694,tour_853,UNITED STATES OF AMERICA,45-64,,0.0,3.0,Leisure and Holidays,Conference tourism,"Newspaper, magazines,brochures",Package Tour,No,Yes,Yes,Yes,Yes,Yes,No,13.0,0.0,Cash,Yes,Good service,39780000.0,3.0


Now assuming the last group is travelling with Friends/Relatives

In [35]:
# Whatever is still unlabelled is assumed to be travelling with friends/relatives.
still_unlabelled = df['travel_with'].isna()
df.loc[still_unlabelled, 'travel_with'] = 'Friends/Relatives'



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [36]:
# travel_with should no longer contain NaN; list the categories to confirm.
df['travel_with'].unique()

array(['Friends/Relatives', 'Alone', 'Spouse', 'Children',
       'Spouse and Children'], dtype=object)

In [37]:
# Distribution of the cleaned travel_with column. A title is set so the
# figure is understandable on its own.
px.histogram(df, x='travel_with', title='Number of trips per travel_with category')

Most tourists travel alone; that's interesting.

In [38]:
# List the column labels again; Total_travelers has been added since the first listing.
df.columns

Index(['ID', 'country', 'age_group', 'travel_with', 'total_female',
       'total_male', 'purpose', 'main_activity', 'info_source',
       'tour_arrangement', 'package_transport_int', 'package_accomodation',
       'package_food', 'package_transport_tz', 'package_sightseeing',
       'package_guided_tour', 'package_insurance', 'night_mainland',
       'night_zanzibar', 'payment_mode', 'first_trip_tz', 'most_impressing',
       'total_cost', 'Total_travelers'],
      dtype='object')

In [39]:
# How many rows have no total_female value.
df['total_female'].isna().sum()

3

In [40]:
# Rows whose total_female value is missing.
df[df['total_female'].isna()]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers
490,tour_1604,FRANCE,25-44,Friends/Relatives,,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,Yes,Yes,Yes,9.0,5.0,Cash,Yes,No comments,313000.0,
545,tour_1678,ITALY,1-24,Friends/Relatives,,4.0,Volunteering,Wildlife tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,15.0,0.0,Cash,Yes,No comments,9077320.0,
1261,tour_2547,FRANCE,25-44,Friends/Relatives,,1.0,Leisure and Holidays,Wildlife tourism,"Newspaper, magazines,brochures",Package Tour,Yes,Yes,Yes,Yes,Yes,Yes,Yes,7.0,7.0,Cash,Yes,"Wonderful Country, Landscape, Nature",324300.0,


Rows 490 and 1261 have total_male=1 but travel_with 'Friends/Relatives'. I suspect that they could be travelling alone. However, to clear any doubts, let's look at the typical amount spent when someone travels alone. If their total cost falls within that typical range, we will assume they were travelling alone.

In [41]:
# Summary statistics of total_cost for solo travellers, rendered in
# fixed-point notation instead of scientific notation.
alone_cost = df.loc[df['travel_with'] == 'Alone', 'total_cost']
alone_cost.describe().map('{:f}'.format)

count        2346.000000
mean      3572909.982046
std       6750682.928440
min         49000.000000
25%        414375.000000
50%       1495875.000000
75%       4309500.000000
max      90085125.000000
Name: total_cost, dtype: object

In [42]:
# Summary statistics of total_cost for 'Friends/Relatives' trips, rendered in
# fixed-point notation instead of scientific notation.
friends_cost = df.loc[df['travel_with'] == 'Friends/Relatives', 'total_cost']
friends_cost.describe().map('{:f}'.format)

count         902.000000
mean     10108597.112255
std      14494908.196549
min         49000.000000
25%       1334899.500000
50%       4972500.000000
75%      11877105.887500
max      99532875.000000
Name: total_cost, dtype: object

In [43]:
# Distribution of travel_with among low-spend trips (total_cost of at most
# 350000). A title is set so the figure is understandable on its own.
px.histogram(
    df.loc[df['total_cost'] <= 350000],
    x='travel_with',
    title='travel_with for trips with total_cost <= 350000',
)

1. OK, so the first line of code showed me that 25% of people who travel alone spent $414 375 or less, with a minimum of $49 000.
2. The category in which the two travellers are currently recorded, 'Friends/Relatives', has a Q1 of $1 334 899, which is greater than the total spent in our 2 cases.
3. Plotting a histogram of travellers who spent $350 000 or less shows that most of them travel alone.

From all this we can reasonably deduce that rows 490 and 1261 were travelling alone and that their total_female is 0.


In [44]:
# First, relabel rows with a missing total_female and exactly one male as solo trips.
lone_male = df['total_female'].isna() & df['total_male'].eq(1)
df.loc[lone_male, 'travel_with'] = 'Alone'



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [45]:
# Second, record a party size of 1 for those same rows (total_female is
# still NaN at this point, so the mask selects them again).
lone_male = df['total_female'].isna() & df['total_male'].eq(1)
df.loc[lone_male, 'Total_travelers'] = 1

In [46]:
# Third, replace the missing total_female values with 0.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df.total_female mutates a temporary Series, which raises
# SettingWithCopyWarning and may leave df unchanged.
df['total_female'] = df['total_female'].fillna(0)
# Total_travelers was derived while total_female was still NaN, so rows that
# were just filled would otherwise keep a NaN party size. Recompute it; rows
# with a missing total_male remain NaN.
df['Total_travelers'] = df['total_female'] + df['total_male']



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [47]:
# Confirm that no row is left with a missing total_female.
df[df['total_female'].isna()]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers


In [48]:
# Rows whose total_male value is missing.
df[df['total_male'].isna()]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers
1017,tour_2246,UNITED STATES OF AMERICA,45-64,Friends/Relatives,4.0,,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,Yes,Yes,No,19.0,0.0,Cash,Yes,,7293000.0,
1473,tour_2810,SPAIN,45-64,Friends/Relatives,7.0,,Other,Hunting tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,No,No,Yes,11.0,0.0,Cash,No,"Wonderful Country, Landscape, Nature",56960183.0,
2449,tour_3980,SOUTH AFRICA,25-44,Friends/Relatives,0.0,,Meetings and Conference,Wildlife tourism,"Newspaper, magazines,brochures",Independent,No,No,No,No,No,No,No,4.0,0.0,Credit Card,Yes,Friendly People,3978000.0,
3984,tour_5838,CHINA,25-44,Friends/Relatives,10.0,,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,Yes,Yes,Yes,4.0,2.0,Cash,Yes,Wildlife,89505000.0,


In [49]:
# Drop row 2449: it has no total_male and shows a total_female of 0, so the
# size of the travelling party cannot be recovered for it.
# errors='ignore' keeps the cell idempotent: re-running it after the row is
# already gone would otherwise raise KeyError. The result is rebound instead of
# using inplace=True so the cell does not mutate the frame in place.
df = df.drop(index=2449, errors='ignore')



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [50]:
# Rows that still have a missing total_male after row 2449 was dropped.
df[df['total_male'].isnull()]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers
1017,tour_2246,UNITED STATES OF AMERICA,45-64,Friends/Relatives,4.0,,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,Yes,Yes,No,19.0,0.0,Cash,Yes,,7293000.0,
1473,tour_2810,SPAIN,45-64,Friends/Relatives,7.0,,Other,Hunting tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,No,No,Yes,11.0,0.0,Cash,No,"Wonderful Country, Landscape, Nature",56960183.0,
3984,tour_5838,CHINA,25-44,Friends/Relatives,10.0,,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,Yes,Yes,Yes,Yes,Yes,Yes,Yes,4.0,2.0,Cash,Yes,Wildlife,89505000.0,


For the remaining rows, I will assume that each group consists only of female friends/relatives with no male travellers.

In [51]:
# Fill the remaining NaN values in total_male with 0.
# The result is assigned back by column name. Calling fillna(inplace=True) on
# the attribute-accessed Series (df.total_male) is chained assignment: it acts
# on an intermediate object and does not update df under pandas Copy-on-Write.
df['total_male'] = df['total_male'].fillna(0)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [52]:
# Rows where total_male is still missing (an empty frame means none remain).
df[df['total_male'].isnull()]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers


In [53]:
# Recompute Total_travelers as females + males so the column no longer holds NaN.
df['Total_travelers'] = df['total_female'].add(df['total_male'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [54]:
# Number of rows still missing Total_travelers (0 means all good).
df['Total_travelers'].isnull().sum()

0

In [55]:
# Show the current column labels of the frame.
df.columns

Index(['ID', 'country', 'age_group', 'travel_with', 'total_female',
       'total_male', 'purpose', 'main_activity', 'info_source',
       'tour_arrangement', 'package_transport_int', 'package_accomodation',
       'package_food', 'package_transport_tz', 'package_sightseeing',
       'package_guided_tour', 'package_insurance', 'night_mainland',
       'night_zanzibar', 'payment_mode', 'first_trip_tz', 'most_impressing',
       'total_cost', 'Total_travelers'],
      dtype='object')

In [56]:
# Distinct values present in the purpose column.
df['purpose'].unique()

array(['Leisure and Holidays', 'Visiting Friends and Relatives',
       'Business', 'Meetings and Conference', 'Volunteering',
       'Scientific and Academic', 'Other'], dtype=object)

In [57]:
# Number of missing values in purpose.
df['purpose'].isnull().sum()

0

In [58]:
# Distinct values present in the main_activity column.
df['main_activity'].unique()

array(['Wildlife tourism', 'Cultural tourism', 'Mountain climbing',
       'Beach tourism', 'Conference tourism', 'Hunting tourism',
       'Bird watching', 'business', 'Diving and Sport Fishing'],
      dtype=object)

In [59]:
# Number of missing values in main_activity.
df['main_activity'].isnull().sum()

0

In [60]:
# Distinct values present in the info_source column.
df['info_source'].unique()

array(['Friends, relatives', 'others', 'Travel, agent, tour operator',
       'Radio, TV, Web', 'Tanzania Mission Abroad', 'inflight magazines',
       'Newspaper, magazines,brochures', 'Trade fair'], dtype=object)

In [61]:
# Distinct values present in the tour_arrangement column.
df['tour_arrangement'].unique()

array(['Independent', 'Package Tour'], dtype=object)

In [62]:
# Distinct values present in the package_transport_int column.
df['package_transport_int'].unique()

array(['No', 'Yes'], dtype=object)

In [63]:
# Distinct values present in the package_accomodation column.
df['package_accomodation'].unique()

array(['No', 'Yes'], dtype=object)

In [64]:
# Distinct values present in the package_guided_tour column.
df['package_guided_tour'].unique()

array(['No', 'Yes'], dtype=object)

In [65]:
# Distinct values present in the package_insurance column.
df['package_insurance'].unique()

array(['No', 'Yes'], dtype=object)

In [66]:
# First five values of night_mainland.
df['night_mainland'].head(5)

0    13.0
1    14.0
2     1.0
3    11.0
4     7.0
Name: night_mainland, dtype: float64

In [67]:
# Number of missing values in night_mainland.
df['night_mainland'].isnull().sum()

0

In [68]:
# Summary statistics for the number of nights spent on the mainland.
df['night_mainland'].describe()

count    4794.000000
mean        8.493325
std        10.439180
min         0.000000
25%         3.000000
50%         6.000000
75%        11.000000
max       145.000000
Name: night_mainland, dtype: float64

In [69]:
# Inspect the extreme stays: rows with 100 or more nights on the mainland.
df[df['night_mainland'].ge(100)]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers
71,tour_1082,UNITED KINGDOM,45-64,Alone,1.0,0.0,Visiting Friends and Relatives,Beach tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,120.0,0.0,Cash,No,"Wonderful Country, Landscape, Nature",1422525.0,1.0
1126,tour_2383,GERMANY,1-24,Friends/Relatives,3.0,0.0,Leisure and Holidays,Cultural tourism,"Newspaper, magazines,brochures",Independent,No,No,No,No,No,No,No,130.0,10.0,Cash,Yes,No comments,9945000.0,3.0
4363,tour_628,INDIA,1-24,Alone,0.0,1.0,Business,Beach tourism,Trade fair,Independent,No,No,No,No,No,No,No,145.0,0.0,Cash,No,Good service,1657500.0,1.0


In [70]:
# Box plot of night_mainland to visualise its spread and outliers.
px.box(data_frame=df, x='night_mainland')

As shown in the boxplot above, there are some outliers in the number of nights spent on the mainland. However, I think these values can be genuine, because the purpose of travel for these outliers (visiting friends or a business trip) is consistent with a long stay. Their tour arrangement is also Independent, which explains why they did not spend much even though their stay was so long. For now I will assume the data is credible.

In [71]:
# Number of missing values in night_zanzibar.
df['night_zanzibar'].isnull().sum()

0

In [72]:
# Summary statistics for the number of nights spent in Zanzibar.
df['night_zanzibar'].describe()

count    4794.000000
mean        2.297872
std         4.206208
min         0.000000
25%         0.000000
50%         0.000000
75%         4.000000
max        61.000000
Name: night_zanzibar, dtype: float64

In [73]:
# Box plot of night_zanzibar to visualise its spread and outliers.
px.box(data_frame=df, x='night_zanzibar')

In [74]:
# Distinct night_zanzibar values in ascending order.
np.sort(df['night_zanzibar'].unique())

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 19., 20., 21., 22., 23., 24., 25., 26., 27.,
       30., 31., 35., 43., 50., 60., 61.])

In [75]:
# Inspect the extreme stays: rows with 50 or more nights in Zanzibar.
df.loc[df['night_zanzibar'].ge(50)]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers
442,tour_1543,UNITED KINGDOM,1-24,Spouse,2.0,2.0,Visiting Friends and Relatives,Beach tourism,"Newspaper, magazines,brochures",Independent,No,No,No,No,No,No,No,0.0,60.0,Cash,No,Wildlife,200000.0,4.0
2218,tour_3699,FRANCE,25-44,Alone,0.0,1.0,Leisure and Holidays,Beach tourism,"Friends, relatives",Package Tour,Yes,Yes,Yes,Yes,Yes,Yes,Yes,0.0,50.0,Cash,Yes,No comments,34516509.3,1.0
2225,tour_3705,MALAYSIA,25-44,Alone,0.0,1.0,Visiting Friends and Relatives,Beach tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,0.0,61.0,Cash,Yes,Friendly People,4200000.0,1.0
3336,tour_5035,CANADA,1-24,Alone,1.0,0.0,Visiting Friends and Relatives,Beach tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,0.0,60.0,Cash,No,Friendly People,300000.0,1.0


Again, the data seems good. Most travellers with independent tour arrangements spend less, but for row 2218 the total spent was higher, which makes sense since he spent more days on the island.

In [76]:
# Missing-value count for each of the two package columns.
df.loc[:, ['package_transport_tz', 'package_sightseeing']].isnull().sum()

package_transport_tz    0
package_sightseeing     0
dtype: int64

In [77]:
# First five rows of the two package columns.
df.loc[:, ['package_transport_tz', 'package_sightseeing']].head(5)

Unnamed: 0,package_transport_tz,package_sightseeing
0,No,No
1,No,No
2,No,No
3,Yes,Yes
4,No,No


In [78]:
# Number of missing values in payment_mode.
df['payment_mode'].isnull().sum()

0

In [79]:
# Distinct values present in the payment_mode column.
df['payment_mode'].unique()

array(['Cash', 'Credit Card', 'Other', 'Travellers Cheque'], dtype=object)

In [80]:
# Number of missing values in first_trip_tz.
df['first_trip_tz'].isnull().sum()

0

In [81]:
# Distinct values present in the first_trip_tz column.
df['first_trip_tz'].unique()

array(['No', 'Yes'], dtype=object)

In [82]:
# Number of missing values in most_impressing.
df['most_impressing'].isnull().sum()

313

In [83]:
# First five values of most_impressing.
df['most_impressing'].head(5)

0                         Friendly People
1    Wonderful Country, Landscape, Nature
2                    Excellent Experience
3                         Friendly People
4                             No comments
Name: most_impressing, dtype: object

In [84]:
# Distinct values present in the most_impressing column (NaN included).
df['most_impressing'].unique()

array(['Friendly People', 'Wonderful Country, Landscape, Nature',
       'Excellent Experience', 'No comments', ' Wildlife', nan,
       'Good service', 'Satisfies and Hope Come Back'], dtype=object)

In [85]:
# Rows where most_impressing is missing.
df[df['most_impressing'].isnull()]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers
8,tour_101,NIGERIA,25-44,Alone,0.0,1.0,Leisure and Holidays,Cultural tourism,"Travel, agent, tour operator",Independent,No,No,No,No,No,No,No,4.0,0.0,Cash,Yes,,994500.0,1.0
49,tour_1058,UAE,25-44,Alone,0.0,1.0,Visiting Friends and Relatives,Beach tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,7.0,1.0,Cash,No,,1657500.0,1.0
56,tour_1066,FRANCE,65+,Spouse,1.0,1.0,Visiting Friends and Relatives,Beach tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,56.0,0.0,Cash,Yes,,2269330.0,2.0
64,tour_1074,FRANCE,25-44,Alone,0.0,1.0,Leisure and Holidays,Cultural tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,3.0,0.0,Cash,Yes,,400000.0,1.0
82,tour_1098,SOUTH AFRICA,25-44,Friends/Relatives,0.0,2.0,Meetings and Conference,Mountain climbing,"Friends, relatives",Independent,No,No,No,No,No,No,No,1.0,0.0,Cash,No,,414375.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4690,tour_849,UGANDA,1-24,Spouse,1.0,0.0,Visiting Friends and Relatives,Beach tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,5.0,0.0,Cash,No,,663000.0,1.0
4691,tour_85,GERMANY,45-64,Alone,1.0,0.0,Visiting Friends and Relatives,Beach tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,5.0,0.0,Cash,Yes,,1657500.0,1.0
4723,tour_889,NETHERLANDS,45-64,Friends/Relatives,2.0,0.0,Visiting Friends and Relatives,Wildlife tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,10.0,0.0,Credit Card,No,,2486250.0,2.0
4733,tour_902,BURUNDI,25-44,Alone,1.0,0.0,Meetings and Conference,Mountain climbing,"Friends, relatives",Package Tour,Yes,Yes,Yes,No,No,No,No,13.0,0.0,Cash,No,,5967000.0,1.0


In [86]:
# Frequency of each most_impressing category.
px.histogram(data_frame=df, x='most_impressing')

Since there is already a 'No comments' category, I will fill the NaN values with 'No comments'.

In [87]:
# Fold missing impressions into the existing 'No comments' category.
# The result is assigned back by column name. Calling fillna(inplace=True) on
# the attribute-accessed Series (df.most_impressing) is chained assignment: it
# acts on an intermediate object and does not update df under pandas
# Copy-on-Write.
# NOTE(review): the unique() output above lists ' Wildlife' with a leading
# space; consider stripping whitespace from this column in a separate step.
df['most_impressing'] = df['most_impressing'].fillna('No comments')



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [88]:
# Number of missing values left in most_impressing after the fill.
df['most_impressing'].isnull().sum()

0

In [89]:
# Number of missing values in the target column total_cost.
df['total_cost'].isnull().sum()

0

In [90]:
# Summary statistics for total_cost, rendered as fixed-point strings so the
# large values are not shown in scientific notation.
df['total_cost'].describe().map('{:f}'.format)

count        4794.000000
mean      8097148.065408
std      12206946.389901
min         49000.000000
25%        809025.750000
50%       3357182.500000
75%       9849693.750000
max      99532875.000000
Name: total_cost, dtype: object

In [91]:
# Box plot of total_cost to visualise its spread and outliers.
px.box(data_frame=df, x='total_cost')

In [92]:
# Inspect the cheapest trips: rows with a total_cost of at most 60,000.
df[df['total_cost'].le(60_000)]

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost,Total_travelers
140,tour_117,KENYA,25-44,Alone,0.0,1.0,Leisure and Holidays,Beach tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,2.0,0.0,Cash,No,Friendly People,56000.0,1.0
295,tour_1367,KENYA,25-44,Spouse and Children,1.0,1.0,Leisure and Holidays,Hunting tourism,"Travel, agent, tour operator",Independent,No,No,No,No,No,No,No,3.0,0.0,Cash,No,Friendly People,60000.0,2.0
298,tour_137,UNITED STATES OF AMERICA,65+,Spouse,1.0,1.0,Leisure and Holidays,Beach tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,3.0,0.0,Cash,Yes,Friendly People,50000.0,2.0
522,tour_1647,UNITED KINGDOM,25-44,Friends/Relatives,2.0,0.0,Leisure and Holidays,Cultural tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,2.0,4.0,Cash,Yes,Excellent Experience,52030.0,2.0
547,tour_168,INDIA,25-44,Spouse,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,No,Yes,Yes,Yes,Yes,No,No,6.0,2.0,Cash,Yes,Wildlife,53300.0,2.0
608,tour_1748,UNITED KINGDOM,1-24,Alone,1.0,0.0,Volunteering,Conference tourism,others,Independent,No,No,No,No,No,No,No,27.0,0.0,Cash,Yes,Friendly People,50000.0,1.0
1096,tour_2348,UNITED KINGDOM,1-24,Alone,1.0,0.0,Volunteering,Cultural tourism,Tanzania Mission Abroad,Independent,No,No,No,No,No,No,No,13.0,3.0,Cash,Yes,"Wonderful Country, Landscape, Nature",56901.0,1.0
1097,tour_2349,UNITED KINGDOM,1-24,Alone,1.0,0.0,Volunteering,Cultural tourism,Tanzania Mission Abroad,Independent,No,No,No,No,No,No,No,13.0,3.0,Cash,Yes,Wildlife,56901.0,1.0
1122,tour_238,UNITED KINGDOM,1-24,Friends/Relatives,1.0,1.0,Leisure and Holidays,Beach tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,0.0,7.0,Cash,Yes,Friendly People,60000.0,2.0
1472,tour_281,KENYA,25-44,Alone,1.0,0.0,Visiting Friends and Relatives,Cultural tourism,"Friends, relatives",Independent,No,No,No,No,No,No,No,30.0,0.0,Cash,No,"Wonderful Country, Landscape, Nature",50000.0,1.0


In [93]:
# Persist the cleaned frame. index=True is the to_csv default, spelled out here
# to make clear that the row index is written as the first, unnamed column.
df.to_csv('clean_data.csv', index=True)