<a href="https://colab.research.google.com/github/abhirajbhattashali/EuroSeasonVoyage/blob/main/EuroSeasonVoyage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

from google.colab import files
uploaded = files.upload()

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
!pip install chardet



In [8]:
#Determing the encoding of csv file
import chardet

with open('destinations.csv', 'rb') as file:
    result = chardet.detect(file.read())
    print(result)

{'encoding': 'MacRoman', 'confidence': 0.7219392133818022, 'language': ''}


In [9]:
#Loading the data and EDA
data = pd.read_csv('destinations.csv', encoding='MacRoman')
data.head(2)



Unnamed: 0,Destination,Region,Country,Category,Latitude,Longitude,Approximate Annual Tourists,Currency,Majority Religion,Famous Foods,Language,Best Time to Visit,Cost of Living,Safety,Cultural Significance,Description
0,Rome,Lazio,Italy,City,41.902782,12.496366,14 million,Euro,Roman Catholic,"Pizza, Pasta, Gelato",Italian,Spring (April-May) or Fall (Sept-Oct),Medium-high,"Generally safe, but watch out for pickpockets","The capital city, known for its historical lan...","A hub of ancient history and modern culture, w..."
1,Florence,Tuscany,Italy,City,43.769581,11.255772,10 million,Euro,Roman Catholic,"Pizza, Pasta, Gelato",Italian,Spring (April-May) or Fall (Sept-Oct),Medium-high,"Generally safe, but watch out for pickpockets","A Renaissance city famous for its art, archite...","Home to world-class museums, including the Uff..."


In [10]:
# Removing unecessary columns Cultural Significance , Safety and Description

data = data.drop(['Cultural Significance', 'Description',"Safety"], axis=1)
data.columns

Index(['Destination', 'Region', 'Country', 'Category', 'Latitude', 'Longitude',
       'Approximate Annual Tourists', 'Currency', 'Majority Religion',
       'Famous Foods', 'Language', 'Best Time to Visit', 'Cost of Living'],
      dtype='object')

In [11]:
#null values in dataset

print(data.isnull().sum())
data.drop_duplicates(inplace=True)



Destination                    0
Region                         0
Country                        0
Category                       0
Latitude                       0
Longitude                      0
Approximate Annual Tourists    0
Currency                       0
Majority Religion              0
Famous Foods                   0
Language                       0
Best Time to Visit             0
Cost of Living                 0
dtype: int64


In [12]:
languages = data["Language"].unique()
print(languages)
print(len(languages))

['Italian' 'Spanish' 'Spanish (and Basque)' 'French' 'French (and German)'
 'German' 'Dutch, French' 'Danish' 'French, MonÇgasque' 'Russian'
 'Russian, Tatar' 'Greek' 'Portuguese' 'Norwegian' 'Swedish'
 'German, French, Italian' 'French, German' 'German, French' 'Turkish'
 'Ukrainian' 'English' 'English, Scottish Gaelic'
 'Luxembourgish, French, German' 'Maltese, English' 'Icelandic' 'Serbian'
 'Serbian, Hungarian']
27


In [13]:
countries = data["Country"].unique()
print(countries)
print(len(countries))

['Italy' 'Spain' 'France' 'Austria' 'Belgium' 'Denmark' 'Germany' 'Monaco'
 'Russia' 'Greece' 'Portugal' 'Norway' 'Sweden' 'Switzerland' 'Turkey'
 'Ukraine' 'United Kingdom' 'Luxembourg' 'Malta' 'Iceland' 'Serbia']
21


In [14]:
destinations = data["Destination"].unique()
print(len(destinations))

208


In [15]:
# Feature Engineering
data['Currency'] = data['Currency'].replace('Euro (EUR)', 'Euro')

In [16]:
print(data["Currency"].unique())

['Euro' 'Danish krone (DKK)' 'Russian Ruble (RUB)' 'Norwegian krone (NOK)'
 'Swedish krona (SEK)' 'Swiss franc (CHF)' 'Turkish lira (TRY)'
 'Ukrainian hryvnia (UAH)' 'British Pound Sterling (GBP)'
 'Icelandic kr¢na (ISK)' 'Serbian dinar (RSD)']


In [17]:
data.shape

(209, 13)

In [18]:
#Visualizing the data
import plotly.express as px

fig = px.pie(data,names="Country",title="European Countries Distribution")
fig.show()


In [19]:
#Cost of Living Distibution by percentage
fig=px.pie(data,names="Cost of Living",title="Percentage Cost of Living Distribution")
fig.update_traces(textposition="inside",textinfo="percent+label")
fig.show()

In [22]:
# Country wise Cost of Living Distribution in Europe

import plotly.express as px

fig = px.box(data, x="Country", y="Cost of Living", title="Cost of Living Distribution by Country")
fig.show()


In [21]:
data.groupby('Cost of Living').size()

Unnamed: 0_level_0,0
Cost of Living,Unnamed: 1_level_1
Extremely high,4
Free,10
High,15
Medium,72
Medium-high,107
Varies,1


In [23]:
# Bar graph on groupby cost of living usig plotly

import plotly.graph_objects as go

cost_of_living_counts = data.groupby('Cost of Living').size().reset_index(name='Count')

fig = go.Figure(data=[go.Bar(
    x=cost_of_living_counts['Cost of Living'],
    y=cost_of_living_counts['Count'],
    marker_color=px.colors.qualitative.Plotly
)])

fig.update_layout(
    title="Cost of Living Distribution by Count",
    xaxis_title="Cost of Living",
    yaxis_title="Count"
)

fig.show()


In [24]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Destination                  209 non-null    object 
 1   Region                       209 non-null    object 
 2   Country                      209 non-null    object 
 3   Category                     209 non-null    object 
 4   Latitude                     209 non-null    float64
 5   Longitude                    209 non-null    float64
 6   Approximate Annual Tourists  209 non-null    object 
 7   Currency                     209 non-null    object 
 8   Majority Religion            209 non-null    object 
 9   Famous Foods                 209 non-null    object 
 10  Language                     209 non-null    object 
 11  Best Time to Visit           209 non-null    object 
 12  Cost of Living               209 non-null    object 
dtypes: float64(2), objec

In [25]:
data["Best Time to Visit"].unique()

array(['Spring (April-May) or Fall (Sept-Oct)', 'Summer (June-September)',
       'Winter (Dec-Mar) for skiing, Summer (Jun-Sept)',
       'Summer (Jun-Sept)', 'Summer (June-August)', 'Year-round',
       'Spring (May-June) or Fall (Sept-Oct)',
       'Winter (Dec-Mar) for Northern Lights, Summer (Jun-Aug) for hiking',
       'Summer (Jun-Aug)', 'Spring (Apr-May) or Fall (Sep-Oct)',
       'Winter (Dec-Mar) for skiing, Summer (Jun-Sep) for hiking',
       'Spring (May-June) or Fall (Sep-Oct)',
       'Spring (April-May) or Fall (Sep-Oct)',
       'Winter (December-March) for skiing, Summer (June-August) for hiking'],
      dtype=object)

In [26]:
data.head()

Unnamed: 0,Destination,Region,Country,Category,Latitude,Longitude,Approximate Annual Tourists,Currency,Majority Religion,Famous Foods,Language,Best Time to Visit,Cost of Living
0,Rome,Lazio,Italy,City,41.902782,12.496366,14 million,Euro,Roman Catholic,"Pizza, Pasta, Gelato",Italian,Spring (April-May) or Fall (Sept-Oct),Medium-high
1,Florence,Tuscany,Italy,City,43.769581,11.255772,10 million,Euro,Roman Catholic,"Pizza, Pasta, Gelato",Italian,Spring (April-May) or Fall (Sept-Oct),Medium-high
2,Venice,Veneto,Italy,City,45.435559,12.336196,10 million,Euro,Roman Catholic,"Pizza, Pasta, Gelato",Italian,Spring (April-May) or Fall (Sept-Oct),Medium-high
3,Milan,Lombardy,Italy,City,45.464643,9.18854,7 million,Euro,Roman Catholic,"Risotto, Ossobuco, Panettone",Italian,Spring (April-May) or Fall (Sept-Oct),High
4,Naples,Campania,Italy,City,40.85133,14.25472,5 million,Euro,Roman Catholic,"Pizza, Pasta, Cannoli",Italian,Spring (April-May) or Fall (Sept-Oct),Medium


In [27]:
print(data.columns
      )

Index(['Destination', 'Region', 'Country', 'Category', 'Latitude', 'Longitude',
       'Approximate Annual Tourists', 'Currency', 'Majority Religion',
       'Famous Foods', 'Language', 'Best Time to Visit', 'Cost of Living'],
      dtype='object')


In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Destination                  209 non-null    object 
 1   Region                       209 non-null    object 
 2   Country                      209 non-null    object 
 3   Category                     209 non-null    object 
 4   Latitude                     209 non-null    float64
 5   Longitude                    209 non-null    float64
 6   Approximate Annual Tourists  209 non-null    object 
 7   Currency                     209 non-null    object 
 8   Majority Religion            209 non-null    object 
 9   Famous Foods                 209 non-null    object 
 10  Language                     209 non-null    object 
 11  Best Time to Visit           209 non-null    object 
 12  Cost of Living               209 non-null    object 
dtypes: float64(2), objec

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from tensorflow import keras
from tensorflow.keras import layers


data_copy = data.copy()
data_copy.fillna('Unknown', inplace=True)

def extract_seasons(best_time):
    best_time = best_time.lower()
    seasons = []
    if 'spring' in best_time:
        seasons.append('Spring')
    if 'summer' in best_time:
        seasons.append('Summer')
    if 'fall' in best_time or 'autumn' in best_time:
        seasons.append('Fall')
    if 'winter' in best_time:
        seasons.append('Winter')
    if 'year-round' in best_time:
        seasons.append('Year-round')
    return seasons if seasons else ['Unknown']


data_copy['Seasons'] = data_copy['Best Time to Visit'].apply(extract_seasons)
data_copy = data_copy.explode('Seasons')
data_copy.drop(columns=['Best Time to Visit'], inplace=True)
print(data_copy['Seasons'].unique())
print(data_copy.head(3))

['Spring' 'Fall' 'Summer' 'Winter' 'Year-round']
  Destination   Region Country Category   Latitude  Longitude  \
0        Rome    Lazio   Italy     City  41.902782  12.496366   
0        Rome    Lazio   Italy     City  41.902782  12.496366   
1    Florence  Tuscany   Italy     City  43.769581  11.255772   

  Approximate Annual Tourists Currency Majority Religion  \
0                  14 million     Euro    Roman Catholic   
0                  14 million     Euro    Roman Catholic   
1                  10 million     Euro    Roman Catholic   

           Famous Foods Language Cost of Living Seasons  
0  Pizza, Pasta, Gelato  Italian    Medium-high  Spring  
0  Pizza, Pasta, Gelato  Italian    Medium-high    Fall  
1  Pizza, Pasta, Gelato  Italian    Medium-high  Spring  


In [34]:
categorical_cols = ['Region', 'Currency', 'Majority Religion', 'Famous Foods', 'Language']
data_copy = pd.get_dummies(data_copy, columns=categorical_cols, drop_first=True)


le = LabelEncoder()
data_copy['Cost of Living'] = le.fit_transform(data_copy['Cost of Living'])

X = data_copy.drop(columns=['Seasons', 'Approximate Annual Tourists', 'Destination'])
y = data_copy['Country']

X = X.apply(pd.to_numeric, errors='coerce')


if X.isnull().values.any():
    print("NaN values found in features after conversion. Filling NaNs with 0.")
    X.fillna(0, inplace=True)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


numeric_cols = ['Latitude', 'Longitude', 'Cost of Living']
ct = ColumnTransformer([("numeric", StandardScaler(), numeric_cols)], remainder='passthrough')

# Feature scaling
X_train = ct.fit_transform(X_train)
X_train = X_train.astype(np.float32)
X_test = ct.transform(X_test)
X_test = X_test.astype(np.float32)

# One-hot encode the target variable (Countries)
encoder = OneHotEncoder(sparse_output=False)
y_train_encoded = encoder.fit_transform(y_train.values.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))

# Model Architechture
model = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(y_train_encoded.shape[1], activation='softmax')  # Number of classes in output layer
])

# Compilation
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Training
model.fit(X_train, y_train_encoded, epochs=30, batch_size=32, validation_split=0.2)


# Evaluation
loss, accuracy = model.evaluate(X_test, y_test_encoded)
print("Test Accuracy:", accuracy)



y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
predicted_labels = encoder.inverse_transform(y_pred_prob)

print("Classification Report:")
print(classification_report(y_test, predicted_labels.flatten(),zero_division=0))


NaN values found in features after conversion. Filling NaNs with 0.
Epoch 1/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step - accuracy: 0.1110 - loss: 2.9938 - val_accuracy: 0.2097 - val_loss: 2.9160
Epoch 2/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.4905 - loss: 2.8013 - val_accuracy: 0.6129 - val_loss: 2.7549
Epoch 3/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7652 - loss: 2.6118 - val_accuracy: 0.6774 - val_loss: 2.5643
Epoch 4/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8323 - loss: 2.3590 - val_accuracy: 0.7097 - val_loss: 2.3447
Epoch 5/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9035 - loss: 2.0810 - val_accuracy: 0.7581 - val_loss: 2.0875
Epoch 6/30
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9208 - loss: 1.7739 - val_accuracy: 0.82

In [46]:
import pandas as pd
import numpy as np
def predict_best_countries(season):
  season_data = data_copy[data_copy['Seasons'] == season]
  if season_data.empty:
    return "No destinations found for the specified season"


  X_season = season_data.drop(columns=['Seasons', 'Approximate Annual Tourists', 'Destination'])
  X_season = X_season.apply(pd.to_numeric, errors='coerce')
  X_season.fillna(0, inplace=True)
  X_season = ct.transform(X_season)
  X_season = X_season.astype(np.float32)

  # Prediction
  y_pred_prob_season = model.predict(X_season,verbose=0)
  y_pred_season = np.argmax(y_pred_prob_season, axis=1)

  # Getting the predicted class indices
  y_pred_season = np.argmax(y_pred_prob_season, axis=1)

  # Converting predictions back to original labels
  predicted_labels_season = encoder.inverse_transform(y_pred_prob_season)
  return predicted_labels_season.flatten()


season_to_predict = "Summer"
predicted_countries = predict_best_countries(season_to_predict)
countries = pd.Series(predicted_countries).unique()

print(f"Predicted {len(countries)} countries to visit during {season_to_predict}:")
for country in countries:
  print(country)


Predicted 13 countries to visit during Summer:
Spain
France
Austria
Denmark
Monaco
Russia
Norway
Sweden
Switzerland
Ukraine
Malta
Iceland
Serbia
