# Tourism Data Cleaning and Analysis

This notebook contains functions for cleaning and analyzing tourism data from Tanzania. The dataset includes information about tourists, their demographics, travel arrangements, and expenditures.

## Variable Definitions
- **id**: Unique identifier for each tourist
- **country**: The country the tourist is coming from
- **age_group**: The age group of a tourist
- **travel_with**: The relationship between the tourist and the people they travelled with to Tanzania
- **total_female/total_male**: Total number of females/males
- **purpose**: The purpose of visiting Tanzania
- **main_activity**: The main activity of tourism in Tanzania
- **night_mainland/night_zanzibar**: Number of nights spent in each location
- **total_cost**: The total tourist expenditure in Tanzanian shillings (TZS)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Plot styling applied once at import time: matplotlib's 'default' style
# and seaborn's "husl" colour palette.
plt.style.use('default')
sns.set_palette("husl")

def load_tourism_data(data_dir=None):
    """Load the tourism dataset files.

    Reads ``Train.csv``, ``Test.csv`` and ``SampleSubmission.csv`` from a
    single directory and prints the shape of each loaded frame.

    Args:
        data_dir: Directory (str or path-like) containing the three CSV
            files. When omitted, the original hard-coded download folder is
            used, so existing zero-argument calls keep working.

    Returns:
        A ``(train_df, test_df, sample_submission)`` tuple of DataFrames, or
        ``(None, None, None)`` when any of the three files is not found.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from pathlib import Path

    if data_dir is None:
        data_dir = r'c:\Users\user\Downloads\college_stuff\data analytics'
    base = Path(data_dir)

    try:
        train_df = pd.read_csv(base / 'Train.csv')
        test_df = pd.read_csv(base / 'Test.csv')
        sample_submission = pd.read_csv(base / 'SampleSubmission.csv')
    except FileNotFoundError as e:
        # All-or-nothing: a single missing file yields three Nones so that
        # callers can test any one of the results against None.
        print(f"Error loading files: {e}")
        return None, None, None

    print("Data loaded successfully!")
    print(f"Train data shape: {train_df.shape}")
    print(f"Test data shape: {test_df.shape}")
    print(f"Sample submission shape: {sample_submission.shape}")

    return train_df, test_df, sample_submission

def explore_dataset(df, dataset_name="Dataset"):
    """Print a structural overview of ``df`` and return its summary statistics.

    The overview covers shape, column names, dtypes, per-column missing
    counts and the number of fully duplicated rows.

    Args:
        df: DataFrame to inspect.
        dataset_name: Label used in the printed section header.

    Returns:
        The result of ``df.describe()``.
    """
    print(f"\n=== {dataset_name} Exploration ===")
    print(f"Shape: {df.shape}")

    column_names = list(df.columns)
    print(f"\nColumns: {column_names}")

    column_types = df.dtypes
    print(f"\nData Types:\n{column_types}")

    null_counts = df.isnull().sum()
    print(f"\nMissing Values:\n{null_counts}")

    duplicate_count = df.duplicated().sum()
    print(f"\nDuplicate Rows: {duplicate_count}")

    summary = df.describe()
    return summary

## Load the three datasets once, then print an overview of each one that loaded

train_df, test_df, sample_submission = load_tourism_data()

for _frame, _label in (
    (train_df, "Train Dataset"),
    (test_df, "Test Dataset"),
    (sample_submission, "Sample Submission Dataset"),
):
    # The loader returns None for every frame when a file is missing.
    if _frame is not None:
        explore_dataset(_frame, _label)



Data loaded successfully!
Train data shape: (4809, 23)
Test data shape: (1601, 22)
Sample submission shape: (1601, 2)

=== Train Dataset Exploration ===
Shape: (4809, 23)

Columns: ['ID', 'country', 'age_group', 'travel_with', 'total_female', 'total_male', 'purpose', 'main_activity', 'info_source', 'tour_arrangement', 'package_transport_int', 'package_accomodation', 'package_food', 'package_transport_tz', 'package_sightseeing', 'package_guided_tour', 'package_insurance', 'night_mainland', 'night_zanzibar', 'payment_mode', 'first_trip_tz', 'most_impressing', 'total_cost']

Data Types:
ID                        object
country                   object
age_group                 object
travel_with               object
total_female             float64
total_male               float64
purpose                   object
main_activity             object
info_source               object
tour_arrangement          object
package_transport_int     object
package_accomodation      object
package_food 

In [None]:
def clean_tourism_data(df):
    """Clean a tourism DataFrame and return the cleaned copy.

    Steps, in order:
      1. Drop rows whose ``total_cost`` is missing (only if that column exists).
      2. Fill missing values in object-dtype columns with the column mode
         (``'Unknown'`` when the column has no mode, i.e. is entirely missing).
      3. Fill missing values in numeric columns with the column median.
      4. Drop fully duplicated rows.

    Progress is printed along the way. The input frame is not modified.

    Args:
        df: Raw tourism DataFrame.

    Returns:
        A new, cleaned DataFrame.
    """
    df_clean = df.copy()

    # Display initial info
    print("=== Initial Data Info ===")
    print(f"Shape before cleaning: {df_clean.shape}")
    print(f"Missing values:\n{df_clean.isnull().sum()}")

    # Remove rows with missing target variable (total_cost) if it exists
    if 'total_cost' in df_clean.columns:
        initial_rows = len(df_clean)
        df_clean = df_clean.dropna(subset=['total_cost'])
        print(f"Removed {initial_rows - len(df_clean)} rows with missing total_cost")

    # Handle missing values in categorical columns
    categorical_cols = df_clean.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if df_clean[col].isnull().any():
            modes = df_clean[col].mode()
            mode_value = modes.iloc[0] if not modes.empty else 'Unknown'
            # Assign the result back instead of `fillna(..., inplace=True)` on
            # the column selection: the chained in-place form operates on a
            # temporary under pandas Copy-on-Write and leaves df_clean unchanged.
            df_clean[col] = df_clean[col].fillna(mode_value)
            print(f"Filled missing values in {col} with mode: {mode_value}")

    # Handle missing values in numerical columns
    numerical_cols = df_clean.select_dtypes(include=[np.number]).columns
    for col in numerical_cols:
        if df_clean[col].isnull().any():
            median_value = df_clean[col].median()
            df_clean[col] = df_clean[col].fillna(median_value)
            print(f"Filled missing values in {col} with median: {median_value}")

    # Remove duplicates
    initial_rows = len(df_clean)
    df_clean = df_clean.drop_duplicates()
    print(f"Removed {initial_rows - len(df_clean)} duplicate rows")

    print(f"\nShape after cleaning: {df_clean.shape}")
    return df_clean

def standardize_tourism_data(df):
    """Standardize text columns in tourism data.

    For each known free-text column that is present, values are converted to
    strings, stripped of surrounding whitespace and lower-cased. Missing
    values stay missing rather than becoming the literal string ``'nan'``.
    Columns not in the list (e.g. ``age_group``) are left untouched, and the
    input frame is not modified.

    Args:
        df: Tourism DataFrame.

    Returns:
        A copy of ``df`` with the text columns normalized.
    """
    df_std = df.copy()

    # Names must match the dataset's actual headers ('info_source',
    # 'tour_arrangement'); misspelled names are silently skipped by the
    # membership check below.
    text_columns = ['country', 'travel_with', 'purpose', 'main_activity',
                    'info_source', 'tour_arrangement', 'payment_mode', 'most_impressing']

    for col in text_columns:
        if col not in df_std.columns:
            continue
        present = df_std[col].notna()
        normalized = df_std[col].astype(str).str.strip().str.lower()
        # astype(str) renders missing values as text; mask them back to missing.
        df_std[col] = normalized.where(present)

    return df_std

In [None]:
def plot_missing_values(df, title="Missing Values Analysis"):
    """Visualize missing values in the dataset.

    Draws two side-by-side bar charts for the columns that contain at least
    one missing value: absolute counts on the left, percentage of rows on the
    right. When no column has missing values a message is printed and no
    figure is created.

    Args:
        df: DataFrame to inspect.
        title: Figure-level title.

    Returns:
        None.
    """
    missing_data = df.isnull().sum()
    missing_counts = missing_data[missing_data > 0]

    # A bar plot of an empty Series raises, so bail out before creating a figure.
    if missing_counts.empty:
        print("No missing values found")
        return

    missing_percent = (missing_counts / len(df)) * 100

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Missing values count
    missing_counts.plot(kind='bar', ax=ax1)
    ax1.set_title('Missing Values Count')
    ax1.set_ylabel('Count')
    ax1.tick_params(axis='x', rotation=45)

    # Missing values percentage
    missing_percent.plot(kind='bar', ax=ax2, color='orange')
    ax2.set_title('Missing Values Percentage')
    ax2.set_ylabel('Percentage (%)')
    ax2.tick_params(axis='x', rotation=45)

    plt.suptitle(title)
    plt.tight_layout()
    plt.show()

def plot_categorical_distribution(df, columns, max_categories=10):
    """Draw one bar chart of value counts per requested column.

    Only names from ``columns`` that exist in ``df`` are plotted, laid out on
    a grid at most three charts wide. Each chart shows the ``max_categories``
    most frequent values. Prints a message and returns when none of the
    requested columns is present.

    Args:
        df: Source DataFrame.
        columns: Iterable of column names to plot.
        max_categories: Maximum number of bars per chart.

    Returns:
        None.
    """
    present = [name for name in columns if name in df.columns]
    if len(present) == 0:
        print("No categorical columns found")
        return

    grid_width = min(3, len(present))
    grid_height = (len(present) + grid_width - 1) // grid_width  # ceiling division

    # squeeze=False always yields a 2-D array of Axes, so one flatten() covers
    # the single-chart, single-row and multi-row layouts alike.
    fig, grid = plt.subplots(grid_height, grid_width,
                             figsize=(15, 5*grid_height), squeeze=False)
    panels = grid.flatten()

    for panel, name in zip(panels, present):
        top_counts = df[name].value_counts().head(max_categories)
        top_counts.plot(kind='bar', ax=panel)
        panel.set_title(f'Distribution of {name}')
        panel.tick_params(axis='x', rotation=45)

    # Grid cells beyond the last chart stay blank.
    for spare in panels[len(present):]:
        spare.set_visible(False)

    plt.tight_layout()
    plt.show()

def plot_numerical_distribution(df, columns):
    """Plot a 30-bin histogram for each requested numeric column.

    A column is plotted when it exists in ``df`` and its dtype kind is signed
    integer, unsigned integer or float. This covers every integer/float width
    rather than only ``int64``/``float64``; boolean, datetime and object
    columns are skipped. Charts are laid out on a grid at most three wide.
    Prints a message and returns when no requested column qualifies.

    Args:
        df: Source DataFrame.
        columns: Iterable of column names to plot.

    Returns:
        None.
    """
    numeric_kinds = ('i', 'u', 'f')
    numerical_cols = [
        col for col in columns
        if col in df.columns
        and getattr(df[col].dtype, 'kind', 'O') in numeric_kinds
    ]

    if not numerical_cols:
        print("No numerical columns found")
        return

    n_cols = min(3, len(numerical_cols))
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols

    # squeeze=False always returns a 2-D Axes array, so a single flatten()
    # handles every grid shape.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, numerical_cols):
        df[col].hist(bins=30, ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Hide empty subplots
    for ax in axes[len(numerical_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
def create_interactive_plots(df):
    """Show up to three Plotly figures, depending on which columns exist.

    - ``country``: horizontal bar chart of the 15 most frequent countries.
    - ``age_group`` and ``total_cost``: box plot of cost per age group.
    - ``purpose``: pie chart of visit purposes.

    Each figure is skipped when its required column(s) are absent.

    Args:
        df: Tourism DataFrame.

    Returns:
        None.
    """
    available = df.columns

    # Country distribution
    if 'country' in available:
        top_countries = df['country'].value_counts().head(15)
        country_fig = px.bar(
            x=top_countries.values,
            y=top_countries.index,
            orientation='h',
            title='Top 15 Countries by Tourist Count',
        )
        country_fig.show()

    # Age group vs Total cost
    if 'age_group' in available and 'total_cost' in available:
        cost_fig = px.box(
            df,
            x='age_group',
            y='total_cost',
            title='Total Cost Distribution by Age Group',
        )
        cost_fig.show()

    # Purpose distribution
    if 'purpose' in available:
        purpose_share = df['purpose'].value_counts()
        purpose_fig = px.pie(
            values=purpose_share.values,
            names=purpose_share.index,
            title='Purpose of Visit Distribution',
        )
        purpose_fig.show()

def correlation_analysis(df):
    """Compute and plot the correlation matrix of the numeric columns.

    Args:
        df: DataFrame to analyse.

    Returns:
        The correlation matrix (also shown as an annotated heatmap), or
        ``None`` when ``df`` has fewer than two numeric columns.
    """
    numeric_names = df.select_dtypes(include=[np.number]).columns

    # A correlation needs at least a pair of numeric columns.
    if len(numeric_names) <= 1:
        print("Not enough numerical columns for correlation analysis")
        return None

    corr = df[numeric_names].corr()

    plt.figure(figsize=(12, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix of Numerical Variables')
    plt.tight_layout()
    plt.show()

    return corr

In [None]:
# Main workflow - Execute this cell to run the complete analysis
def run_complete_analysis():
    """Run the full pipeline: load, explore, clean, visualize, correlate.

    Only the training data is explored, cleaned, standardized and plotted;
    the test frame and the sample submission are returned exactly as loaded.

    Returns:
        ``(train_clean, test_df, sample_submission)`` on success, or
        ``(None, None, None)`` when the data files could not be loaded.
    """

    def banner(heading):
        # Section header: a blank line, then the heading between two rules.
        print("\n" + "="*50)
        print(heading)
        print("="*50)

    print("🚀 Starting Tourism Data Analysis...")

    # Load data
    train_df, test_df, sample_submission = load_tourism_data()

    if train_df is None:
        print("❌ Could not load data files. Please check file paths.")
        return None, None, None

    # Explore the raw training data and chart its missing values
    banner("📊 EXPLORING TRAINING DATA")
    explore_dataset(train_df, "Training Data")
    plot_missing_values(train_df, "Training Data - Missing Values")

    # Clean, then normalize the text columns
    banner("🧹 CLEANING DATA")
    train_clean = standardize_tourism_data(clean_tourism_data(train_df))

    # Static distribution plots
    banner("📈 DATA VISUALIZATION")
    plot_categorical_distribution(
        train_clean,
        ['country', 'age_group', 'purpose', 'main_activity', 'travel_with'],
    )
    plot_numerical_distribution(
        train_clean,
        ['total_female', 'total_male', 'night_mainland', 'night_zanzibar', 'total_cost'],
    )

    # Interactive plots
    print("Creating interactive plots...")
    create_interactive_plots(train_clean)

    # Correlation analysis
    banner("🔗 CORRELATION ANALYSIS")
    correlation_analysis(train_clean)

    print("\n✅ Analysis Complete!")
    return train_clean, test_df, sample_submission

# Uncomment the line below to run the complete analysis
# train_clean, test_df, sample_submission = run_complete_analysis()