In [20]:
import ast
import os

import numpy as np
import pandas as pd

In [21]:
# Directory names used by the notebook. Every later cell in this notebook
# lists `clean_path` and reads each `.csv` file found there; several cells
# also write their result back to the same file.
# NOTE(review): `base_path` is not referenced by any cell visible here --
# presumably the raw (unfiltered) dataset folder; confirm before removing.
base_path = 'TA_DS'
clean_path = 'TA_DS_clean'

In [3]:
# Write a per-file report to summary.txt: for each cleaned CSV, how many rows
# lie strictly above and strictly below the first quartile (25th percentile)
# of `rating` and of `user_ratings_total`.  Rows exactly equal to the quartile
# value are counted in neither bucket.
output_path = 'summary.txt'
with open(output_path, 'w') as f:
    for file_name in os.listdir(clean_path):
        # Only CSV files are summarised; anything else in the folder is skipped.
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(clean_path, file_name)
        df = pd.read_csv(file_path)

        rating_col = df['rating']
        votes_col = df['user_ratings_total']

        q1_rating = rating_col.quantile(0.25)
        q1_user_ratings_total = votes_col.quantile(0.25)

        above_q1_rating = rating_col.gt(q1_rating).sum()
        below_q1_rating = rating_col.lt(q1_rating).sum()

        above_q1_user_ratings_total = votes_col.gt(q1_user_ratings_total).sum()
        below_q1_user_ratings_total = votes_col.lt(q1_user_ratings_total).sum()

        # One write per file; the trailing blank line separates file sections.
        f.write(
            f"Processing {file_path}\n"
            f"Data points above first quartile rating: {above_q1_rating}\n"
            f"Data points below first quartile rating: {below_q1_rating}\n"
            f"Data points above first quartile user ratings total: {above_q1_user_ratings_total}\n"
            f"Data points below first quartile user ratings total: {below_q1_user_ratings_total}\n\n"
        )


In [4]:
# Drop every row that falls below the first quartile of `rating` OR of
# `user_ratings_total`, log the before/after row counts to count.txt, and
# write the surviving rows back over the source CSV.
#
# NOTE: the quartiles are recomputed from whatever is currently on disk and
# the file is overwritten in place, so running this cell a second time
# filters the already-filtered data again.
with open('count.txt', 'w') as count_file:
    for file_name in os.listdir(clean_path):
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(clean_path, file_name)
        df = pd.read_csv(file_path)

        # Row count before any filtering.
        initial_count = len(df)

        # Per-file first-quartile thresholds.
        q1_rating = df['rating'].quantile(0.25)
        q1_user_ratings_total = df['user_ratings_total'].quantile(0.25)

        # A row survives only if it meets both thresholds (inclusive).
        rating_ok = df['rating'].ge(q1_rating)
        votes_ok = df['user_ratings_total'].ge(q1_user_ratings_total)
        filtered_df = df.loc[rating_ok & votes_ok]
        filtered_count = len(filtered_df)

        count_file.write(f"{file_name}: Before filtering = {initial_count}, After filtering = {filtered_count}\n")

        # Replace the file on disk with the filtered rows.
        filtered_df.to_csv(file_path, index=False)

    print("Processed all files and saved counts in count.txt")

Processed all files and saved counts in count.txt


In [5]:
# Remove rows whose `user_ratings_total` equals 0, log the before/after row
# counts to count2.txt, and write the remaining rows back over the source CSV.
with open('count2.txt', 'w') as count_file:
    for file_name in os.listdir(clean_path):
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(clean_path, file_name)
        df = pd.read_csv(file_path)

        # Row count before filtering.
        initial_count = len(df)

        # Keep every row whose rating count is anything other than 0.
        has_ratings = df['user_ratings_total'].ne(0)
        filtered_df = df.loc[has_ratings]
        filtered_count = len(filtered_df)

        count_file.write(f"{file_name}: Before filtering = {initial_count}, After filtering = {filtered_count}\n")

        # Replace the file on disk with the filtered rows.
        filtered_df.to_csv(file_path, index=False)

    print("Processed all files and saved counts in count2.txt")

Processed all files and saved counts in count2.txt


In [9]:
# Category flag columns added to every cleaned CSV, each initialised to 0.
# NOTE: a column that already exists is overwritten with 0, so running this
# cell after the categorization cell resets all category flags.
new_columns = [
    'history_and_cultural',
    'nature_and_adventure',
    'entertainment_and_leisure',
    'spiritual_and_architecture',
    'wildlife_and_conservation',
]

for file_name in os.listdir(clean_path):
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(clean_path, file_name)
    # Add (or reset) all flag columns in one step, then save in place.
    df = pd.read_csv(file_path).assign(**dict.fromkeys(new_columns, 0))
    df.to_csv(file_path, index=False)

In [23]:
# Keyword rules, one entry per category column:
#     column -> (regex searched in the lower-cased `name`, place types looked
#                up in the parsed `types` list)
# A row is flagged 1 for a category when EITHER rule matches; rows that match
# neither keep whatever value the column already holds.
# The correct spelling 'monastery' is added next to the original misspelling
# 'monestary', which is kept so nothing that matched before stops matching.
CATEGORY_RULES = {
    'history_and_cultural': (
        'museum|monumen|patung|statue|benteng|fort',
        ('museum', 'monument', 'statue'),
    ),
    'entertainment_and_leisure': (
        'wisata|park|taman',
        ('amusement_park', 'aquarium', 'park', 'stadium'),
    ),
    'nature_and_adventure': (
        'pulau|pantai|gunung|bukit|danau|island|beach|mountain|hill|lake',
        ('beach', 'island', 'hills', 'mountain', 'lake'),
    ),
    'spiritual_and_architecture': (
        'masjid|mosque|gereja|church|katedral|cathedral|vihara|monastery|monestary|pura|temple|shrine|kelenteng',
        ('mosque', 'church', 'cathedral', 'monastery', 'monestary', 'temple', 'shrine', 'place_of_worship'),
    ),
    'wildlife_and_conservation': (
        'kebun binatang|aquarium|kebun|taman nasional',
        ('zoo', 'aquarium', 'protected_area'),
    ),
}


def _parse_types(raw):
    """Convert one `types` cell (text of a Python list literal) into a list.

    `ast.literal_eval` is used instead of `eval`: it only accepts literals,
    so text read from a CSV file can never execute code.  A missing cell
    (NaN after `read_csv`) is treated as "no types" rather than raising.

    Raises:
        ValueError / SyntaxError: if the cell text is not a valid literal.
    """
    if not isinstance(raw, str):
        return []
    return ast.literal_eval(raw)


def _categorize(df):
    """Return a copy of `df` with the category flag columns updated.

    Args:
        df: frame holding `name`, `types` and every column named in
            CATEGORY_RULES.

    Returns:
        A new DataFrame; matching rows have the category column set to 1,
        all other values are left as they were.

    Raises:
        KeyError: if `name`, `types` or a category column is missing.
    """
    out = df.copy()
    names_lower = out['name'].str.lower()
    # Parse the `types` text once per file instead of once per category.
    parsed_types = [_parse_types(raw) for raw in out['types']]

    for column, (name_pattern, place_types) in CATEGORY_RULES.items():
        # na=False: a missing name must count as "no match".  Without it the
        # result holds NaN, which is truthy and would flag the row.
        name_hit = names_lower.str.contains(name_pattern, na=False).astype(bool)
        type_hit = pd.Series(
            [any(item in place_types for item in types) for types in parsed_types],
            index=out.index,
            dtype=bool,
        )
        # Only ever raise a flag to 1; never clear an existing value.
        out[column] = out[column].mask(name_hit | type_hit, 1)

    return out


for file_name in os.listdir(clean_path):
    if file_name.endswith('.csv'):
        file_path = os.path.join(clean_path, file_name)
        df = _categorize(pd.read_csv(file_path))
        df.to_csv(file_path, index=False)
print('Done categorizing')

Historical and Cultural Sites - history_and_cultural ++
Natural Attractions and Adventure - nature_and_adventure ++
Entertainment and Leisure - entertainment_and_leisure ++
Spiritual and Architectural Sites - spiritual_and_architecture ++
Wildlife and Conservation Areas - wildlife_and_conservation ++

In [24]:
# Count, per file and overall, the rows whose five category flags all still
# hold the initial value 0 (i.e. the keyword rules matched nothing).
total_uncategorized = 0
total_rows = 0

# Value every category column is assumed to start with.
initial_value = 0
category_columns = [
    'history_and_cultural',
    'entertainment_and_leisure',
    'nature_and_adventure',
    'spiritual_and_architecture',
    'wildlife_and_conservation',
]

for file_name in os.listdir(clean_path):
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(clean_path, file_name)
    df = pd.read_csv(file_path)

    # A row is uncategorized when every flag equals the initial value.
    still_initial = df[category_columns].eq(initial_value).all(axis=1)
    uncategorized_count = int(still_initial.sum())
    total_count = len(df)

    print(f"{file_name}: {uncategorized_count}/{total_count} rows remain uncategorized")

    # Running totals across all files.
    total_uncategorized += uncategorized_count
    total_rows += total_count

print(f"Total across all files: {total_uncategorized}/{total_rows} rows remain uncategorized")

print('Finished calculating uncategorized rows.')


Kabupaten_Aceh_Barat.csv: 26/50 rows remain uncategorized
Kabupaten_Aceh_Barat_Daya.csv: 19/38 rows remain uncategorized
Kabupaten_Aceh_Besar.csv: 66/122 rows remain uncategorized
Kabupaten_Aceh_Jaya.csv: 74/140 rows remain uncategorized
Kabupaten_Aceh_Selatan.csv: 25/48 rows remain uncategorized
Kabupaten_Aceh_Singkil.csv: 50/73 rows remain uncategorized
Kabupaten_Aceh_Tamiang.csv: 49/74 rows remain uncategorized
Kabupaten_Aceh_Tengah.csv: 52/69 rows remain uncategorized
Kabupaten_Aceh_Tenggara.csv: 33/63 rows remain uncategorized
Kabupaten_Aceh_Timur.csv: 22/50 rows remain uncategorized
Kabupaten_Aceh_Utara.csv: 65/104 rows remain uncategorized
Kabupaten_Agam.csv: 145/223 rows remain uncategorized
Kabupaten_Alor.csv: 37/81 rows remain uncategorized
Kabupaten_Asahan.csv: 108/158 rows remain uncategorized
Kabupaten_Asmat.csv: 30/51 rows remain uncategorized
Kabupaten_Badung.csv: 72/123 rows remain uncategorized
Kabupaten_Balangan.csv: 44/69 rows remain uncategorized
Kabupaten_Bandung.c

In [25]:
# Tally how many cleaned CSV files contain more than 100, 200 and 300 rows.
# The thresholds are cumulative: a 350-row file increments all three counters.
count_over_100 = 0
count_over_200 = 0
count_over_300 = 0

for file_name in os.listdir(clean_path):
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(clean_path, file_name)
    df = pd.read_csv(file_path)

    total_rows = len(df)

    # A comparison yields True/False, which adds to an int as 1/0.
    count_over_100 += total_rows > 100
    count_over_200 += total_rows > 200
    count_over_300 += total_rows > 300

print(f"Files with over 100 rows: {count_over_100}")
print(f"Files with over 200 rows: {count_over_200}")
print(f"Files with over 300 rows: {count_over_300}")

print('Finished counting files based on row thresholds.')

Files with over 100 rows: 221
Files with over 200 rows: 79
Files with over 300 rows: 28
Finished counting files based on row thresholds.
