In [23]:
# Extracting the list of URLs that are covered in this search.
import pandas as pd

# Source CSV, destination text file, and the column whose distinct values are
# exported. NOTE: this expects dpart2.csv to already contain a "url" column;
# the "Rename searchString column to url" cell further down produces it.
input_file = "dpart2.csv"
output_file = "urlpart2.txt"
column_name = "url"   # <-- change this to your column name

df = pd.read_csv(input_file)

# Distinct non-null values of the chosen column, as a plain Python list.
non_null_values = df[column_name].dropna()
unique_values = non_null_values.unique().tolist()

# One value per line; every line, including the last, ends with "\n".
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(f"{value}\n" for value in unique_values)

print(f"Extracted {len(unique_values)} unique values to {output_file}")

# file_a: values already covered; file_b: values still outstanding.
# file_c is the same path as file_b, so the outstanding list is rewritten in place.
file_a = "urlpart2.txt"
file_b = "remainingurls.txt"
file_c = "remainingurls.txt"

# Load each file as a set of whitespace-stripped, non-blank lines.
with open(file_a, "r", encoding="utf-8") as f:
    set_a = {stripped for stripped in map(str.strip, f) if stripped}

with open(file_b, "r", encoding="utf-8") as f:
    set_b = {stripped for stripped in map(str.strip, f) if stripped}

# Values present in b but absent from a, in sorted order.
diff = sorted(set_b.difference(set_a))

# file_b was fully read and closed above, so overwriting the same path is safe.
with open(file_c, "w", encoding="utf-8") as f:
    f.writelines(value + "\n" for value in diff)

print(f"Found {len(diff)} values in b but not in a. Results saved to {file_c}")


Extracted 410 unique values to urlpart2.txt
Found 3822 values in b but not in a. Results saved to remainingurls.txt


In [13]:
# Merge multiple category columns into a single column in a CSV file
import csv

# NOTE(review): machine-specific absolute path; adjust it for your environment.
input_file = "C:\\Users\\angel\\2025\\Others\\TechJam\\Data\\just remaining categories.csv"
output_file = "cpart2.csv"

# Define the category columns you want to merge
category_columns = [f"categories/{i}" for i in range(11)]


def merge_category_columns(infile, outfile, category_columns):
    """Copy a CSV stream, collapsing several category columns into one.

    Every column of the input that is not listed in ``category_columns`` is
    copied through unchanged. A single trailing ``category`` column is added
    whose cell is the Python ``repr`` of the list of non-empty, stripped
    category values for that row (e.g. ``['Bar', 'Restaurant']`` or ``[]``).

    Parameters
    ----------
    infile : text file object
        CSV input, opened with ``newline=''``.
    outfile : text file object
        CSV output, opened with ``newline=''``.
    category_columns : list of str
        Header names to merge. Names that do not occur in the input header
        are ignored instead of raising ``KeyError``.
    """
    reader = csv.DictReader(infile)

    # fieldnames is None for a completely empty input; treat that as "no columns".
    header = reader.fieldnames or []

    # Keep all other fields except the category ones
    other_fields = [field for field in header if field not in category_columns]

    # Output fields: all other fields + one merged 'category' field
    fieldnames = other_fields + ['category']
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        # DictReader fills the missing cells of a short row with None, and
        # row.get() returns None for a column absent from the header, so
        # normalise both to "" before stripping.
        categories = []
        for col in category_columns:
            cell = (row.get(col) or "").strip()
            if cell:
                categories.append(cell)

        # Build new row with original fields
        new_row = {field: row[field] for field in other_fields}

        # Always stringify the list
        new_row["category"] = f"{categories}"

        writer.writerow(new_row)


# Always true when executed as a notebook cell; the guard only keeps the file
# I/O from running when the helper above is imported (e.g. by a test).
if __name__ == "__main__":
    with open(input_file, mode="r", newline='', encoding="utf-8") as infile, \
         open(output_file, mode="w", newline='', encoding="utf-8") as outfile:
        merge_category_columns(infile, outfile, category_columns)





In [19]:
# Rename searchString column to url in a CSV file

import pandas as pd

# Read dpart2.csv, relabel its "searchString" column as "url", and write the
# result back over the same file (the row index is not written out).
column_renames = {"searchString": "url"}
df = pd.read_csv("dpart2.csv").rename(columns=column_renames)

df.to_csv("dpart2.csv", index=False)


In [20]:
# Remove the prefix "Direct Detail URL: " from the url column using pandas
import pandas as pd

# Load the CSV to clean.
df = pd.read_csv("dpart2.csv")

# Drop a leading "Direct Detail URL:" label, plus any whitespace right after
# it, from every value in the url column.
prefix_pattern = r"^Direct Detail URL:\s*"
cleaned_urls = df['url'].str.replace(prefix_pattern, "", regex=True)
df['url'] = cleaned_urls

# Overwrite the input file in place with the cleaned data.
df.to_csv("dpart2.csv", index=False)

print("Cleaned file saved as dpart2.csv")

Cleaned file saved as dpart2.csv


In [22]:
# Fill in missing descriptions in the main reviews file from the supplemental file (category back-fill is currently not applied)

import pandas as pd

# File paths (output_file is the same path as main_file, so the main file is
# updated in place)
main_file = "vermont_text_merged.csv"
supp_file = "dcpart1.csv"  # the uploaded supplemental file
output_file = "vermont_text_merged.csv"


def fill_missing_from_supplement(main_df, supp_df, columns=("description",), key="url"):
    """Fill blank cells of ``main_df`` with values looked up in ``supp_df``.

    Column names of both frames are normalised first (stripped and
    lower-cased). For each name in ``columns``, a cell of ``main_df`` counts as
    missing when it is NaN or only whitespace; it is then replaced by the first
    non-blank value that ``supp_df`` holds for the same ``key``. Cells that
    already have content, and rows without a usable supplemental value, are
    left untouched.

    Parameters
    ----------
    main_df : pandas.DataFrame
        Frame to complete; must contain ``key`` and every name in ``columns``
        after normalisation.
    supp_df : pandas.DataFrame
        Supplemental frame with the same columns.
    columns : iterable of str, default ("description",)
        Normalised names of the columns to back-fill.
    key : str, default "url"
        Normalised name of the column used to match rows.

    Returns
    -------
    pandas.DataFrame
        A new frame with normalised column names; the inputs are not modified.
    """
    main_out = main_df.copy()
    main_out.columns = [col.strip().lower() for col in main_out.columns]
    supp = supp_df.copy()
    supp.columns = [col.strip().lower() for col in supp.columns]

    for column in columns:
        supp_values = supp[column]
        # astype(str) keeps the blank check valid for non-string cells too.
        usable = (
            supp[key].notna()
            & supp_values.notna()
            & (supp_values.astype(str).str.strip() != "")
        )
        # First usable value per key wins; the unique index makes map() a
        # single lookup per main row.
        lookup = (
            supp.loc[usable, [key, column]]
            .drop_duplicates(subset=key, keep="first")
            .set_index(key)[column]
        )

        # A column with no values at all is read by pandas as float64, which
        # has no .str accessor and cannot hold text; work on an object copy.
        current = main_out[column].astype(object)
        is_missing = current.isna() | (current.astype(str).str.strip() == "")
        candidate = main_out[key].map(lookup)
        main_out[column] = current.mask(is_missing & candidate.notna(), candidate)

    return main_out


# Always true when executed as a notebook cell; the guard only keeps the file
# I/O from running when the helper above is imported (e.g. by a test).
if __name__ == "__main__":
    # Load both files
    main_df = pd.read_csv(main_file)
    supp_df = pd.read_csv(supp_file)

    main_df = fill_missing_from_supplement(main_df, supp_df)

    # Save the updated file
    main_df.to_csv(output_file, index=False)

    print(f"Updated file saved as {output_file}")


Updated file saved as vermont_text_merged.csv
