In [2]:
import os
import pandas as pd

# Path to your CSV file
csv_file = 'data-v5/test/_annotations.csv'

# Path to the directory containing the images to exclude
exclude_dir = 'data-v5/test/their_test'

# Path to save the excluded rows
excluded_csv = 'data-v5/test/their_test/_annotations.csv'

# Get a list of image filenames (without path) in the exclude directory
exclude_images = {os.path.basename(file) for file in os.listdir(exclude_dir)}

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file)

# Split the DataFrame into rows to keep and rows to remove
df_to_remove = df[df['filename'].isin(exclude_images)]
filtered_df = df[~df['filename'].isin(exclude_images)]

# Save the filtered DataFrame back to the CSV file
filtered_df.to_csv(csv_file, index=False)

# Save the removed rows into a new CSV file
df_to_remove.to_csv(excluded_csv, index=False)

print(f"Rows associated with images in {exclude_dir} have been removed.")

Rows associated with images in data-v5/test/their_test have been removed.


In [3]:
import os
import pandas as pd

# Directories to process
directories = ['data-v5/test', 'data-v5/train', 'data-v5/valid']

# Function to calculate the area of an element
def calculate_area(row):
    return (row['xmax'] - row['xmin']) * (row['ymax'] - row['ymin'])

# Process each directory
for directory in directories:
    # Load the _annotations.csv file
    annotations_file = os.path.join(directory, '_annotations.csv')
    df = pd.read_csv(annotations_file)

   
    
    # Extract the pothole ID
    df['pothole_id'] = df['filename'].str.extract(r'p(\d+)_')[0]

    
    
    # Calculate the area for each class
    df['area'] = (df['xmax'] - df['xmin']) * (df['ymax'] - df['ymin'])
    
    # Pivot the data to get separate columns for pothole area and stick area
    pivot_df = df.pivot_table(index='pothole_id', columns='class', values='area', aggfunc='first').reset_index()
    
    # Rename the columns for clarity
    pivot_df = pivot_df.rename(columns={'potholes': 'pothole_area', 'L': 'stick_area'})
    print(pivot_df.head())
    # Save the result to a new CSV file
    output_file = os.path.join(directory, f'{os.path.basename(directory)}.csv')
    pivot_df.to_csv(output_file, index=False)

    print(f"Processed {annotations_file} and saved results to {output_file}")


class pothole_id  stick_area  pothole_area
0           1078      4165.0       27830.0
1           1087      3358.0       37653.0
2            113     13266.0       34452.0
3           1137      5921.0       46438.0
4           1143     59640.0       29892.0
Processed data-v5/test/_annotations.csv and saved results to data-v5/test/test.csv
class pothole_id  stick_area  pothole_area
0           1008     12772.0      265080.0
1           1009     21150.0       46222.0
2            101      9690.0      181656.0
3           1018     17820.0      240006.0
4           1019     50400.0      214812.0
Processed data-v5/train/_annotations.csv and saved results to data-v5/train/train.csv
class pothole_id  stick_area  pothole_area
0           1021      9039.0      255468.0
1           1023     29754.0       70196.0
2           1031     45210.0      281050.0
3           1032      3762.0       42672.0
4           1034      5053.0        6237.0
Processed data-v5/valid/_annotations.csv and saved result

In [4]:
import os
import pandas as pd

# Directories to process
directories = ['data-v5/test', 'data-v5/train', 'data-v5/valid']

# Known dimensions of the stick in millimeters
stick_length_mm = 500
stick_width_mm = 4

# Stick area in square millimeters
stick_area_mm2 = stick_length_mm * stick_width_mm

# Process each directory
for directory in directories:
    # Load the corresponding CSV file
    csv_file = os.path.join(directory, f'{os.path.basename(directory)}.csv')
    df = pd.read_csv(csv_file)
    
    # Calculate the area of the stick in pixels (already computed in previous steps)
    df['stick_area'] = df['stick_area'].fillna(1)  # To avoid division by zero or NaN
    
    # Calculate the conversion factor from pixels to mm²
    df['conversion_factor'] = stick_area_mm2 / df['stick_area']
    
    # Convert the pothole area to square millimeters
    df['pothole_area_mm2'] = df['pothole_area'] * df['conversion_factor']
    
    # Save the updated DataFrame back to the CSV file
    df.to_csv(csv_file, index=False)

    print(f"Processed {csv_file} and converted pothole areas to square millimeters.")


Processed data-v5/test/test.csv and converted pothole areas to square millimeters.
Processed data-v5/train/train.csv and converted pothole areas to square millimeters.
Processed data-v5/valid/valid.csv and converted pothole areas to square millimeters.


In [24]:
import os
import pandas as pd

# Path to the train_labels.csv file
train_labels_file = 'data-v5/train_labels_2.csv'
# train_labels_file = 'data-v5/train_labels.csv'

# Load the train_labels.csv file
train_labels_df = pd.read_csv(train_labels_file)
train_labels_df = train_labels_df.rename(columns={'Pothole number': 'pothole_id'})

# Directories to process
directories = ['data-v5/test', 'data-v5/train', 'data-v5/valid']

# Process each directory
for directory in directories:
    # Load the corresponding CSV file
    csv_file = os.path.join(directory, f'{os.path.basename(directory)}.csv')
    df = pd.read_csv(csv_file)
    # Merge with train_labels.csv data
    merged_df = pd.merge(df, train_labels_df, how='left', on='pothole_id')

    # Combine 'Bags used _x' and 'Bags used _y' into 'Bags used'
    merged_df['Bags used'] = merged_df['Bags used _x'].combine_first(merged_df['Bags used _y'])

    # Drop the now redundant 'Bags used _x' and 'Bags used _y' columns
    merged_df = merged_df.drop(['Bags used _x', 'Bags used _y'], axis=1)
    
    # Save the updated DataFrame back to the CSV file
    merged_df.to_csv(csv_file, index=False)

    print(f"Processed {csv_file} and added bag usage information.")



data-v5/test
    pothole_id  stick_area  pothole_area  conversion_factor  pothole_area_mm2  \
0         1078      4165.0       27830.0           0.480192      1.336375e+04   
1         1087      3358.0       37653.0           0.595593      2.242585e+04   
2          113     13266.0       34452.0           0.150761      5.194030e+03   
3         1137      5921.0       46438.0           0.337781      1.568586e+04   
4         1143     59640.0       29892.0           0.033535      1.002414e+03   
5         1147     18150.0       46618.0           0.110193      5.136970e+03   
6         1154     22967.0       19448.0           0.087081      1.693560e+03   
7          117      5346.0      135850.0           0.374111      5.082305e+04   
8         1183     51906.0       25454.0           0.038531      9.807729e+02   
9         1190      3140.0       37814.0           0.636943      2.408535e+04   
10        1194      3460.0        4224.0           0.578035      2.441618e+03   
11        1219 

MergeError: Passing 'suffixes' which cause duplicate columns {'Bags used _x'} is not allowed.

In [30]:
import os
import pandas as pd

# Directories to process
directories = ['data-v5/test', 'data-v5/train', 'data-v5/valid']

# Process each directory
for directory in directories:
    # Load the corresponding CSV file
    csv_file = os.path.join(directory, f'{os.path.basename(directory)}.csv')
    df = pd.read_csv(csv_file)
    
    df = df.drop(['stick_area','pothole_area','conversion_factor'], axis=1)

    # Save the updated DataFrame back to the CSV file
    df.to_csv(csv_file, index=False)

    print(f"Processed {csv_file} and added bag usage information.")


Processed data-v5/test/test.csv and added bag usage information.
Processed data-v5/train/train.csv and added bag usage information.
Processed data-v5/valid/valid.csv and added bag usage information.


In [46]:
import pandas as pd

csv_file = 'their_test.csv'

df = pd.read_csv(csv_file)

df.drop(['stick_area',  'pothole_area',  'conversion_factor'], axis=1)
print(df)

# Save the updated DataFrame back to the CSV file
df.to_csv('their_test.csv', index=False)

print(f"Processed {csv_file} and converted pothole areas to square millimeters.")


    pothole_id  stick_area  pothole_area  conversion_factor  pothole_area_mm2
0          103        3082         52668           0.648929      34177.806619
1          104        4095         65880           0.488400      32175.824176
2         1040        5704         29029           0.350631      10178.471248
3          105        4114         89516           0.486145      43517.744288
4          108        2505        110880           0.798403      88526.946108
5         1086        8272         39498           0.241779       9549.806576
6         1115       14355         43362           0.139324       6041.379310
7         1134        6426         27000           0.311236       8403.361345
8          114        5005        105376           0.399600      42108.291708
9         1161        5796          9090           0.345066       3136.645963
10        1162       25654         10192           0.077961        794.573946
11        1181        4375         39334           0.457143     