In [None]:
!pip install pandas


In [None]:
import pandas as pd
import glob
import os

## Attach the end location home information

In [None]:
# Location of the trip-label CSV (absolute path on the author's Box drive)
file_path = '/Users/aparnaj8/Box/InTrans/RWRAD_Internal/Souradeep_Trip_Info/RWRAD_all_label.csv'

# Load the labels; later cells use its 'subj', 'time_utc' and 'labels' columns
labels_df = pd.read_csv(file_path)

In [None]:
# Preview the first rows of labels_df
labels_df.head()

In [None]:
# Drop the "Unnamed: 0" column (presumably a row index written out by an earlier to_csv — confirm).
# errors="ignore" keeps the cell re-runnable once the column has already been removed.
labels_df = labels_df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
import pandas as pd
import os

# Folder holding the GPS CSVs, one file per subject named "<subj>_BlackBox_GPSDrivingLocations_1hz.csv"
folder_path = r"C:\Users\aparnaj8\Box\Data (Matthew Rizzo)\DataAnalysis\ReferenceDatabases\BlackBox_GPSDrivingLocations\BlackBox_GPSDrivingLocations_1hz"

results = []          # one merged frame per subject whose GPS file was found
processed_files = []  # names of the GPS files that were actually read

# Parse the label timestamps once: the conversion does not depend on the subject,
# so repeating it on every loop iteration was wasted work.
labels_df['time_utc'] = pd.to_datetime(labels_df['time_utc'])

for subj in labels_df['subj'].unique():
    file_name = f"{subj}_BlackBox_GPSDrivingLocations_1hz.csv"
    file_path = os.path.join(folder_path, file_name)

    if not os.path.exists(file_path):
        # Subjects without a GPS file are reported and do not appear in final_df
        print(f"File not found: {file_path}")
        continue

    print(f"Started processing file: {file_name}")  # Announce processing start

    gps_df = pd.read_csv(file_path)

    # Keep only the rows with time_cat == 'end', then parse timestamps for just those rows
    gps_df_filtered = gps_df[gps_df['time_cat'] == 'end'].copy()
    gps_df_filtered['time_utc'] = pd.to_datetime(gps_df_filtered['time_utc'])

    # Label rows of the current subject
    subj_labels = labels_df[labels_df['subj'] == subj]

    # Left merge on the timestamp: every label row is kept; label rows with no
    # matching GPS row get NaN in drive / gps_lat / gps_long.
    merged_df = pd.merge(
        subj_labels,
        gps_df_filtered[['time_utc', 'drive', 'gps_lat', 'gps_long']],
        on='time_utc',
        how='left'
    )

    results.append(merged_df)
    processed_files.append(file_name)

# Stack the per-subject frames; fall back to an empty frame when nothing was processed
if results:
    final_df = pd.concat(results, ignore_index=True)
else:
    final_df = pd.DataFrame()

# Display the final DataFrame
print(final_df)

# Display the list of processed files
print("\nFiles Processed:")
for file in processed_files:
    print(f" - {file}")




In [None]:
# Preview the merged label + GPS rows
final_df.head()

In [None]:
# Number of rows in the label table
len(labels_df)

In [None]:
# Boolean mask of missing cells, computed once and reused for both summaries
missing_cells = final_df.isna()

# Missing values per column
nan_counts_per_column = missing_cells.sum()
print("Count of NaN values per column:")
print(nan_counts_per_column)

# Rows that contain at least one missing value
rows_with_nan = missing_cells.any(axis=1).sum()
print(f"\nTotal number of rows with NaN values: {rows_with_nan}")


In [None]:
# Write final_df to CSV without the index column
output_path = r"/Users/aparnaj8/Box/InTrans/RWRAD_Internal/Trip_variable/RWRAD_end_trip_info.csv"
final_df.to_csv(output_path, index=False)

In [None]:
# Preview final_df again after saving
final_df.head()

In [None]:
# Number of distinct subjects present in final_df
final_df['subj'].nunique()

##  Extract only home location

In [None]:
# Same path as the CSV written by the save cell above
file_path = r"/Users/aparnaj8/Box/InTrans/RWRAD_Internal/Trip_variable/RWRAD_end_trip_info.csv"

# Reload it into final_df (replaces the in-memory frame; time_utc comes back as plain text
# because read_csv is called without date parsing)
final_df= pd.read_csv(file_path)

In [None]:
# Preview the reloaded table
final_df.head()

In [None]:
# Rows whose 'labels' value is exactly 'home'
labelled_home = final_df['labels'] == 'home'
home_df = final_df[labelled_home]
home_df.head()


In [None]:
# Number of rows labelled 'home'
len(home_df)

In [None]:
# Number of distinct subjects with at least one 'home' row
home_df['subj'].nunique()

## Extract first value of lat and long for every subject

In [None]:
# One home coordinate per subject: the first 'home' row that has BOTH gps_lat and gps_long.
# groupby('subj').first() is avoided because it returns the first non-null value of each
# column independently, so a single output row could combine values from different trips.
# Subjects with no usable coordinates are left out of the result (they previously appeared
# with NaN coordinates); the output is sorted by subj, as the groupby result was.
has_coordinates = home_df[['gps_lat', 'gps_long']].notna().all(axis=1)
home_df_first_entry = (
    home_df[has_coordinates]
    .dropna(subset=['subj'])
    .drop_duplicates(subset='subj', keep='first')
    .sort_values('subj')
    .reset_index(drop=True)
)

In [None]:
# Display the per-subject home rows
home_df_first_entry

## Calculate the distance for all CSVs (some CSV files are missing due to lack of home info)

In [None]:
import os
import pandas as pd
from geopy.distance import geodesic

# A trip start closer to home than this many feet is labelled 'home'
HOME_RADIUS_FEET = 100

# Path to the GPS data files
gps_files_path = r"C:\Users\aparnaj8\Box\Data (Matthew Rizzo)\DataAnalysis\ReferenceDatabases\BlackBox_GPSDrivingLocations\BlackBox_GPSDrivingLocations_1hz"

# Every CSV in the folder is scanned
gps_files = [file for file in os.listdir(gps_files_path) if file.endswith(".csv")]

# Home rows without both coordinates cannot be measured against, so drop them once up front
home_points = home_df_first_entry.dropna(subset=['gps_lat', 'gps_long'])[['subj', 'gps_lat', 'gps_long']]

# Results are collected as plain dicts and converted to a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies it on every row.
distance_records = []

for file in gps_files:
    gps_df = pd.read_csv(os.path.join(gps_files_path, file))
    gps_df = gps_df[gps_df['time_cat'] == 'start']  # Filter for 'start' only

    print(f"Processing file: {file}")

    for home in home_points.itertuples(index=False):
        # Trip starts of this subject in the current file that have both coordinates
        subject_starts = gps_df.loc[
            gps_df['subj'] == home.subj, ['drive', 'gps_lat', 'gps_long']
        ].dropna(subset=['gps_lat', 'gps_long'])

        home_point = (home.gps_lat, home.gps_long)

        for start in subject_starts.itertuples(index=False):
            # Geodesic distance between home and the trip start, in feet
            distance = geodesic(home_point, (start.gps_lat, start.gps_long)).feet
            distance_records.append({
                'subj': home.subj,
                'drive': start.drive,
                'distance': distance,
                'label': 'home' if distance < HOME_RADIUS_FEET else 'not_home',
            })

# Same four columns as before, also when no record was produced
final_output_df = pd.DataFrame(distance_records, columns=['subj', 'drive', 'distance', 'label'])


## Check duplicate count of subj and drive

In [None]:
# Mark every row whose (subj, drive) pair occurs more than once (keep=False flags all copies)
pair_repeated = final_output_df.duplicated(subset=['subj', 'drive'], keep=False)
duplicates = final_output_df[pair_repeated]

# Note: this is the number of rows involved, not the number of distinct pairs
duplicate_count = len(duplicates)
print(f"Number of duplicate combinations of 'subj' and 'drive': {duplicate_count}")

# Show a few of the repeated pairs for inspection
print(duplicates[['subj', 'drive']].head())


In [None]:
# Distinct (non-null) subjects for which at least one distance was computed
subj_column = final_output_df['subj']
unique_subj_count = subj_column.nunique()

print(f"Number of unique subjects: {unique_subj_count}")


## Save the dataframe as csv

In [None]:
# Write the distance/label table to a CSV in the current working directory (index omitted)
final_output_df.to_csv("RWRAD_start_trip_home_info.csv", index=False)

## Check the output file

In [None]:
# Number of (subject, trip start) rows that received a distance
len(final_output_df)

In [None]:
# Preview the distance/label table
final_output_df.head()

In [None]:
# Number of trip starts labelled 'home'
home_rows = final_output_df[final_output_df['label'] == 'home']
home_count = len(home_rows)
print(home_count)


In [None]:
# Trip starts labelled 'home'
home_label_df = final_output_df.loc[final_output_df['label'] == 'home']

# Index label of the largest distance among them, then the full row at that label
farthest_label = home_label_df['distance'].idxmax()
max_distance_row = home_label_df.loc[farthest_label]

print(max_distance_row)


In [None]:
# Rows whose distance lies in [50, 100] feet (both ends included)
in_band = final_output_df['distance'].between(50, 100)
filtered_df = final_output_df[in_band]

print(filtered_df)


## Test code for one csv file

In [None]:
import os
import pandas as pd
import numpy as np
from geopy.distance import geodesic


# Single-file dry run of the distance labelling.
# NOTE(review): this cell labels 'home' below 50 ft, while the full-run cells use 100 ft — confirm which is intended.
# NOTE(review): the result is stored in final_output_df, replacing whatever that name held before.
TEST_HOME_RADIUS_FEET = 50

# Path to the GPS data files
gps_files_path = r"C:\Users\aparnaj8\Box\Data (Matthew Rizzo)\DataAnalysis\ReferenceDatabases\BlackBox_GPSDrivingLocations\BlackBox_GPSDrivingLocations_1hz"

# Only the first CSV returned by os.listdir is processed
gps_files = [file for file in os.listdir(gps_files_path) if file.endswith(".csv")]
first_file = gps_files[0]

gps_df = pd.read_csv(os.path.join(gps_files_path, first_file))
gps_df = gps_df[gps_df['time_cat'] == 'start']  # Filter for 'start' only

print(f"Processing file: {first_file}")

# Skip home rows and trip starts with missing coordinates, as the full-run cells do,
# instead of passing NaN to geodesic
home_points = home_df_first_entry.dropna(subset=['gps_lat', 'gps_long'])[['subj', 'gps_lat', 'gps_long']]

# Collect rows in a list and build the DataFrame once (no per-row pd.concat)
test_records = []

for home in home_points.itertuples(index=False):
    subject_starts = gps_df.loc[
        gps_df['subj'] == home.subj, ['drive', 'gps_lat', 'gps_long']
    ].dropna(subset=['gps_lat', 'gps_long'])

    home_point = (home.gps_lat, home.gps_long)

    for start in subject_starts.itertuples(index=False):
        # Geodesic distance between home and the trip start, in feet
        distance = geodesic(home_point, (start.gps_lat, start.gps_long)).feet
        test_records.append({
            'subj': home.subj,
            'drive': start.drive,
            'distance': distance,
            'label': 'home' if distance < TEST_HOME_RADIUS_FEET else 'not_home',
        })

final_output_df = pd.DataFrame(test_records, columns=['subj', 'drive', 'distance', 'label'])



## Find the home information for the remaining subjects

In [None]:
# dlq.csv from the REDCap formatted data; later cells use its 'subj', 'type', 'gps_lat' and 'gps_long' columns
file_path = 'C:\\Users\\aparnaj8\\Box\\Data (Matthew Rizzo)\\DataAnalysis\\FormattingQAChecks\\REDCap_Formatting\\REDCap_Format_Data\\dlq.csv'

# Read the CSV file into a DataFrame
dlq_df = pd.read_csv(file_path)

In [None]:
# Preview the dlq table
dlq_df.head()

In [None]:
# dlq rows whose 'type' is 'home' (note: this rebinds the name home_df)
home_df = dlq_df.loc[dlq_df['type'] == 'home']

# Distinct (subj, type) pairs among those rows
unique_home_combinations = home_df.loc[:, ['subj', 'type']].drop_duplicates()

# Attach the coordinates by joining the pairs back onto dlq_df.
# A subject with several 'home' rows in dlq_df yields several rows here.
coordinate_columns = dlq_df[['subj', 'type', 'gps_lat', 'gps_long']]
home_gps_info = unique_home_combinations.merge(
    coordinate_columns,
    how='left',
    on=['subj', 'type'],
)


In [None]:
# Preview the home coordinates taken from dlq
home_gps_info.head()

In [None]:
# Keep the dlq home rows of subjects that do NOT already have usable home coordinates
# in home_df_first_entry. A subject listed there with missing gps_lat/gps_long has no
# home point to measure against, so it must stay in the "remaining" set.
subjects_with_home_point = home_df_first_entry.dropna(subset=['gps_lat', 'gps_long'])['subj']
filtered_home_gps_info = home_gps_info[~home_gps_info['subj'].isin(subjects_with_home_point)]

In [None]:
# Number of dlq home rows kept for the remaining subjects
len(filtered_home_gps_info)

In [None]:
# Preview the home rows of the remaining subjects
filtered_home_gps_info.head()

In [None]:
# Write the remaining subjects' home rows to a CSV in the current working directory
filtered_home_gps_info.to_csv('home_information_for_remaining_subjects.csv', index=False)

## Do the same process for the remaining csv files

In [None]:
import os
import pandas as pd
from geopy.distance import geodesic

# A trip start closer to home than this many feet is labelled 'home'
HOME_RADIUS_FEET = 100

# Path to the GPS data files
gps_files_path = r"C:\Users\aparnaj8\Box\Data (Matthew Rizzo)\DataAnalysis\ReferenceDatabases\BlackBox_GPSDrivingLocations\BlackBox_GPSDrivingLocations_1hz"

# Every CSV in the folder is scanned
gps_files = [file for file in os.listdir(gps_files_path) if file.endswith(".csv")]

# Home rows (from dlq) without both coordinates cannot be used, so drop them once up front.
# A subject with several home rows is measured against each of them, as before.
home_points = filtered_home_gps_info.dropna(subset=['gps_lat', 'gps_long'])[['subj', 'gps_lat', 'gps_long']]

# Results are collected as plain dicts and converted to a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies it on every row.
distance_records = []

for file in gps_files:
    gps_df = pd.read_csv(os.path.join(gps_files_path, file))
    gps_df = gps_df[gps_df['time_cat'] == 'start']  # Filter for 'start' only

    print(f"Processing file: {file}")

    for home in home_points.itertuples(index=False):
        # Trip starts of this subject in the current file that have both coordinates
        subject_starts = gps_df.loc[
            gps_df['subj'] == home.subj, ['drive', 'gps_lat', 'gps_long']
        ].dropna(subset=['gps_lat', 'gps_long'])

        home_point = (home.gps_lat, home.gps_long)

        for start in subject_starts.itertuples(index=False):
            # Geodesic distance between home and the trip start, in feet
            distance = geodesic(home_point, (start.gps_lat, start.gps_long)).feet
            distance_records.append({
                'subj': home.subj,
                'drive': start.drive,
                'distance': distance,
                'label': 'home' if distance < HOME_RADIUS_FEET else 'not_home',
            })

# Same four columns as before, also when no record was produced
final_home_gps_info = pd.DataFrame(distance_records, columns=['subj', 'drive', 'distance', 'label'])



In [None]:
# Preview the distances computed for the remaining subjects
final_home_gps_info.head()

In [None]:
# Write the remaining subjects' distance/label table to a CSV in the current working directory
final_home_gps_info.to_csv('RWRAD_start_trip_home_info_for_remaining_subjects.csv', index=False)

## Read the merged file

In [None]:
# NOTE(review): no cell in this notebook writes this file; presumably the two start-trip CSVs
# saved above were combined outside the notebook — confirm and document that step.
file_path = '/Users/aparnaj8/Box/InTrans/RWRAD_Internal/Trip_variable/RWRAD_start_trip_info_allsubjects.csv'

# Read the CSV file into a DataFrame
all_df = pd.read_csv(file_path)

In [None]:
# Preview the combined start-trip table
all_df.head()

In [None]:
# Distinct 'subj' values in all_df
unique_subj = all_df['subj'].unique()


# How many distinct subjects that is
unique_subj_count = len(unique_subj)
print(f"Number of unique subjects: {unique_subj_count}")

## Filter only those trips which started from home

In [None]:
# Trips whose start was labelled 'home' (this rebinds the name home_df once more)
starts_at_home = all_df['label'] == 'home'
home_df = all_df[starts_at_home]

# Number of such trips
home_count = len(home_df)
print(f"Number of rows with label 'home': {home_count}")


In [None]:
# Distinct 'subj' values among the trips that start at home (home_df, not all_df)
unique_subj = home_df['subj'].unique()

# How many distinct subjects have at least one trip starting at home
unique_subj_count = len(unique_subj)
print(f"Number of unique subjects: {unique_subj_count}")

In [None]:
# Preview the trips that start at home
home_df.head()

In [None]:
# Write the home-start trips to a CSV in the current working directory
home_df.to_csv('RWRAD_start_trip_home_only_for_all_subjects.csv', index=False)

## Find trips which are <15 or 25 miles from home

In [None]:
# Per-drive summary file. The path mixes '/' and '\' separators; it is written as a raw
# string so the backslashes are taken literally (in a normal string \D, \F and \B are
# invalid escape sequences, which newer Python versions warn about). The value is unchanged.
file_path = r'/Users/aparnaj8/Box/Data (Matthew Rizzo)\DataAnalysis\FormattingQAChecks\BlackBox_FormattingQAChecks\BlackBox_FormatQA_Data\DataSummaries_1hz/BlackBox_1hz_byDriveSummary.csv'

# Read the CSV file into a DataFrame
drive_summary_df = pd.read_csv(file_path)

In [None]:
# Distinct time_cat values present in the drive summary
drive_summary_df['time_cat'].unique()

### Save only required variables


In [None]:
# Columns carried forward from the drive summary
summary_columns = [
    'subj', 'drive',
    'time_start_utc', 'time_end_utc',
    'time_start_cst', 'time_end_cst',
    'time_weekday', 'duration_minutes',
    'distance_miles', 'speed_mph_mean',
]
filtered_drive_summary_df = drive_summary_df[summary_columns]


In [None]:
# Preview the reduced drive summary
filtered_drive_summary_df.head()

In [None]:
# Left-join the home-start information onto every drive by (subj, drive);
# drives without a matching home_df row get NaN in the added columns.
home_trip_columns = home_df[['subj', 'drive', 'distance', 'label']]
merged_df = pd.merge(
    filtered_drive_summary_df,
    home_trip_columns,
    on=['subj', 'drive'],
    how='left',
)

# 'distance' (from home_df) becomes 'home_distance'
merged_df.rename(columns={'distance': 'home_distance'}, inplace=True)



In [None]:
# Preview the merged drive table
merged_df.head()

In [None]:
# Row count after the merge
len(merged_df)

# Add 15-mile and 25-mile columns

In [None]:
import numpy as np

# For each threshold, flag drives labelled 'home':
#   'yes' -> label is 'home' and distance_miles is below the threshold
#   'no'  -> label is 'home' otherwise (including a missing distance_miles)
#   NaN   -> label is not 'home'
# Computed column-wise instead of with a row-by-row apply repeated per threshold.
# NOTE(review): distance_miles is the value carried over from the drive summary, not the
# home_distance column — confirm this is the intended criterion.
starts_at_home = merged_df['label'] == 'home'
for threshold_miles in (15, 25):
    within_threshold = merged_df['distance_miles'].lt(threshold_miles).map({True: 'yes', False: 'no'})
    # where() blanks out (NaN) every row whose label is not 'home'
    merged_df[f'{threshold_miles}_miles_from_home'] = within_threshold.where(starts_at_home)



In [None]:
# Timestamp columns stored as text in the CSV
columns_to_convert = ['time_start_utc', 'time_end_utc', 'time_start_cst', 'time_end_cst']

# Parse them all at once; values that cannot be parsed become NaT (errors='coerce')
merged_df[columns_to_convert] = merged_df[columns_to_convert].apply(pd.to_datetime, errors='coerce')



In [None]:
# Check the column dtypes after the datetime conversion
merged_df.dtypes

In [None]:
# Write the drive table with the 15/25-mile flags to a CSV in the current working directory
merged_df.to_csv('RWRAD_15_25_miles_from_home.csv', index=False)

In [None]:
# Frequency of 'yes' / 'no' / NaN in each flag column (dropna=False keeps the NaN bucket)
count_15_miles = merged_df['15_miles_from_home'].value_counts(dropna=False)
count_25_miles = merged_df['25_miles_from_home'].value_counts(dropna=False)

# Each heading is followed by its table on the next line
print("Count for '15_miles_from_home':", count_15_miles, sep="\n")
print("\nCount for '25_miles_from_home':", count_25_miles, sep="\n")


In [None]:
# Row count of merged_df (unchanged by the flag columns)
len(merged_df)

## Attach Trip Chaining Information

In [None]:
# NOTE(review): this "_week_info" file is not written by any cell in this notebook — confirm where it is produced.
file_path = '/Users/aparnaj8/Box/InTrans/RWRAD_Internal/Final_files_with_variables/Trip_information/RWRAD_15_25_miles_from_home_week_info.csv'

# Read the CSV file into a DataFrame
trip_summary_df= pd.read_csv(file_path)

In [None]:
# Preview the trip summary table
trip_summary_df.head()

In [None]:
# Labelled start/end records; the following cells use its 'time_cat', 'subj', 'drive' and 'labels' columns
file_path = '/Users/aparnaj8/Box/InTrans/RWRAD_Internal/Trip_variable/soura_start_end_rwrad.csv'

# Read the CSV file into a DataFrame
start_df= pd.read_csv(file_path)

In [None]:
# Keep only the trip-start records
is_start_record = start_df['time_cat'] == 'start'
start_df_filtered = start_df[is_start_record]

In [None]:
# Preview the trip-start records
start_df_filtered.head()

In [None]:
# Label table that also carries a 'drive' column (used below as the source of end locations)
file_path = '/Users/aparnaj8/Box/InTrans/RWRAD_Internal/Souradeep_Trip_Info/RWRAD_all_label_wdrive.csv'

# Read the CSV file into a DataFrame
end_df= pd.read_csv(file_path)

In [None]:
# Distinct values of the 'labels' column in end_df
# (the variable is called unique_subj, but it holds label values, not subjects)
unique_subj = end_df['labels'].unique()
unique_subj

In [None]:
# Preview the end-location label table
end_df.head()

In [None]:
# Preview trip_summary_df before the location columns are attached
trip_summary_df.head()

#### Attach start and end home information to the dataframe

In [None]:
# (subj, drive) key of every row in trip_summary_df, used for both look-ups
trip_keys = trip_summary_df.set_index(["subj", "drive"]).index

# (subj, drive) -> labels, taken from the trip-start records
start_location_mapping = start_df_filtered.set_index(["subj", "drive"])["labels"].to_dict()

# (subj, drive) -> labels, taken from end_df
end_location_mapping = end_df.set_index(["subj", "drive"])["labels"].to_dict()

# Look the labels up row by row; drives absent from a mapping come back as NaN
trip_summary_df["start_location"] = trip_keys.map(start_location_mapping)
trip_summary_df["end_location"] = trip_keys.map(end_location_mapping)

# Drives without a label get the literal string "None".
# The filled column is assigned back rather than calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas and does not
# update the frame when Copy-on-Write is enabled.
trip_summary_df["end_location"] = trip_summary_df["end_location"].fillna("None")
trip_summary_df["start_location"] = trip_summary_df["start_location"].fillna("None")

In [None]:
# Preview trip_summary_df with start_location / end_location attached
trip_summary_df.head()

#### Count the drives which started at home but ended somewhere else

In [None]:
started_at_home = trip_summary_df["start_location"] == "home"
ended_at_home = trip_summary_df["end_location"] == "home"

# Distinct (subj, drive) pairs that start at home and end anywhere other than home.
# The variable keeps its historical "_end_none" name, but every non-"home" end
# location is counted, not only the "None" placeholder.
count_start_home_end_none = (
    trip_summary_df[started_at_home & ~ended_at_home]
    .drop_duplicates(subset=["subj", "drive"])
    .shape[0]
)

# Distinct (subj, drive) pairs that both start and end at home
count_start_home_end_home = (
    trip_summary_df[started_at_home & ended_at_home]
    .drop_duplicates(subset=["subj", "drive"])
    .shape[0]
)

# The first message now states the condition that is actually counted
print(f"Count of (start_location = home, end_location != home): {count_start_home_end_none}")
print(f"Count of (start_location = home, end_location = home): {count_start_home_end_home}")


In [None]:
# Row count of trip_summary_df
len(trip_summary_df)

#### Calculate the time difference (stoppage) between consecutive drives


In [None]:
import pandas as pd

# Parse the UTC start/end timestamps so they can be subtracted
for time_column in ("time_start_utc", "time_end_utc"):
    trip_summary_df[time_column] = pd.to_datetime(trip_summary_df[time_column])

# Order the drives within each subject
trip_summary_df = trip_summary_df.sort_values(["subj", "drive"])

# Define a function to compute time differences within groups
def calculate_time_diff(group):
    """Add the gap, in minutes, between each drive and the one before it.

    The rows are ordered by 'drive'; 'time_diff_minutes' is then this row's
    'time_start_utc' minus the previous row's 'time_end_utc'. The first row
    has no predecessor and gets NaN. A new frame is returned; the input is
    left untouched.
    """
    ordered = group.sort_values(by="drive")
    previous_end = ordered["time_end_utc"].shift(1)
    gap = ordered["time_start_utc"] - previous_end
    return ordered.assign(time_diff_minutes=gap.dt.total_seconds() / 60)

# Run calculate_time_diff on each subject's rows and stack the results.
# The groups are iterated explicitly instead of using groupby(...).apply(...): recent
# pandas releases deprecate letting apply operate on the grouping column, and excluding
# it would drop 'subj' from the result.
per_subject_frames = [calculate_time_diff(subject_rows) for _, subject_rows in trip_summary_df.groupby("subj")]
if per_subject_frames:
    trip_summary_df = pd.concat(per_subject_frames)
else:
    # No groups (empty table): still provide the new column
    trip_summary_df = trip_summary_df.assign(time_diff_minutes=pd.Series(dtype="float64"))



In [None]:
# Preview trip_summary_df with time_diff_minutes added
trip_summary_df.head()


In [None]:
# Drives that both start and end at home
home_filter = (trip_summary_df["start_location"] == "home") & (trip_summary_df["end_location"] == "home")
home_round_trips = trip_summary_df.loc[home_filter]

# Index labels of the extreme rows among those drives
min_distance_row = home_round_trips["distance_miles"].idxmin()
max_distance_row = home_round_trips["distance_miles"].idxmax()
min_time_row = home_round_trips["time_diff_minutes"].idxmin()
max_time_row = home_round_trips["time_diff_minutes"].idxmax()

# subj / drive / value for each extreme, keyed by a display title
distance_fields = ["subj", "drive", "distance_miles"]
time_fields = ["subj", "drive", "time_diff_minutes"]
min_max_info = {
    "Min Distance Miles": trip_summary_df.loc[min_distance_row, distance_fields],
    "Max Distance Miles": trip_summary_df.loc[max_distance_row, distance_fields],
    "Min Time Diff": trip_summary_df.loc[min_time_row, time_fields],
    "Max Time Diff": trip_summary_df.loc[max_time_row, time_fields],
}

for key, value in min_max_info.items():
    print(f"{key}: {value}\n")


#### Calculate the trip chain information

In [None]:
import pandas as pd

# Drives must be visited in (subj, drive) order for the chain logic below
trip_summary_df = trip_summary_df.sort_values(by=["subj", "drive"])

# Drives that belong to no chain keep None
trip_summary_df['trip_chain'] = None

# Chain numbers are global: they keep increasing across subjects
global_chain_number = 0

for subj in trip_summary_df['subj'].unique():
    subj_df = trip_summary_df[trip_summary_df['subj'] == subj]

    # Whether a chain opened at home is still waiting for its return leg
    in_chain = False

    for idx, start_loc, end_loc in zip(subj_df.index, subj_df['start_location'], subj_df['end_location']):
        starts_home = start_loc == 'home'
        ends_home = end_loc == 'home'

        if starts_home and ends_home:
            # Home -> home: a chain of its own; it also closes any open chain
            global_chain_number += 1
            trip_summary_df.at[idx, 'trip_chain'] = global_chain_number
            in_chain = False
        elif starts_home:
            # Home -> elsewhere: open a new chain unless one is already open
            if not in_chain:
                global_chain_number += 1
            trip_summary_df.at[idx, 'trip_chain'] = global_chain_number
            in_chain = True
        elif in_chain:
            # Leg that does not start at home while a chain is open: it joins the chain,
            # and the chain closes if this leg ends at home
            trip_summary_df.at[idx, 'trip_chain'] = global_chain_number
            if ends_home:
                in_chain = False
        # Otherwise (no open chain, not starting at home) the drive stays unassigned


In [None]:
# Preview trip_summary_df with the trip_chain column
trip_summary_df.head()

In [None]:
# Save the filtered DataFrame to CSV
#trip_summary_df.to_csv('trip_chain.csv', index=False)

### Take into account a dwell time of 30 min to terminate the trip

In [None]:
# NOTE(review): the user folder here is 'aparn', whereas the other paths in this notebook use 'aparnaj8' — confirm the path.
# NOTE(review): the file name matches the 'trip_chain_v2.csv' written by the last cell of this notebook,
# so this cell may be reading output of an earlier run — confirm.
file_path = '/Users/aparn/Box/InTrans/RWRAD_Internal/Final_files_with_variables/Trip_chain/trip_chain_v2.csv'

# Read the CSV file into a DataFrame (replaces the in-memory trip_summary_df)
trip_summary_df= pd.read_csv(file_path)

In [None]:
# Preview the first ten rows of the reloaded table
trip_summary_df.head(10)

In [None]:
# Narrow view of the columns relevant to chain checking.
# Note: 'trip_chain_check' is also the name of a column created in a later cell; this variable is a DataFrame.
trip_chain_check = trip_summary_df[['subj', 'drive', 'start_location', 'end_location', 'time_diff_minutes', 'trip_chain']]

In [None]:
import pandas as pd
import numpy as np

# Work on a copy so trip_summary_df keeps its original time_diff_minutes
updated_df = trip_summary_df.copy()

# Chains that contain more than one drive (value_counts ignores NaN chain ids)
trip_chain_counts = updated_df['trip_chain'].value_counts()
trip_chains_to_modify = trip_chain_counts[trip_chain_counts > 1].index

# For each of those chains, blank the gap of its first drive: that gap is measured
# against the drive before the chain started, not against a drive inside the chain.
# Done with one boolean mask instead of re-filtering the whole frame once per chain.
in_multi_drive_chain = updated_df['trip_chain'].isin(trip_chains_to_modify)
is_first_of_chain = ~updated_df['trip_chain'].duplicated(keep='first')
updated_df.loc[in_multi_drive_chain & is_first_of_chain, 'time_diff_minutes'] = np.nan

In [None]:
# Preview updated_df after blanking the first gap of each multi-drive chain
updated_df.head()

In [None]:
# A gap longer than this many minutes between drives invalidates the chain they belong to
DWELL_LIMIT_MINUTES = 30

# Start from a copy of the chain ids
updated_df['trip_chain_check'] = updated_df['trip_chain']

# Chains that contain more than one drive (value_counts ignores NaN chain ids)
trip_chain_counts = updated_df['trip_chain'].value_counts()
duplicate_trip_chains = trip_chain_counts[trip_chain_counts > 1].index

# Chains in which at least one drive has a gap above the limit (NaN gaps compare as False)
gap_too_long = updated_df['time_diff_minutes'] > DWELL_LIMIT_MINUTES
chains_with_long_dwell = updated_df.loc[gap_too_long, 'trip_chain'].dropna().unique()

# Clear trip_chain_check for every drive of a multi-drive chain that has such a gap.
# Done with one boolean mask instead of re-filtering the whole frame once per chain.
chain_ids = updated_df['trip_chain']
to_clear = chain_ids.isin(duplicate_trip_chains) & chain_ids.isin(chains_with_long_dwell)
updated_df.loc[to_clear, 'trip_chain_check'] = np.nan



In [None]:
# Preview updated_df with the trip_chain_check column
updated_df.head()

In [None]:
# Number of distinct non-null trip_chain_check values
updated_df['trip_chain_check'].nunique()

In [None]:
# Copy trip_chain_check back onto trip_summary_df.
# updated_df was created as a copy of trip_summary_df, so both share the same row index and
# the column can be assigned directly (aligned on that index). Unlike a key-based merge, this
# can be re-run, does not create trip_chain_check_x/_y columns when trip_summary_df already
# has the column, and cannot multiply rows when (subj, drive, trip_chain) keys repeat.
if 'trip_chain_check' in updated_df.columns:
    trip_summary_df['trip_chain_check'] = updated_df['trip_chain_check']


In [None]:
# Number of distinct non-null trip_chain_check values now in trip_summary_df
trip_summary_df['trip_chain_check'].nunique()

In [None]:
# Save the filtered DataFrame to CSV
#trip_summary_df.to_csv('trip_chain_dwell_time.csv', index=False)

## Correct the 15-mile / 25-mile from home variables (there are some more drives which start from home)

In [None]:
# Set label to 'home' for drives whose start_location is 'home' (case-insensitive, whole value).
# fullmatch is used instead of a substring search so that values which merely contain
# the text "home" (for example 'not_home') are not relabelled, in line with the exact
# == 'home' comparisons used in the trip-chain cells.
starts_at_home = trip_summary_df['start_location'].str.fullmatch('home', case=False, na=False)
trip_summary_df.loc[starts_at_home, 'label'] = 'home'


In [None]:
import numpy as np

# Recompute the flags with the corrected labels. For each threshold:
#   'yes' -> label is 'home' and distance_miles is below the threshold
#   'no'  -> label is 'home' otherwise (including a missing distance_miles)
#   NaN   -> label is not 'home'
# Computed column-wise instead of with a row-by-row apply repeated per threshold.
starts_at_home = trip_summary_df['label'] == 'home'
for threshold_miles in (15, 25):
    within_threshold = trip_summary_df['distance_miles'].lt(threshold_miles).map({True: 'yes', False: 'no'})
    # where() blanks out (NaN) every row whose label is not 'home'
    trip_summary_df[f'{threshold_miles}_miles_from_home'] = within_threshold.where(starts_at_home)



In [None]:
# Write the final trip table (chains, dwell check, corrected flags) to a CSV in the current working directory
trip_summary_df.to_csv('trip_chain_v2.csv', index=False)