# String Column Concatenation

This notebook demonstrates how to concatenate two string columns in a pandas DataFrame, handling cases where the second column might be empty or contain NaN values.

In [None]:
import time

import numpy as np
import pandas as pd

## 1. Create Sample Data

In [None]:
# Demo frame: 'second_column' deliberately mixes real text with an empty
# string and a missing value (None) so each method's handling can be compared.
sample_data = {
    'first_column': ['Apple', 'Banana', 'Cherry', 'Date', 'Elderberry'],
    'second_column': ['Red', '', 'Sweet', None, 'Purple'],
    'other_data': [1, 2, 3, 4, 5],
}
df = pd.DataFrame(data=sample_data)

# Show the frame, then its dimensions and per-column dtypes.
for report_item in (
    "Original DataFrame:",
    df,
    f"\nDataFrame shape: {df.shape}",
    f"Data types:\n{df.dtypes}",
):
    print(report_item)

In [None]:
# Count how many entries of the second column are missing, blank, or usable.
second_values = df['second_column']
missing_count = second_values.isnull().sum()
blank_count = (second_values == '').sum()
usable_count = ((second_values != '') & second_values.notna()).sum()

print("Analysis of second_column:")
print(f"Null values: {missing_count}")
print(f"Empty strings: {blank_count}")
print(f"Non-empty, non-null values: {usable_count}")

# dropna=False keeps the missing entry in the tally instead of hiding it.
print("\nUnique values in second_column:")
print(second_values.value_counts(dropna=False))

## 2. Method 1: Simple Concatenation (Basic)

In [None]:
# Naive '+' concatenation: an empty second value simply contributes nothing,
# but a missing second value (None/NaN) makes the whole result NaN.
df_method1 = df.assign(
    concatenated_basic=df['first_column'] + df['second_column']
)

method1_view = df_method1[['first_column', 'second_column', 'concatenated_basic']]
print("Method 1 - Simple Concatenation:")
print(method1_view)

## 3. Method 2: Concatenation with Space Separator

In [None]:
# '+' with a space separator: an empty second value leaves a trailing space,
# and a missing second value (None/NaN) still turns the whole result into NaN.
df_method2 = df.assign(
    concatenated_space=df['first_column'] + ' ' + df['second_column']
)

method2_view = df_method2[['first_column', 'second_column', 'concatenated_space']]
print("Method 2 - With Space Separator:")
print(method2_view)

## 4. Method 3: Smart Concatenation (Recommended)

In [None]:
# Smart concatenation that handles empty strings and NaN values properly
def smart_concatenate(row, separator=' ', first_col='first_column', second_col='second_column'):
    """
    Concatenate two fields of a row, skipping any that are empty or missing.

    - If the second field is empty or NaN/None, only the first is returned.
    - If the first field is empty or NaN/None, only the second is returned
      (no dangling leading separator).
    - If both are present, they are joined with ``separator``.
    - If neither is present, an empty string is returned.

    Parameters:
    row: mapping-like row (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
    separator: string placed between the two values (default: space)
    first_col: key of the first field (default: 'first_column')
    second_col: key of the second field (default: 'second_column')

    Returns:
    The concatenated string.
    """
    def _as_text(value):
        # Missing values and empty strings both count as "nothing to add".
        if pd.isna(value) or value == '':
            return ''
        return str(value)

    parts = (_as_text(row[first_col]), _as_text(row[second_col]))
    # Joining only the non-empty parts means the separator appears
    # exactly when both sides have content.
    return separator.join(part for part in parts if part)

# Row-wise application: axis=1 hands each row to smart_concatenate in turn.
df_method3 = df.assign(concatenated_smart=df.apply(smart_concatenate, axis=1))

method3_view = df_method3[['first_column', 'second_column', 'concatenated_smart']]
print("Method 3 - Smart Concatenation:")
print(method3_view)

## 5. Method 4: Using str.cat() with Proper Handling

In [None]:
# str.cat() variant: normalise blanks to NaN first so that a single
# na_rep='' rule covers both "empty" and "missing" second values.
df_method4 = df.assign(
    second_column_clean=df['second_column'].replace('', np.nan)
)

joined_with_space = df_method4['first_column'].str.cat(
    df_method4['second_column_clean'],
    sep=' ',
    na_rep='',
)
# The separator is still emitted when the second value is blank, so strip
# the surrounding whitespace that this leaves behind.
df_method4['concatenated_str_cat'] = joined_with_space.str.strip()

method4_view = df_method4[['first_column', 'second_column', 'concatenated_str_cat']]
print("Method 4 - Using str.cat():")
print(method4_view)

## 6. Method 5: Vectorized Approach (Fast for Large DataFrames)

In [None]:
# Vectorised alternative: build one boolean mask, then let np.where pick
# between "first only" and "first + ' ' + second" for every row at once.
df_method5 = df.copy()
base_text = df_method5['first_column']
suffix_text = df_method5['second_column']

# True wherever the second column is missing or an empty string.
second_is_empty = suffix_text.isna() | suffix_text.eq('')

df_method5['concatenated_vectorized'] = np.where(
    second_is_empty,
    base_text,
    base_text + ' ' + suffix_text,
)

method5_view = df_method5[['first_column', 'second_column', 'concatenated_vectorized']]
print("Method 5 - Vectorized Approach:")
print(method5_view)

## 7. Compare All Methods

In [None]:
# Side-by-side view: start from the two input columns and append each
# method's output column under a method-specific name.
comparison_df = df[['first_column', 'second_column']].assign(
    method1_basic=df_method1['concatenated_basic'],
    method2_space=df_method2['concatenated_space'],
    method3_smart=df_method3['concatenated_smart'],
    method4_str_cat=df_method4['concatenated_str_cat'],
    method5_vectorized=df_method5['concatenated_vectorized'],
)

print("Comparison of All Methods:")
print(comparison_df)

## 8. Recommended Solution for Your Use Case

In [None]:
# RECOMMENDED: Method 5 (Vectorized) - Fast and handles empty values properly
def concatenate_columns_safe(df, col1, col2, new_col_name, separator=' '):
    """
    Safely concatenate two columns as text, handling empty and NaN values.

    A value counts as "absent" when it is NaN/None or an empty string.
    For each row:
    - both present  -> ``col1 + separator + col2``
    - only one present -> that value alone (no dangling separator)
    - neither present -> empty string

    Absent values are never rendered as the literal text 'nan' or 'None'.

    Parameters:
    df: pandas DataFrame
    col1: name of first column
    col2: name of second column
    new_col_name: name for the new concatenated column
    separator: string to use between columns (default: space)

    Returns:
    A copy of ``df`` with the new concatenated column; ``df`` itself is
    not modified.
    """
    # Work on a copy so the caller's frame is left untouched.
    result_df = df.copy()

    first = result_df[col1]
    second = result_df[col2]

    # Row-wise flags for "nothing usable in this column".
    first_is_empty = first.isna() | (first == '')
    second_is_empty = second.isna() | (second == '')

    # astype(str) alone would turn missing values into 'nan'/'None',
    # so blank those positions out explicitly.
    first_text = first.astype(str).mask(first_is_empty, '')
    second_text = second.astype(str).mask(second_is_empty, '')

    # Only insert the separator when both sides actually have content.
    result_df[new_col_name] = np.where(
        second_is_empty,
        first_text,
        np.where(
            first_is_empty,
            second_text,
            first_text + separator + second_text,
        ),
    )

    return result_df

# Run the recommended helper on the sample frame; the combined text is
# written to a new 'final_result' column of the returned copy.
final_df = concatenate_columns_safe(
    df,
    col1='first_column',
    col2='second_column',
    new_col_name='final_result',
)

final_view = final_df[['first_column', 'second_column', 'final_result']]
print("RECOMMENDED SOLUTION:")
print(final_view)

## 9. Load Your Own Data

In [None]:
# Template for loading your own CSV data
# Uncomment and modify the following lines to use with your data:

# # Load your CSV file
# your_df = pd.read_csv('your_file.csv')
# 
# # Check the data
# print("Your DataFrame:")
# print(your_df.head())
# print(f"\nColumns: {list(your_df.columns)}")
# 
# # Apply concatenation (replace 'col1' and 'col2' with your actual column names)
# result_df = concatenate_columns_safe(your_df, 'col1', 'col2', 'concatenated_column')
# 
# # Save the result
# result_df.to_csv('output_with_concatenated_column.csv', index=False)
# print("\nResult saved to: output_with_concatenated_column.csv")

## 10. Performance Test (Optional)

In [None]:
# Performance test with a larger dataset.
# `time` comes from the notebook's import cell at the top.

# Build 10,000 rows. Every third row has no usable suffix: multiples of 6
# get an empty string, the remaining multiples of 3 get None.
large_data = {
    'first_column': ['Text' + str(i) for i in range(10000)],
    'second_column': ['Suffix' + str(i) if i % 3 != 0 else ('' if i % 6 == 0 else None) for i in range(10000)]
}
large_df = pd.DataFrame(large_data)

print(f"Testing with {len(large_df)} rows...")

# time.perf_counter() is the clock intended for measuring short intervals;
# time.time() is wall-clock time and can be too coarse for a fast operation.

# Test Method 3 (apply)
start_time = time.perf_counter()
result3 = large_df.apply(smart_concatenate, axis=1)
time3 = time.perf_counter() - start_time

# Test Method 5 (vectorized)
start_time = time.perf_counter()
second_is_empty = (large_df['second_column'].isna()) | (large_df['second_column'] == '')
result5 = np.where(
    second_is_empty,
    large_df['first_column'],
    large_df['first_column'] + ' ' + large_df['second_column']
)
time5 = time.perf_counter() - start_time

# A speed comparison is only meaningful if both approaches agree.
assert result3.tolist() == list(result5), "Method 3 and Method 5 produced different results"

print("\nPerformance Results:")
print(f"Method 3 (apply): {time3:.4f} seconds")
print(f"Method 5 (vectorized): {time5:.4f} seconds")
# Guard the ratio: a zero elapsed time would raise ZeroDivisionError.
if time5 > 0:
    print(f"Vectorized is {time3/time5:.1f}x faster")
else:
    print("Vectorized run was too fast to measure a speed-up ratio")