In [1]:
!/opt/homebrew/Cellar/jupyterlab/4.4.2_1/libexec/bin/python -m pip install pandas


The system cannot find the path specified.


In [2]:
import pandas as pd
import unicodedata
import re

def normalize_text(text):
    """Fold text to a lowercase, ASCII-only form for comparing station names.

    Turkish letters are mapped to their closest ASCII letter (ş→s, ı→i, ğ→g,
    ü→u, ç→c, ö→o), any remaining diacritics are removed, and surrounding
    whitespace is stripped.

    Args:
        text: Value to normalize.

    Returns:
        The normalized string, or ``text`` unchanged when it is not a ``str``
        (for example a NaN coming from a DataFrame cell).
    """
    if not isinstance(text, str):
        return text

    # The Turkish letters must be folded BEFORE the ASCII encode below.
    # Dotless 'ı' (U+0131) has no Unicode decomposition, so NFKD + 'ignore'
    # would delete it outright ("Kısıklı" -> "kskl") instead of producing 'i'.
    turkish_to_ascii = str.maketrans({
        '\u015f': 's',  # ş
        '\u0131': 'i',  # ı (dotless i)
        '\u011f': 'g',  # ğ
        '\u00fc': 'u',  # ü
        '\u00e7': 'c',  # ç
        '\u00f6': 'o',  # ö
    })
    folded = text.lower().translate(turkish_to_ascii)

    # NFKD splits any remaining accented letters into base letter + combining
    # mark; encoding with 'ignore' then drops the marks (this also removes the
    # combining dot left behind by lower-casing 'İ').
    ascii_only = unicodedata.normalize('NFKD', folded).encode('ASCII', 'ignore').decode('ASCII')
    return ascii_only.strip()



In [3]:
def clean_station_name(row, station_list, line_map):
    """Map a raw station description onto a reference station name.

    Matching proceeds in this order:
      1. A hand-maintained table of known raw spellings.
      2. Parenthesised text and one trailing direction/entrance/number suffix
         per pattern are removed, then the result is compared against the
         reference stations of the row's line.
      3. The same comparison against every reference station.

    Within steps 2 and 3 an exact match on any station is preferred over a
    match on one part of a hyphenated name, which in turn is preferred over a
    loose substring match.

    Args:
        row: Mapping-like record providing 'station_poi_desc_cd' and
            'line_name'.
        station_list: Iterable of (station name, line code) pairs. Line codes
            are compared against the upper-cased 'line_name' of the row.
        line_map: Container of known line codes; only membership is tested.

    Returns:
        The matched reference station name. If nothing matches, the cleaned
        raw name. If the station value is not a string, or nothing is left of
        it after cleaning, the original value is returned unchanged.
    """
    station = row['station_poi_desc_cd']
    if not isinstance(station, str):
        return station

    # Read the line only after the guard above, and tolerate a missing line
    # name: such rows are simply matched against every reference station.
    raw_line = row['line_name']
    line_name = raw_line.upper() if isinstance(raw_line, str) else None

    # Known raw spellings that the generic matching cannot resolve. Keys are
    # compared through normalize_text (which strips whitespace), so variants
    # differing only in surrounding spaces need no separate entry.
    explicit_mappings = {
        'KISIKLI': 'Kısıklı',
        'HACIOSMAN': 'Hacıosman',
        'KADIKOY': 'Kadıköy',
        'ACIBADEM': 'Acıbadem',
        'IMAM HATIP LISESI': 'İmam H. Lisesi',
        'SOGANLIK': 'Soğanlık',
        'ITU GUNEY': 'İTÜ-Ayazağa',
        'SANAYI MAH. GUNEY': 'Sanayi Mahallesi',
        'SISLI GUNEY': 'Şişli-Mecidiyeköy',
        'IHLAMUR KUYU': 'Ihlamurkuyu',
        'AYRILIKCESME': 'Ayrılık Çeşmesi',
        'YAKACIK': 'Yakacık',
        'NECİP FAZIL': 'Necip Fazıl',
        'SAGMALCILAR': 'Sağmalcılar',
        'BAKIRKOY': 'Bakırköy',
        'BAGCILAR MEYDAN': 'Bağcılar Meydan',
        'SANAYI MAH.': 'Sanayi Mahallesi'
    }

    # Normalize the station once instead of once per table entry.
    station_key = normalize_text(station.upper().strip())
    for raw_spelling, canonical in explicit_mappings.items():
        if normalize_text(raw_spelling) == station_key:
            return canonical

    # Drop parenthesised remarks, then a trailing direction / entrance /
    # number suffix (each pattern is applied once).
    cleaned = re.sub(r'\s*\([^)]*\)', '', station, flags=re.IGNORECASE)
    suffixes = [
        # Every direction is listed in Turkish and ASCII spelling.
        r'\s*(güney|guney|kuzey|batı|bati|doğu|dogu|giriş|giris|çıkış|cikis|\d+)\s*$',
        r'\s*(south|north|west|east|entry|exit|\d+)\s*$'
    ]
    for suffix in suffixes:
        cleaned = re.sub(suffix, '', cleaned, flags=re.IGNORECASE)

    normalized_cleaned = normalize_text(cleaned)
    if not normalized_cleaned:
        # An empty string is a substring of every reference name, so it would
        # "match" the first station. Keep the raw value so the row stays
        # traceable instead.
        return station

    all_stations = [name for name, _ in station_list]
    if line_name in line_map:
        candidate_stations = [name for name, line in station_list if line == line_name]
    else:
        candidate_stations = all_stations

    match = _match_reference_station(normalized_cleaned, candidate_stations)
    if match is None:
        # Nothing on the row's own line: fall back to the whole network.
        match = _match_reference_station(normalized_cleaned, all_stations)

    return cleaned if match is None else match


def _match_reference_station(normalized_name, reference_names):
    """Return the best reference station for an already-normalized name, or None.

    Each tier is checked across ALL references before the next, looser tier is
    tried, so a loose hit on an early station cannot shadow an exact hit on a
    later one.
    """
    # Non-string references (e.g. NaN from a blank CSV cell) cannot match.
    normalized_refs = [
        (ref, normalize_text(ref)) for ref in reference_names if isinstance(ref, str)
    ]

    # Tier 1: exact match on the whole name.
    for ref, normalized_ref in normalized_refs:
        if normalized_ref == normalized_name:
            return ref

    # Tier 2: exact match on one part of a hyphenated name.
    for ref, _ in normalized_refs:
        if '-' in ref and any(
            normalize_text(part) == normalized_name for part in ref.split('-')
        ):
            return ref

    # Tier 3: substring containment in either direction. Empty references are
    # skipped because '' is contained in every string.
    for ref, normalized_ref in normalized_refs:
        if normalized_ref and (
            normalized_name in normalized_ref or normalized_ref in normalized_name
        ):
            return ref

    return None



In [4]:
def clean_transport_data(input_file, stations_file, output_file, valid_lines=None):
    """Clean a transport CSV and write the rail-only, station-normalized result.

    Steps:
      1. Read the input CSV and the reference stations CSV (which must provide
         'Station Name' and 'Line Code' columns).
      2. Drop a fixed set of columns, where present.
      3. Keep rows whose 'road_type' is 'RAYLI' and whose 'line_name' is one
         of ``valid_lines`` (both compared case-insensitively).
      4. Replace 'station_poi_desc_cd' with the result of clean_station_name.
      5. Write the result to ``output_file`` as CSV without the index.

    Args:
        input_file: Path or buffer of the CSV to clean.
        stations_file: Path or buffer of the reference stations CSV.
        output_file: Destination for the cleaned CSV. When it is a path, its
            parent directory is created if missing.
        valid_lines: Optional iterable of line codes to keep. Defaults to
            M1-M9 and M11.

    Returns:
        The cleaned DataFrame.
    """
    import os  # local import: only needed to prepare the output directory

    if valid_lines is None:
        valid_lines = ['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'M11']
    valid_lines = [str(code).upper() for code in valid_lines]

    df = pd.read_csv(input_file)
    stations_df = pd.read_csv(stations_file)

    # (station name, upper-cased line code) pairs, in file order.
    station_list = list(zip(stations_df['Station Name'], stations_df['Line Code'].str.upper()))

    # Line code -> station names on that line.
    line_map = {}
    for station, line in station_list:
        line_map.setdefault(line, []).append(station)

    columns_to_remove = [
        'transport_type_id', 'transfer_type', 'number_of_passage',
        'product_kind', 'transaction_type_desc', 'town'
    ]
    df = df.drop(columns=[col for col in columns_to_remove if col in df.columns])

    df = df[df['road_type'].str.upper() == 'RAYLI']
    # .copy() makes the filtered frame independent, so the column assignment
    # below does not write into a slice of the frame that was read.
    df = df[df['line_name'].str.upper().isin(valid_lines)].copy()

    # On an empty frame, apply(axis=1) returns a DataFrame rather than a
    # Series and the column assignment would raise; there is nothing to clean
    # in that case anyway.
    if not df.empty:
        df['station_poi_desc_cd'] = df.apply(
            clean_station_name, axis=1, args=(station_list, line_map)
        )

    # to_csv does not create missing directories. Buffers are left alone.
    if isinstance(output_file, (str, os.PathLike)):
        output_dir = os.path.dirname(os.fspath(output_file))
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_file, index=False)
    print(f"Cleaned data saved to {output_file}")

    return df


In [5]:
# File locations, relative to the notebook's working directory.
input_file = "../Data/september2024.csv"
stations_file = "../Data/metrostationsbyorder.csv"
output_file = "../Data/output/september2024cleaned_transport_data.csv"

# Run the full cleaning pipeline, then report how many records remain.
cleaned_df = clean_transport_data(
    input_file=input_file,
    stations_file=stations_file,
    output_file=output_file,
)
print(
    "Data cleaning completed.",
    f"Number of records after cleaning: {len(cleaned_df)}",
    sep="\n",
)

Cleaned data saved to ../Data/output/september2024cleaned_transport_data.csv
Data cleaning completed.
Number of records after cleaning: 614229
