# Fuzz Flow

In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta

!pip install thefuzz
from thefuzz import process
from thefuzz import fuzz
import re


import matplotlib.pyplot as plt




In [2]:
# Load TELEGRAM DATASET
data3 = pd.read_csv('from_yannik/data_handover_for_team.csv') # insert path
# Notice the .copy() to copy the values 
data3 = data3.copy()

# Load STATIONS DATAFRAME
df = pd.read_csv('s_u_stations_fixed_with_keys_20230830.csv')  # Replace with the path to your database file
# Notice the .copy() to copy the values 
df = df.copy()
stations_full = list(df['keys'].values)

In [3]:
# create a dictionary where U/S bahn line names are the keys and the respective stations are the values incl. lat & lon
output = {'station_key': [], 'line': []}
for idx,row in df.iterrows():
    line_split = row['lines'].split(', ')    
    for i in line_split:
        output['station_key'].append(row['keys'])
        output['line'].append(i)
station_to_line = pd.DataFrame(output)
station_to_line = station_to_line.drop_duplicates()

lines_un = list(station_to_line['line'].unique())

In [4]:
def identify_station_precise(some_string, confidence_first=80, confidence_second=90):
    res1 = None
    res2 = None
    if some_string[1][1] > confidence_second:
        res1 = some_string[1][0]
        return some_string[0][0], some_string[1][0]
    elif some_string[0][1] > confidence_first: #try 79 or 89 and other, better less lines but better quality
        return some_string[0][0]
    return None

In [5]:
def station_finder(some_string):
    for line in lines_un:
        matches = re.search(r'{line}[^0-9]'.format(line=line.lower()), some_string)
        if matches is not None:
            stations = list(station_to_line[station_to_line["line"]==line]["station_key"])
            out = process.extract(some_string, stations, limit=2, scorer=fuzz.partial_ratio)
            return identify_station_precise(out, 70, 70)
    out = process.extract(some_string, stations_full, limit=2, scorer=fuzz.partial_ratio)
    return identify_station_precise(out)

In [6]:
df_chat = data3[["date"]]

In [7]:
df_chat["station_key"] = data3["text"].map(station_finder)

Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '̈']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_chat["station_key"] = data3["text"].map(station_finder)


In [8]:
df_chat["text"] = data3["text"]
df_chat.dropna(subset="station_key", inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_chat["text"] = data3["text"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_chat.dropna(subset="station_key", inplace = True)


In [9]:
df_chat.tail(30)

Unnamed: 0,date,station_key,text
138710,2023-08-18 07:55:24,"(rudow, hermannplatz)",u7 hermanplazy 4 controller the rudow direction
138711,2023-08-18 08:19:21,"(alex, strausberg)",u 5 strausberger platz richtung alex
138712,2023-08-18 08:21:51,"(innsbrucker platz, suedkreuz)",innsbruker platz s42 direction südkreuz 2 more...
138713,2023-08-18 08:24:50,hauptbahnhof,u5 linden richtung hauptbahnhof blauweissen b...
138714,2023-08-18 08:29:37,platz der luftbruecke,u6 gerade platz luftbrücke 3x bos
138715,2023-08-18 08:30:19,tempelhof,s41 richtung tempelhof poc normal angezogen
138716,2023-08-18 08:42:45,"(mehringdamm, rathaus spandau)",3 menschen uniform bahnsteig mehringdamm u6 ri...
138717,2023-08-18 08:51:48,"(mahlsdorf, westkreuz)",zwei kontrolletis s5 nach westkreuz gerade ne...
138718,2023-08-18 08:58:07,"(kaulsdorf nord, weberwiese)",u5 richtung kaulsdorf nord gerade weberwiese ...
138719,2023-08-18 08:58:24,"(parchimer allee, grenzallee)",2 bos u7 parchimer allee


In [10]:
full_df = df_chat.merge(df, left_on="station_key", right_on="keys")

In [11]:
full_df = full_df.set_index("date")

In [12]:
full_df.drop(columns="Unnamed: 0", inplace=True)
full_df.drop(columns="keys", inplace=True)

In [15]:
full_df = full_df.sort_index(ascending=True)

In [16]:
full_df.tail(40)

Unnamed: 0_level_0,station_key,text,station name,lines,area,latitude,longitude
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-08-17 12:35:11,landsberger allee,1424 uhr weitergeleitet landsberger allee r...,Landsberger Allee,"S41, S42, S8, S85",PrenzlauerBerg,52.529444,13.454722
2023-08-17 12:44:50,wuhletal,s u wuhletal 2 leute bos,Wuhletal,"U5, S5",Kaulsdorf,52.5125,13.575
2023-08-17 12:49:36,karlshorst,2x s3 richtung stadtmitte gerade karlshorst e...,Karlshorst,S3,Karlshorst,52.480833,13.525833
2023-08-17 13:11:26,hauptbahnhof,zwei kontors jz hauptbahnhof cityline,Hauptbahnhof,"U5, S3, S5, S7, S9",Moabit,52.525,13.369444
2023-08-17 13:20:38,noeldnerplatz,gerade nöldnerplatz ausgestiegen 2 frauen auf...,Nöldnerplatz,"S5, S7, S75",Rummelsburg,52.503889,13.485278
2023-08-17 13:50:14,bellevue,sbahn bellevue 3 people,Bellevue,"S3, S5, S7, S9",Hansaviertel,52.52,13.347778
2023-08-17 14:07:39,hermannstr,kontrolle sbahn hermannstrasse,Hermannstraße,"U8, S41, S42, S45, S46, S47",Neukölln,52.4675,13.43125
2023-08-17 14:54:33,ostkreuz,neukölln station towards ostkreuz one guy wit...,Ostkreuz,"S3, S41, S42, S5, S7, S75, S8, S85",Friedrichshain,52.503056,13.468889
2023-08-17 15:01:12,ostkreuz,gruppe polizei oben bei ostkreuz,Ostkreuz,"S3, S41, S42, S5, S7, S75, S8, S85",Friedrichshain,52.503056,13.468889
2023-08-17 15:01:31,tempelhof,falls wer gerade nähe s tempelhof 4 db sicher...,Tempelhof,"U6, S41, S42, S45, S46",Tempelhof,52.470833,13.385278


In [18]:
full_df.to_csv("./data_20230830.csv")