In [19]:
### all of the imports go into requirements!!!

import pandas as pd
import numpy as np
import string
from datetime import timedelta
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk

nltk.download("punkt")
nltk.download("stopwords")
from ticket_control.params import path_to_data

# Load your existing database into a DataFrame
# Use flexible path so that it works on everyone's environment

# Chris Notes: Functions are applied on data. Not good practice to load the data inside of functions.
data = pd.read_csv(str(path_to_data) + "/database_telegram.csv", low_memory=False)


##Chris Notes: Define the input of functions and declare their datatype.
def data_preprocessing(data: pd.DataFrame):
    # Provide a Doc String why we have this function and what it does in simple terms.
    """This function is the first step in our Datapreprocessing pipeline. It takes the Telegram Database with the columns...."""

    # Notice the .copy() to copy the values
    data = data.copy()

    # replace sender type with str type
    data["sender"] = data["sender"].astype(str)
    
    data["date"] = data["date"].str.strip("+00:00").str[0:16]
    data["date"] = pd.to_datetime(data["date"], errors="coerce")
    # first round of cleaning na/empty strings/...
    data = data[data["text"].notna()]
    data["text"] = data["text"].str.strip()
    data["text"].replace("", np.nan, inplace=True)
    data.dropna(subset=["text"], inplace=True)

    # sorting values by sender & date
    df = data.sort_values(by=["sender", "date"])

    # creating a time difference
    df["time_diff"] = df.groupby("sender")["date"].diff()

    #
    data_clean = (
        df.groupby(["sender", (df["time_diff"] > timedelta(minutes=10)).cumsum()])
        .agg({"text": " ".join, "date": "first"})
        .reset_index()
    )

    data_clean["sender"] = data_clean["sender"].astype(str)
    # Chris Notes: Always import at the start of the Module.
    import re

    def remove_emojis(data):
        emoj = re.compile(
            "["
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
            "\U00002500-\U00002BEF"  # chinese char
            "\U00002702-\U000027B0"
            "\U00002702-\U000027B0"
            "\U000024C2-\U0001F251"
            "\U0001f926-\U0001f937"
            "\U00010000-\U0010ffff"
            "\u2640-\u2642"
            "\u2600-\u2B55"
            "\u200d"
            "\u23cf"
            "\u23e9"
            "\u231a"
            "\ufe0f"  # dingbats
            "\u3030"
            "]+",
            re.UNICODE,
        )
        return re.sub(emoj, "", data)

    data_clean["text"] = data_clean["text"].apply(lambda x: remove_emojis(str(x)))

    # second round of cleaning na/empty strings/...
    data_clean = data_clean[data_clean["text"].notna()]
    data_clean["text"] = data_clean["text"].str.strip()
    data_clean["text"].replace("", np.nan, inplace=True)
    data_clean.dropna(subset=["text"], inplace=True)

    data_clean = data_clean.drop_duplicates()

    # lowercasing all strings
    data_clean["text"] = data_clean["text"].apply(lambda x: x.lower())

    # generating list of default stop words
    stop_words = set(stopwords.words("german"))

    # add multiple words using 'update'
    new_words_to_add = [
        "männlich",
        "weiblich",
        "gelesen",
        "weste",
        "westen",
        "shirt",
        "pulli",
        "jacke",
        "jacken",
        "ticket",
        "tickets",
        "eingestiegen",
        "ausgestiegen",
        "steigen",
        "schwarze",
        "schwarz",
        "männer",
        "haare",
        "the",
        "stehen",
        "gelesene",
        "blaue",
        "with",
        "wertend",
        "fahrgaesten",
        "fahrgaeste",
        "fahrgästen",
        "fahrgästen",
        "westlichen",
        "warnwesten",
        "gelbwesten",
        "abwertend",
        "blauwesten",
        "fahrgaesten",
        "wertende",
        "besten",
        "nichtwertende",
        "wuetend",
        "wütend",
        "wuetend",
        "wuetenden",
        "genau",
        "sicher",
        "ungenau",
        "sicherheitswesten",
        "westentraeger",
    ]
    stop_words.update(new_words_to_add)

    # Remove unwanted stopwords
    my_wanted_words = ["nach", "bei", "von", "vom" "zum", "über", "bis"]
    final_stopwords = stop_words - set(my_wanted_words)

    # Chris Notes: Not best practice to define functions inside of funcitons. Better to keep the definition separate and call the function within other functions.
    def stopword(text):
        word_tokens = word_tokenize(text)
        text = [
            w for w in word_tokens if not w in final_stopwords
        ]  ## if w isn't in final_stopwords, return w
        return " ".join(text)  ##transforming list into string again

    data_clean["text"] = data_clean["text"].apply(stopword)

    # removing punctuation
    for element in string.punctuation:
        data_clean["text"] = data_clean["text"].str.replace(element, "")

    # third round of cleaning na/empty strings/...
    data_clean["text"] = data_clean["text"].replace("", np.nan)
    data_clean["text"] = data_clean["text"].str.strip()
    data_clean.dropna(subset="text", inplace=True)
    data_clean = data_clean.drop_duplicates(subset="text")
    data_clean = data_clean[data_clean["text"] != ""]
    data_clean.dropna(subset="text", inplace=True)

    # replacing unwanted characters and words
    data_clean["text"] = data_clean["text"].str.replace("ß", "ss")
    data_clean["text"] = data_clean["text"].str.replace("ä", "ae")
    data_clean["text"] = data_clean["text"].str.replace("ö", "oe")
    data_clean["text"] = data_clean["text"].str.replace("ü", "ue")
    data_clean["text"] = data_clean["text"].str.replace("strasse", "str")
    data_clean["text"] = data_clean["text"].str.replace("alexanderplatz", "alex")
    data_clean["text"] = data_clean["text"].str.replace("zoologischer garten", "zoo")
    data_clean["text"] = data_clean["text"].str.replace("kottbusser", "kotti")
    data_clean["text"] = data_clean["text"].str.replace("goerlitzer", "goerli")

    # final sorting
    data_clean = data_clean.sort_values(by=["date", "sender"])

    # converting into "handover" file
    ##Chris Notes: Assign you objects names that indicate their type and state in the process.
    df_for_fuzzy_matching = data_clean.drop("time_diff", axis=1)

    return df_for_fuzzy_matching


[nltk_data] Downloading package punkt to /home/chris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/chris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
import pandas as pd
import numpy as np
from datetime import timedelta
from thefuzz import process
from thefuzz import fuzz
import re
from ticket_control.data_preprocessing import data_preprocessing
from ticket_control.params import path_to_data
import matplotlib.pyplot as plt


# Chris Notes: Name your variables after the type of the variable and their purpose in the pipeline.
df_station_mapping = pd.read_csv(
    str(path_to_data) + "/s_u_stations_fixed_with_keys_20230830.csv"
)  # Replace with the path to your database file


# Christ Notes: Better to define the function outside of the function. We want to create individual units of code that are testable and serve a single purpose.
def create_station_to_line_df(df_station_mapping: pd.DataFrame):
    # df = pd.read_csv('s_u_stations_fixed_with_keys_20230830.csv')  # Replace with the path to your database file
    df = df_station_mapping.copy()

    # create a dictionary where U/S bahn line names are the keys and the respective stations are the values incl. lat & lon
    output = {"station_key": [], "line": []}
    for idx, row in df.iterrows():
        line_split = row["lines"].split(", ")
        for i in line_split:
            output["station_key"].append(row["keys"])
            output["line"].append(i)
    station_to_line = pd.DataFrame(output)
    station_to_line = station_to_line.drop_duplicates()
    return station_to_line


def fuzz_flow(df_for_fuzzy_matching: pd.DataFrame, station_to_line: pd.DataFrame):
    # Chris Notes: Always write a short Docstring for your function, describe what it does and what is input and outputs are.
    """Docstring for this function, This function does x,y,z..."""
    # Chris Notes: We want to separate the individual cleaning steps into different functions and states. This makes it easier to track down errors.
    # Better to take the input from yannik and use it as a direct input into your function than to call his function agian.
    data3 = df_for_fuzzy_matching
    data3 = data3.copy()

    # Load STATIONS DATAFRAME
    station_to_line = station_to_line.copy()

    df = pd.read_csv(str(path_to_data) + "/s_u_stations_fixed_with_keys_20230830.csv")

    lines_un = list(station_to_line["line"].unique())
    stations_full = list(df["keys"].values)

    # Chris Notes: Better to be defined outside of function.
    def identify_station_precise(
        some_string, confidence_first=80, confidence_second=90
    ):
        res1 = None
        res2 = None
        if some_string[1][1] > confidence_second:
            res1 = some_string[1][0]
            return some_string[0][0], some_string[1][0]
        elif (
            some_string[0][1] > confidence_first
        ):  # try 79 or 89 and other, better less lines but better quality
            return some_string[0][0]
        return None

    # Chris Notes: Better to be defined outside of function.
    def station_finder(some_string):
        for line in lines_un:
            matches = re.search(r"{line}[^0-9]".format(line=line.lower()), some_string)
            if matches is not None:
                stations = list(
                    station_to_line[station_to_line["line"] == line]["station_key"]
                )
                out = process.extract(
                    some_string, stations, limit=2, scorer=fuzz.partial_ratio
                )
                return identify_station_precise(out, 70, 70)
        out = process.extract(
            some_string, stations_full, limit=2, scorer=fuzz.partial_ratio
        )
        return identify_station_precise(out)

    df_chat = data3[["date"]]

    df_chat["station_key"] = data3["text"].map(station_finder)
    df_chat["text"] = data3["text"]
    df_chat.dropna(subset="station_key", inplace=True)
    full_df = df_chat.merge(df, left_on="station_key", right_on="keys")
    full_df.drop(columns="Unnamed: 0", inplace=True)
    full_df.drop(columns="keys", inplace=True)
    full_df = full_df.sort_index(ascending=True)
    return full_df


In [21]:
data = pd.read_csv(str(path_to_data) + "/database_telegram.csv", low_memory=False)
data = data[data['group'] ==  'website'].iloc[-1:,:]
data

Unnamed: 0.1,Unnamed: 0,group,sender,text,date
118379,0,website,2023-09-04 07:27:37,Heerstraße,2023-09-04 07:27:37


In [22]:
df = data_preprocessing(data)
df

        Unnamed: 0    group               sender        text  \
118379           0  website  2023-09-04 07:27:37  Heerstraße   

                      date  
118379 2023-09-04 07:27:00  


Unnamed: 0,sender,text,date
0,2023-09-04 07:27:37,heerstr,2023-09-04 07:27:00


In [23]:
station_to_line = create_station_to_line_df(df_station_mapping)
df = fuzz_flow(df_for_fuzzy_matching=df, station_to_line=station_to_line)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_chat["station_key"] = data3["text"].map(station_finder)


Unnamed: 0,date,station_key,text,station name,lines,area,latitude,longitude
0,2023-09-04 07:27:00,heerstr,heerstr,Heerstraße,"S3, S9",Westend,52.508611,13.258611


In [24]:
def update_station_colors(from_date: str, to_date: str) -> pd.DataFrame:
    """This functions returns the Dataframe for the Map of the Streamlit App.
    It takes the input preprocessed Database that is filtered on user input date
    range and returns the reports form the relevant time period.
    All reported stations will appear red on the Map.
    The from and to dateformat ,e.g., from_date='2023-08-30 11:55:00'
    to_date='2023-08-30 12:01:00'."""
    # Read data from CSV files
    reports = pd.read_csv(
        str(path_to_main) + "/data/preprocessed_database_telegram.csv"
    )
    stations = pd.read_csv(str(path_to_main) + "/data/datanew_map2.csv")
    # Filter reports based on date
    reports = reports.copy()
    stations = stations.copy()
    reports_filtered = reports[
        (reports["date"] >= from_date) & (reports["date"] <= to_date)
    ]
    print(reports_filtered.iloc[-5:])
    # Loop through unique station names in the filtered reports
    for report_station in reports_filtered["station name"].unique():
        # Update the 'color' column for matching stations to '#FF0000'
        stations.loc[stations["station name"] == report_station, "color"] = "#FF0000"
    return stations

In [25]:
path_to_main = '/home/chris/OneDrive_christoph.bieritz/code/ticket-control-bvg/'

df_filtered_map = update_station_colors(
    from_date="2023-09-01 12:28:00",  # Insert Sliders Dates here!
    to_date="2023-09-10 10:28:00",  # Insert Sliders Dates here!
)

       Unnamed: 0                 date         station_key  \
60389          34  2023-09-01 22:58:00     frankfurter tor   
60390          35  2023-09-02 07:25:00          wollankstr   
60391          36  2023-09-02 22:15:00          hermannstr   
60392          37  2023-09-03 06:22:00  heidelberger platz   
60393          38  2023-09-04 07:27:00             heerstr   

                                                    text        station name  \
60389  m10 frankfurter tor m10 frankfurter tor m10 fr...     Frankfurter Tor   
60390  s wollankstr zweimal gleis s wollankstr zweima...       Wollankstraße   
60391  ring gerade s hermannstr 1x m ring gerade s he...       Hermannstraße   
60392  ringbahn s42 heidelberger platz 2x w ringbahn ...  Heidelberger Platz   
60393                                            heerstr          Heerstraße   

                             lines            area   latitude  longitude  
60389                           U5  Friedrichshain  52.515833  13.45416