In [None]:
import pandas as pd
import json
import numpy as np
from matplotlib import pyplot as plt
import os

In [None]:
 general_path = 'D:/DATA_THESIS/LarField'
 folder_name = 'userData'
path_to_folder = 'D:/DATA_THESIS/Projekt_nw_AG_AJ/Outputs_project_data_preprocessing'


In [None]:
def read_csv_from_dir(dir_name):
    """
    Load every tab-separated file of a directory into one DataFrame.

    The time column is ``timestamp`` when the data has one, otherwise
    ``queryStart``.  Its values are split on the literal ``"T"``: the first
    piece becomes the new ``date`` column and the second piece the new
    ``time`` column.  Rows with a missing time value get missing
    ``date``/``time`` values instead of aborting the whole load.

    :param dir_name: directory with all user_data files, e.g., activity
    :return: DataFrame with all files sorted by timestamp (or queryStart),
        with a fresh 0..n-1 index
    :raises KeyError: if the loaded data has neither a ``timestamp`` nor a
        ``queryStart`` column (an empty directory ends up here as well)
    """
    # Read everything first and concatenate once: growing a DataFrame inside
    # the loop re-copies all previously loaded rows on every iteration.
    # Sorting the names makes the read order independent of os.listdir.
    frames = [
        pd.read_csv(os.path.join(dir_name, filename), sep="\t")
        for filename in sorted(os.listdir(dir_name))
    ]
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Choose the time column explicitly instead of relying on a bare `except:`
    # that also hid unrelated failures (bad values, typos, interrupts).
    time_column = next(
        (name for name in ("timestamp", "queryStart") if name in df.columns),
        None,
    )
    if time_column is None:
        raise KeyError(
            f"No 'timestamp' or 'queryStart' column in the files of {dir_name!r}"
        )

    # Same split as before ("<date>T<time>"), done column-wise so that missing
    # values simply stay missing.
    pieces = df[time_column].str.split("T")
    df["date"] = pieces.str[0]
    df["time"] = pieces.str[1]

    return df.sort_values(by=time_column).reset_index(drop=True)

In [None]:
def location_analysis(df):
    """
    Count the location records of each day.

    :param df: DataFrame with ``date`` and ``location`` columns
    :return: DataFrame with one row per date and the columns ``date`` and
        ``count``; ``count`` is the number of rows of that date, obtained by
        counting per (date, location) pair and adding the pairs up per date
    """
    # Step 1: how often does every (date, location) pair occur?
    pair_counts = (
        df.groupby(['date', 'location'])
        .size()
        .rename('count')
        .reset_index()
    )
    # Step 2: collapse the pairs to one total per date.
    return pair_counts.groupby('date', as_index=False)['count'].sum()

In [None]:
def deviceinfo_analysis(df):
    """
    Print every record in which localization was reported as disabled.

    For each row whose ``localizationEnabled`` equals ``False`` the row's
    ``timestamp`` is printed.  If the following row has localization enabled,
    that row's timestamp is printed too; if the disabled row is the last row,
    ``End`` is printed instead.

    :param df: DataFrame with ``localizationEnabled`` and ``timestamp``
        columns; rows are processed in their stored order
    :return: None (the findings are only printed)
    """
    enabled = df['localizationEnabled']
    last_position = len(enabled) - 1

    # Rows are addressed by position (.iloc): the previous label look-ups
    # (df[col][i]) only worked when the index was exactly 0..n-1.
    for position in range(len(enabled)):
        # Equality rather than truthiness on purpose, so that missing values
        # are not reported as "disabled".
        if enabled.iloc[position] == False:  # noqa: E712
            print("FALSE: ", df['timestamp'].iloc[position])
            if position == last_position:
                # Explicit end-of-data check instead of a bare `except:`
                # around an out-of-range look-up.
                print('End')
            elif enabled.iloc[position + 1] == True:  # noqa: E712
                print("\tNEXT ON: ", df['timestamp'].iloc[position + 1])

In [None]:
# Read the location records of every user into one DataFrame.
# Directory layout: <general_path>/<iteration>/<folder_name>/<user>/location/
location_frames = []
skipped = 0  # users that have no 'location' folder
# `iteration` instead of `iter`: the old loop variable replaced the builtin
# iter() for the rest of the kernel session.
for iteration in os.listdir(general_path):
    users_root = os.path.join(general_path, iteration, folder_name)
    for user in os.listdir(users_root):
        user_dir = os.path.join(users_root, user)
        if 'location' in os.listdir(user_dir):
            location_csv = read_csv_from_dir(os.path.join(user_dir, 'location'))
            location_csv['patient_id'] = user
            location_frames.append(location_csv)
        else:
            print("No LOCATION folder, skip id: ", user, 'iter: ', iteration)
            skipped += 1

# Concatenate once at the end; concatenating inside the loop copies all rows
# collected so far for every additional user.
all_locations = (
    pd.concat(location_frames, ignore_index=True)
    if location_frames
    else pd.DataFrame()
)

# Example of one location directory:
# 'D:/DATA_THESIS/LarField/i_01/userData/0Uvl9keeDVbkh69satbc6wTSbe23/location'

In [None]:
# Count the location records per day for every user.
# Directory layout: <general_path>/<iteration>/<folder_name>/<user>/location/
stats_frames = []
skipped = 0  # users that have no 'location' folder
# `iteration` instead of `iter`: the old loop variable replaced the builtin
# iter() for the rest of the kernel session.
for iteration in os.listdir(general_path):
    users_root = os.path.join(general_path, iteration, folder_name)
    for user in os.listdir(users_root):
        user_dir = os.path.join(users_root, user)
        if 'location' in os.listdir(user_dir):
            location_csv = read_csv_from_dir(os.path.join(user_dir, 'location'))
            only_stats = location_analysis(location_csv)
            only_stats['patient_id'] = user
            stats_frames.append(only_stats)
        else:
            print("No LOCATION folder, skip id: ", user, 'iter: ', iteration)
            skipped += 1

# Concatenate once at the end; concatenating inside the loop copies all rows
# collected so far for every additional user.
all_locations_stats = (
    pd.concat(stats_frames, ignore_index=True)
    if stats_frames
    else pd.DataFrame()
)

# Example of one location directory:
# 'D:/DATA_THESIS/LarField/i_01/userData/0Uvl9keeDVbkh69satbc6wTSbe23/location'

In [None]:
# Print, for every user, the records in which localization was disabled.
# Directory layout: <general_path>/<iteration>/<folder_name>/<user>/deviceInfo/
# Stays empty: deviceinfo_analysis only prints and returns nothing to collect.
all_deviceinfo_stats = pd.DataFrame()
skipped = 0  # users that have no 'deviceInfo' folder
# `iteration` instead of `iter`: the old loop variable replaced the builtin
# iter() for the rest of the kernel session.
for iteration in os.listdir(general_path):
    users_root = os.path.join(general_path, iteration, folder_name)
    for user in os.listdir(users_root):
        user_dir = os.path.join(users_root, user)
        if 'deviceInfo' in os.listdir(user_dir):
            # Read from exactly the folder name that was just found; the old
            # lower-case 'deviceinfo' path fails on case-sensitive file systems.
            deviceinfo_csv = read_csv_from_dir(os.path.join(user_dir, 'deviceInfo'))
            print(iteration, user)
            deviceinfo_analysis(deviceinfo_csv)
        else:
            print("No deviceinfo folder, skip id: ", user, 'iter: ', iteration)
            skipped += 1



In [None]:
# Show the deviceInfo frame left over from the last user the loading loop read.
deviceinfo_csv

In [None]:
# Users skipped by whichever loading loop ran last (every loop resets the counter to 0).
skipped

In [None]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of keeping it as a column.
all_locations_stats =all_locations_stats.reset_index(drop = True)

In [None]:
# Display the per-day record counts collected for all users.
all_locations_stats

# SAVE

In [None]:
# Persist the per-day counts. The target folder is created first so the write
# does not fail on a machine where '5_locations' does not exist yet.
os.makedirs(f'{path_to_folder}/5_locations', exist_ok=True)
all_locations_stats.to_pickle(f'{path_to_folder}/5_locations/number_of_locations_for_each_day.pkl')

In [None]:
# Display the combined location records of all users.
all_locations

# Convert H3 cell indexes to longitude and latitude

In [None]:
import h3
from shapely.geometry import Point
import geopandas as gpd
import geoplot as gplt

def convert_to_coordinates(dataframe=None):
    """
    Add the centre coordinates of each row's H3 cell as ``lon``/``lat`` columns.

    :param dataframe: DataFrame whose ``location`` column holds H3 cell
        indexes; when omitted, the module-level ``all_locations`` is used
        (looked up at call time, not when this cell is executed)
    :return: a new DataFrame with the original columns plus ``lon`` and
        ``lat`` (decimal degrees); the input frame is left unmodified
    """
    if dataframe is None:
        dataframe = all_locations

    # h3 v4 renamed h3_to_geo to cell_to_latlng; use whichever is installed.
    to_latlng = getattr(h3, "cell_to_latlng", None) or h3.h3_to_geo

    # Both functions return (lat, lng) in that order. The previous code stored
    # element 0 as 'lon' and element 1 as 'lat', i.e. swapped the coordinates.
    centres = [to_latlng(cell) for cell in dataframe['location']]

    # assign() returns a new frame, so the caller's DataFrame no longer gains
    # 'geometry'/'lon'/'lat' columns as a side effect.
    return dataframe.assign(
        lon=[lng for _, lng in centres],
        lat=[lat for lat, _ in centres],
    )

In [None]:
# Derive 'lon' and 'lat' columns from the 'location' column of all location records.
df_with_geodata = convert_to_coordinates(all_locations)

In [None]:
# Renumber the rows 0..n-1 (the old index is dropped), then display the frame.
df_with_geodata = df_with_geodata.reset_index(drop = True)
df_with_geodata

# Plot results 

In [None]:
# Plot location records as points.
def plot_location(df_all=None):
    """
    Draw the given location records as a point plot.

    :param df_all: DataFrame with ``lon`` and ``lat`` columns; when omitted,
        the module-level ``df_with_geodata`` is used (looked up at call time,
        so a frame rebuilt after this cell ran is picked up)
    :return: None (the figure is shown)
    """
    if df_all is None:
        df_all = df_with_geodata

    # Point takes (x, y), i.e. (longitude, latitude).
    geometry = [Point(lon, lat) for lon, lat in zip(df_all['lon'], df_all['lat'])]
    geo_df = gpd.GeoDataFrame(df_all, geometry=geometry, crs="EPSG:4326")

    # NOTE(review): no `hue` is passed, so `cmap` and the legend settings have
    # nothing to colour by - confirm against the installed geoplot version
    # whether they are ignored or rejected.
    ax = gplt.pointplot(
        geo_df,
        cmap='viridis',
        legend=True,
        legend_kwargs={'bbox_to_anchor': (1, 1), 'title': 'User Column'},
    )

    ax.set_title("Track of 1 patient", fontsize=16)

    plt.show()

In [None]:
# Pick one patient (position 81 among the distinct patient ids) and one of
# that patient's days (position 5 among the distinct dates), then plot it.
geodata_patient_ids = df_with_geodata['patient_id'].unique()
one_patient = df_with_geodata.loc[df_with_geodata['patient_id'] == geodata_patient_ids[81]]

one_patient_days = one_patient['date'].unique()
one_patient_one_day = one_patient.loc[one_patient['date'] == one_patient_days[5]]

plot_location(one_patient_one_day)

In [None]:
# Display the single-patient, single-day subset.
one_patient_one_day

In [None]:
import geopandas as gpd
import geoplot.crs as gcrs
import matplotlib.pyplot as plt
from shapely.geometry import Point

# Plot location records on top of the Polish administrative boundaries.
def plot_location(df_all=None,
                  boundaries_path='poland_administrative_boundaries_level6_counties_polygon.geojson'):
    """
    Draw the given location records as points over a boundary map.

    :param df_all: DataFrame with ``lon`` and ``lat`` columns; when omitted,
        the module-level ``df_with_geodata`` is used (looked up at call time)
    :param boundaries_path: file with the boundary polygons, read with
        ``geopandas.read_file``; defaults to the county-level GeoJSON file
        name this notebook has always used
    :return: None (the figure is shown)
    """
    if df_all is None:
        df_all = df_with_geodata

    # Point takes (x, y), i.e. (longitude, latitude).
    geometry = [Point(lon, lat) for lon, lat in zip(df_all['lon'], df_all['lat'])]
    geo_df = gpd.GeoDataFrame(df_all, geometry=geometry, crs="EPSG:4326")

    poland_data = gpd.read_file(boundaries_path)

    # Figure size in inches.
    width = 10
    height = 8

    # polyplot creates the figure and returns its axes. Passing that axes to
    # pointplot draws the points on the same map; before, every call opened a
    # figure of its own (plus two unused ones from plt.figure() and
    # poland_data.plot()), so map and points never shared a plot.
    ax = gplt.polyplot(poland_data, projection=gcrs.PlateCarree(), figsize=(width, height))
    gplt.pointplot(geo_df, ax=ax)

    # Customize plot properties on the shared axes.
    ax.set_title("Locations of Patients in Poland")
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")

    plt.show()



In [None]:
# Draw the single-patient, single-day subset with the currently defined plot_location.
plot_location(one_patient_one_day)

# Count statistics

In [None]:
def location_analysis(df=None):
    """
    Count the records of every (date, location) pair.

    :param df: DataFrame with ``date``, ``location`` and ``timestamp``
        columns; when omitted, the module-level ``location_csv`` is used
        (looked up at call time instead of being frozen when the cell runs)
    :return: DataFrame with the columns ``date``, ``location`` and ``count``,
        where ``count`` is the number of non-missing ``timestamp`` values of
        the pair
    """
    if df is None:
        df = location_csv

    # Count only the column that is kept; the previous version counted every
    # column of the frame and then discarded all but one.
    return (
        df.groupby(by=["date", "location"])["timestamp"]
        .count()
        .reset_index(name="count")
    )

In [None]:
# Run the currently defined location_analysis on the selected patient and display the result.
loc_1_pr = location_analysis(one_patient)
loc_1_pr

In [None]:
# Number of distinct values in the 'location' column (a missing value, if any, counts as one).
loc_1_pr['location'].nunique(dropna=False)

In [None]:
def location_analysis(df=None):
    """
    Count how many records each location has on each date.

    :param df: DataFrame with ``date``, ``location`` and ``timestamp``
        columns; when omitted, the module-level ``location_csv`` is used
        (resolved when the function is called, not when it is defined)
    :return: DataFrame with the columns ``date``, ``location`` and ``count``;
        ``count`` is the number of non-missing ``timestamp`` values per pair
    """
    if df is None:
        df = location_csv

    # Select the counted column before aggregating, so only one column is
    # counted instead of all of them.
    timestamps_per_pair = df.groupby(by=['date', "location"])["timestamp"].count()
    return timestamps_per_pair.reset_index(name="count")

In [None]:
def location_analysis(df=None):
    """
    Count the records of every location.

    :param df: DataFrame with ``location`` and ``date`` columns; when omitted,
        the module-level ``location_csv`` is used (looked up at call time)
    :return: DataFrame with the columns ``location`` and ``count``, where
        ``count`` is the number of non-missing ``date`` values of the location
    """
    if df is None:
        df = location_csv

    # The previous version selected the columns as ["date", "location"] and
    # then renamed them to ["location", "count"], so the counts ended up under
    # "location" and the location ids under "count". Naming the counted
    # series directly keeps labels and values together.
    return df.groupby(by=["location"])["date"].count().reset_index(name="count")

In [None]:
def location_analysis(df):
    """
    Total number of location records per date.

    :param df: DataFrame with ``date`` and ``location`` columns
    :return: DataFrame with the columns ``date`` and ``count``, one row per
        date; the total is built by counting each (date, location) pair and
        summing those counts within every date
    """
    occurrences = df.groupby(by=['date', 'location']).size()
    per_pair = occurrences.to_frame('count').reset_index()

    daily_totals = per_pair.groupby('date')['count'].sum()
    return daily_totals.reset_index()

In [None]:
# Recompute the statistics with the currently defined location_analysis and
# display them ordered by ascending count (sort_values returns a sorted copy).
loc_1_pr = location_analysis(one_patient)
loc_1_pr.sort_values('count')

In [None]:
# Display the statistics in their stored row order.
loc_1_pr

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_most_visited_place(one_person_data):
    """
    Plot at which hours of the day a person was at their most visited place.

    The most visited place is the ``location`` value with the most rows (the
    first one in sorted order on a tie).  Its name is printed and a histogram
    of the hour component of the matching ``timestamp`` values is shown.

    :param one_person_data: DataFrame with ``location`` and ``timestamp``
        columns; ``timestamp`` may hold datetimes or strings that
        ``pandas.to_datetime`` can parse
    :return: None
    """
    # Number of rows per location.
    location_counts = one_person_data.groupby('location').size()
    if location_counts.empty:
        # idxmax() raises on empty input; report and stop instead.
        print("No location records to analyse.")
        return

    most_visited_place = location_counts.idxmax()
    print("Most visited place:", most_visited_place)

    # Keep only the rows recorded at the most visited place.
    person_most_visited_place = one_person_data[one_person_data['location'] == most_visited_place]

    # The timestamps are loaded from CSV as text, and the .dt accessor only
    # works on datetime values, so convert first (a no-op for datetimes).
    hours = pd.to_datetime(person_most_visited_place['timestamp']).dt.hour

    plt.figure(figsize=(10, 6))
    # Explicit edges 0..24 give one bar per clock hour. bins=24 would instead
    # cut the observed min..max range into 24 slices that do not line up with
    # the hour ticks.
    plt.hist(hours, bins=range(25), color='skyblue', edgecolor='black', align='left')
    plt.xlabel('Hour of the Day')
    plt.ylabel('Frequency')
    plt.title(f'Hours in the Day at {most_visited_place}')
    plt.xticks(range(0, 24))
    plt.grid(True, axis='y')
    plt.show()

# Select one person's records and plot their most visited place.
# `person_id` is not used for the selection below: the person is chosen by
# position (111) among the distinct patient ids.
person_id = 1  # Specify the person's ID
location_patient_ids = all_locations['patient_id'].unique()
one_person_data = all_locations.loc[all_locations['patient_id'] == location_patient_ids[111]]

# Print the most visited place and show the hours of the day spent there.
plot_most_visited_place(one_person_data)
