# Analyze the usage of the Tool

In [None]:
# Packages needed by this notebook (-q keeps the pip output quiet);
# all other requirements will be installed with the application
%pip install -q pandas matplotlib seaborn user-agents

In [None]:
import sqlite3, os
from datetime import datetime
import pandas as pd             #db querys with visualization
import matplotlib.pyplot as plt # diagrams
import matplotlib.image as mpimg # images
import seaborn as sns           # Heatmaps etc.
from user_agents import parse   # Split OS. Browser etc.
import math                     #  calc sizes
import src.config as config
from  src.jupyter_helper import showBar, showImageGrid, showImage
# always show untruncated column content
pd.set_option("display.max_colwidth", None)

## Open Database Connection

In [None]:
# The database location is taken from the project configuration:
# load the configuration first, then open the analytics SQLite database it points to.
# Check that the path in your config file fits the database you want to analyze.
config.read_configuration()
connection = sqlite3.connect(config.get_analytics_db_path())
# NOTE(review): the cells in this notebook query through `connection` (pandas);
# `cursor` is only needed for manual ad-hoc statements.
cursor = connection.cursor()

### Read and prepare Session Data
This section must be executed for all analyses regarding location, operating systems, and usage time.

Here we first read all data into memory and then analyze it. This is more performant than ad-hoc queries, but ad-hoc queries might be better if the server is running in parallel.

In [None]:
# Load all sessions (ordered by timestamp) and derive the helper columns
# "Datetime" (parsed timestamp) and "Day" (weekday name) used by later cells.
query = "select * from tblSessions order by timestamp"
df_sessions = pd.read_sql_query(query, connection).assign(
    Datetime=lambda frame: pd.to_datetime(frame["Timestamp"]),
    Day=lambda frame: frame["Datetime"].dt.day_name(),
)
print(f"{len(df_sessions)} sessions total")

### Read and Prepare Output and Generations

In [None]:
# Load all generations and derive two helper columns:
#   "Date"      - generation day as YYYYMMDD (used as the cache sub-folder name)
#   "InputFile" - path of the cached input image: <cache>/<Date>/<Input_SHA1>.jpg
query = "select * from tblGenerations"
df_generations = pd.read_sql_query(query, connection)
df_generations['Date'] = df_generations["Timestamp"].apply(lambda ts: datetime.strptime(ts,"%Y-%m-%d %H:%M:%S").strftime("%Y%m%d"))
cache_dir = config.get_cache_folder()
# Built with a comprehension instead of DataFrame.apply(axis=1): on an empty
# table apply(axis=1) returns a DataFrame (not a Series), which cannot be
# assigned to a single column and would make this cell fail.
df_generations['InputFile'] = [
    os.path.join(cache_dir, day, sha1 + ".jpg")
    for day, sha1 in zip(df_generations["Date"], df_generations["Input_SHA1"])
]
print(f"{len(df_generations)} generations total")

### Save the Session data as HTML and CSV

In [None]:
# Export the session data to CSV & HTML if required.
# The target folder is created when it does not exist yet, so the export no
# longer fails on a fresh checkout. The date stamp is computed once so both
# files always share the same name prefix.
# NOTE: the export contains raw session data - review it before sharing.
export_dir = './.local'
os.makedirs(export_dir, exist_ok=True)
export_stamp = datetime.now().strftime("%Y%m%d")
df_sessions.to_csv(f'{export_dir}/sessions-{export_stamp}.csv', index=False)
df_sessions.to_html(f'{export_dir}/sessions-{export_stamp}.html', index=False)

In [None]:
# Export the generation data to CSV & HTML if required.
# The target folder is created when it does not exist yet, so the export no
# longer fails on a fresh checkout. The date stamp is computed once so both
# files always share the same name prefix.
# NOTE: the export contains user prompts - review it before sharing.
export_dir = './.local'
os.makedirs(export_dir, exist_ok=True)
export_stamp = datetime.now().strftime("%Y%m%d")
df_generations.to_csv(f'{export_dir}/generations-{export_stamp}.csv', index=False)
df_generations.to_html(f'{export_dir}/generations-{export_stamp}.html', index=False)

## Analyze sources of the Queries

**Filter by date (optional!)**
if it is useful for your analysis

In [None]:
# # Year-Month-Day Hours:Minutes:Seconds
# start_date = "2025-01-01 00:00:00"
# end_date = "2025-12-31 23:59:59"
# query = f"select * from tblSessions where Timestamp between '{start_date}' and '{end_date}'"
# df_sessions = pd.read_sql_query(query, connection)
# df_sessions['Datetime'] = pd.to_datetime(df_sessions["Timestamp"])
# df_sessions['Day'] = df_sessions["Datetime"].dt.day_name()
# df_sessions.tail()

### Where do the users come from

#### Prepare Dataset for this section

Hint: the "Read and prepare Session Data" section must be executed first!



In [None]:
# Count sessions per (Continent, Country, City).
# Missing location values are replaced BEFORE grouping: pandas' groupby drops
# rows whose group key is NaN by default, so filling afterwards (as done
# previously) never had an effect and sessions without location data were
# silently excluded from the counts.
location_columns = ["Continent", "Country", "City"]
df_grouped_location = (
    df_sessions[location_columns]
    .fillna("unkonwn")
    .groupby(location_columns)
    .size()
    .reset_index(name="SessionCount")
)
df_grouped_location.head(10)

Sample of an ad-hoc query that delivers the same content as used below.
Use it only if you need special data or relations, or if your amount of data is very large.

In [None]:
# query = "select Continent, Country, City, count(1) as SessionCount from tblSessions group by Continent, Country, City"
# df_where = pd.read_sql_query(query, connection)
# df_where.head(10)

In [None]:
# Sum the per-city counts up to country level, largest first, and plot them.
df = (
    df_grouped_location
    .groupby("Country")["SessionCount"]
    .sum()
    .reset_index()
    .sort_values(by="SessionCount", ascending=False)
)
showBar(df=df, title="Sessiony by Country", x_column="Country", y_column="SessionCount")

Select a country to get more details about the cities

In [None]:
# Country to drill into; must match a value of the "Country" column exactly.
selected_country = "The Netherlands"
is_selected = df_grouped_location["Country"] == selected_country
df = df_grouped_location.loc[is_selected].sort_values(by="SessionCount", ascending=False)
showBar(df=df, title=f"Sessions by City in {selected_country}", x_column="City")

### At which times do the users access the system?

In [None]:
# Ad-hoc query: number of sessions per calendar day, oldest day first.
# The result is only previewed here; the next cell assigns
# df_time_distribution again with the weekday/hour distribution.
query = """
select strftime('%Y-%m-%d', Timestamp) as Date, Count(*) as SessionCount
from tblSessions
Group by Date
Order by Date
"""

df_time_distribution = pd.read_sql_query(sql=query, con=connection)
df_time_distribution.head()

Day of Week and Time

In [None]:
# Count sessions per (weekday, hour of day).
# NOTE: this adds the "Hour" column to df_sessions itself.
df_sessions["Hour"] = df_sessions["Datetime"].dt.hour
df_time_distribution = (
    df_sessions
    .groupby(["Day", "Hour"])
    .size()
    .reset_index(name="SessionCount")
)
df_time_distribution.head()

In [None]:
# Heatmap of session counts: one row per weekday, one column per hour of day.
heatmap = df_time_distribution.pivot_table(index="Day", columns="Hour", values="SessionCount", aggfunc="sum", fill_value=0)
ordered_days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
# Force calendar order; weekdays without any session get 0 (same as missing
# hours in the pivot) instead of NaN.
heatmap = heatmap.reindex(ordered_days, fill_value=0)
sns.heatmap(heatmap, cmap="coolwarm", annot=True, fmt="g")
plt.title("Heatmap of Sessions by Hour and Weekday")
# xlabel/ylabel are functions and must be CALLED. The previous assignment
# (`plt.xlabel = "..."`) set no label and replaced the pyplot function with a
# string, so any later plt.xlabel(...)/plt.ylabel(...) call in the same kernel
# would fail.
plt.xlabel("Hour of Day")
plt.ylabel("Day of Week")
plt.tight_layout()
plt.show()

### Browser, Languages and Operating Systems

In [None]:
# Number of sessions per operating system, most frequent first.
df = (
    df_sessions
    .groupby("OS")
    .size()
    .reset_index(name="SessionCount")
    .sort_values(by="SessionCount", ascending=False)
)
showBar(df, "Sessions by OS", "OS", "SessionCount")

How many Mobile Devices

In [None]:
# Share of desktop vs. mobile sessions.
counts = df_sessions['IsMobile'].value_counts()
print(counts)
# value_counts() orders by frequency, not by value, so labels and colors must
# be derived from the index. A fixed ['Desktop', 'Mobile'] list mislabels the
# slices whenever mobile sessions are the majority and does not match the data
# when only one category is present.
device_styles = {
    0: ('Desktop', 'lightblue'),   # 0 / False = Desktop
    1: ('Mobile', 'lightgreen'),   # 1 / True  = Mobile
}
slice_styles = [device_styles.get(value, (str(value), 'lightgray')) for value in counts.index]
counts.plot.pie(
    labels=[label for label, _ in slice_styles],
    autopct='%1.1f%%',  # Format (Percentage)
    startangle=90,  # Start angle
    colors=[color for _, color in slice_styles]
)
plt.title("Desktop vs. Mobile devices")
# Call the label functions (assigning to plt.ylabel/plt.xlabel would overwrite
# them with strings instead of clearing the axis labels).
plt.ylabel("")
plt.xlabel("")
plt.show()

Analyze by Languages (important if location is not available)

In [None]:
# Number of sessions per browser language, most frequent first.
df = (
    df_sessions
    .groupby("Language")
    .size()
    .reset_index(name="SessionCount")
    .sort_values(by="SessionCount", ascending=False)
)
showBar(df, x_column="Language", title="Number of Sessions by Languages")


## Analyze the Images

### Analyze

In [None]:
# Preview the last rows of the generations table
# (presumably the most recent entries - the query has no explicit ORDER BY)
df_generations.tail()

Which Style was used

In [None]:
# Number of generations per style, most used first.
# (The count column is named "SessionCount" like in the other charts.)
df = (
    df_generations
    .groupby("Style")
    .size()
    .reset_index(name="SessionCount")
    .sort_values(by="SessionCount", ascending=False)
)
showBar(df, x_column="Style", title="Usage of Styles")


Average generations per input image

In [None]:
# Generations per input image (identified by its SHA1), highest count first.
df = (
    df_generations
    .groupby("Input_SHA1")
    .size()
    .reset_index(name="SessionCount")
    .sort_values(by="SessionCount", ascending=False)
)
# The reported average is rounded UP to the next whole number.
average = df["SessionCount"].mean()
print(f"Average generation count per Input is {math.ceil(average)}")
showBar(
    df,
    x_column="Input_SHA1",
    title="Generations per Image",
    show_x_values=False,
)

Top 5 Source images with amount of generations

In [None]:
# The five input images with the most generations; the grid uses the
# generation count as the name of each image.
df = (
    df_generations
    .groupby("InputFile")
    .size()
    .reset_index(name="SessionCount")
    .sort_values(by="SessionCount", ascending=False)
    .head(5)
)
showImageGrid(df, "InputFile", name_column="SessionCount")
df

## Analyze a dedicated Image or generation Session

Styles and Prompts used by the top generator

**TASK**: copy InputSHA1 from above

In [None]:
# Paste the Input_SHA1 of the image you want to inspect.
SHA1="add-here"
#--------------------
# All generations that were created from this input image.
# `df` is reused by the following image-grid cell.
df = df_generations[(df_generations["Input_SHA1"] == SHA1)]
for index, row in df.iterrows():
    # Single quotes inside the f-string: reusing the outer double quotes is a
    # SyntaxError on Python versions before 3.12.
    print(f"{row['Session']} - {row['Style']}\t\t{row['Userprompt']}")


Find prompts used for an image and relevant images.

**TASK**: Execute the Statement above to generate the table

In [None]:
# Show the generated images of the rows selected in the previous cell
# (`df` must still hold that selection), named by style and described by prompt.
grid_columns = {
    "path_column": "Output",
    "name_column": "Style",
    "descr_column": "Userprompt",
}
showImageGrid(df=df, **grid_columns)
df.tail()

Find details about the user by using the Session

**TASK**: copy session from output above and add to "Session" variable

In [None]:
# Paste the session id from the output above.
# The following cells reuse this `Session` value.
Session = "add-here"

# don't change this
client_columns = ["OS", "Browser", "Language"]
df = df_sessions.loc[df_sessions["Session"] == Session, client_columns]
df.head()

All images uploaded by same author (Session)

**TASK**: copy session from output above and add to "Session" variable

In [None]:
# Uses the `Session` value set in the cell above; uncomment to override it here.
#Session = "add-here"

# don't change this
# Count the generations per uploaded input image of this session, most used first.
where = df_generations["Session"] == Session
df = (
    df_generations.loc[where]
    .groupby(["Input_SHA1", "InputFile"])
    .size()
    .reset_index(name="SessionCount")
    .sort_values(by="SessionCount", ascending=False)
)
showImageGrid(df, "InputFile", name_column="Input_SHA1")
df.head(15)

Show all generated images of this user

**TASK**: copy session from output above and add to "Session" variable

In [None]:
# Uses the `Session` value set further above; uncomment to override it here.
#Session = "add-here"

# show all generated images of this user
# don't change this
output_columns = ["Timestamp", "Style", "Userprompt", "Output"]
df = (
    df_generations.loc[df_generations["Session"] == Session, output_columns]
    .sort_values(by="Timestamp")
)
showImageGrid(
    df=df,
    path_column="Output",
    name_column="Style",
    descr_column="Userprompt",
)
df
