# Data Analysis

In [41]:
# Load csv
import pandas as pd
data = pd.read_csv('Candles_friends_2025.csv')


In [42]:
# Take only 2025 entries

data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)
data = data[data['Date'] >= '01-01-2025']
data


Unnamed: 0,Date,Time of day,What,Yvet,Wybren,Ada-Mats,Jonas,Valgeir,Arriën,Julian,...,Christian,Alex,Pauline,Eirikur,Helena,Simon M,Victor,Ceci,Simone M,Bastien
1,2025-01-10,Evening,Arriën visits sick Alejandra,,,,,,0.0,,...,,,,,,,,,,
2,2025-01-12,Evening,Dahl dinner winter mode,1.0,1.0,0.0,0.0,0.0,,,...,,,,,,,,,,
3,2025-01-14,Morning,Breakfast with Julian,,,,,,,0.0,...,,,,,,,,,,
4,2025-01-16,Evening,Post climbing,1.0,,,,,,,...,,,,,,,,,,
5,2025-01-23,Evening,Post climbing,1.0,,,,,,,...,,,,,,,,,,
6,2025-01-25,Evening,Dinner Migas+Mousaka,2.0,2.0,,,,,,...,,,,,,,,,,
7,2025-01-28,Evening,Post climbing,2.0,,,,,,,...,,,,,,,,,,
8,2025-02-06,Evening,Post climbing,2.0,2.0,,,,,,...,,,,,,,,,,
9,2025-02-07,Evening,Post Shostakovich concert,,,,,,,,...,,,,,,,,,,
10,2025-02-08,Evening,House warming 1,,,,,,,,...,,,,,,,,,,


## How many candles burnt

In [43]:
# For every numerical column, take the maximum value and print it
numerical_columns = data.select_dtypes(include=['number']).columns
for column in numerical_columns:
    max_value = data[column].max()
    print(f'Maximum value in column {column}: {max_value}')


# Add all the maximum values together and print the result
total_max = sum(data[column].max() for column in numerical_columns)
print(f'Total of maximum values: {total_max}')

# Number of candles x 5.5 hours/ candle
total_candles = total_max
total_hours = total_candles * 5.5
print(f'Total hours of candle burning: {total_hours} hours')

Maximum value in column Yvet: 5.0
Maximum value in column Wybren: 4.0
Maximum value in column Ada-Mats: 1.0
Maximum value in column Jonas: 0.0
Maximum value in column Valgeir: 1.0
Maximum value in column Arriën: 2.0
Maximum value in column Julian: 0.0
Maximum value in column Sophia: 0.0
Maximum value in column Jakub: 4.0
Maximum value in column Rhea: 1.0
Maximum value in column Augustin: 0.0
Maximum value in column Akhil-Shaima: 1.0
Maximum value in column Sayantan-Annali: 1.0
Maximum value in column Sanser: 1.0
Maximum value in column Kuba: 0.0
Maximum value in column Simon salsa: 0.0
Maximum value in column Arzu: 0.0
Maximum value in column Weeyarut: 0.0
Maximum value in column Sebastian: 0.0
Maximum value in column Christian: 0.0
Maximum value in column Alex: 0.0
Maximum value in column Pauline: 2.0
Maximum value in column Eirikur: 0.0
Maximum value in column Helena: 0.0
Maximum value in column Simon M: 0.0
Maximum value in column Victor: 0.0
Maximum value in column Ceci: 0.0
Maximu

## Type of events

In [44]:
# Events are the values in the 'What' column. Make a list of non unique events
events = data['What'].tolist()
print(f'List of events: {events}')

List of events: ['Arriën visits sick Alejandra', 'Dahl dinner winter mode', 'Breakfast with Julian', 'Post climbing', 'Post climbing', 'Dinner Migas+Mousaka', 'Post climbing', 'Post climbing', 'Post Shostakovich concert', 'House warming 1', 'Breakfast with Julian', 'Gnocchi pumpkin dinner', 'Jakub at home', 'Goodbye Rhea', 'Breakfast with Julian', 'After sports', 'Sunday family lunch', 'Post climbing + Alex goodbye', 'Risotto with Sara and Ana', 'Post climbing', 'Post climbing', 'Post La Femme chilling', 'Masive brunch', 'Post climbing', 'Post climbing couples', 'Pauline visits', 'Sunday lasagna + karaoke before summer holidays', 'Goodbye summer', 'Family lunch', 'Italian dinner', 'Reopening winter mode with risotto', 'After boulder bourguignon', 'Dinner with Ada with focaccia', "Giulio's 30 birthday"]


In [45]:
from collections import Counter

# Combine all events into one string and split into words
all_text = ' '.join(events).lower()
words = all_text.split()

# Count word frequency
word_frequency = Counter(words)
print(f'\nWord frequency:\n{word_frequency.most_common(20)}')  # Top 20 words


Word frequency:
[('post', 11), ('climbing', 9), ('with', 7), ('dinner', 5), ('breakfast', 3), ('julian', 3), ('goodbye', 3), ('visits', 2), ('winter', 2), ('mode', 2), ('after', 2), ('sunday', 2), ('family', 2), ('lunch', 2), ('+', 2), ('risotto', 2), ('summer', 2), ('arriën', 1), ('sick', 1), ('alejandra', 1)]


In [46]:
import nltk
from nltk.util import ngrams

# Extract and count bigrams (2-word sequences)
bigrams = list(ngrams(words, 2))
bigram_frequency = Counter(bigrams)
print(f'\nBigram frequency:\n{bigram_frequency.most_common(20)}')  # Top 20 bigrams


Bigram frequency:
[(('post', 'climbing'), 9), (('climbing', 'post'), 6), (('breakfast', 'with'), 3), (('with', 'julian'), 3), (('winter', 'mode'), 2), (('family', 'lunch'), 2), (('arriën', 'visits'), 1), (('visits', 'sick'), 1), (('sick', 'alejandra'), 1), (('alejandra', 'dahl'), 1), (('dahl', 'dinner'), 1), (('dinner', 'winter'), 1), (('mode', 'breakfast'), 1), (('julian', 'post'), 1), (('climbing', 'dinner'), 1), (('dinner', 'migas+mousaka'), 1), (('migas+mousaka', 'post'), 1), (('post', 'shostakovich'), 1), (('shostakovich', 'concert'), 1), (('concert', 'house'), 1)]


## Time analysis

In [47]:
# Events per month
data['Month'] = data['Date'].dt.to_period('M')
events_per_month = data.groupby('Month').size()
print(f'\nEvents per month:\n{events_per_month}')

# Events per day of the week. Print the counts for each day and the date names
events_per_day = data['Date'].dt.day_name().value_counts()
print(f'\nEvents per day of the week:\n{events_per_day}')


Events per month:
Month
2025-01    7
2025-02    7
2025-03    4
2025-04    3
2025-05    5
2025-07    1
2025-09    2
2025-10    1
2025-11    4
Freq: M, dtype: int64

Events per day of the week:
Date
Thursday     9
Tuesday      7
Friday       6
Sunday       6
Saturday     5
Wednesday    1
Name: count, dtype: int64


In [48]:
# Time of the day counts (morning, afternoon, evening, night)
events_per_daytime = data['Time of day'].value_counts()
print(f'\nEvents per time of day:\n{events_per_daytime}')


Events per time of day:
Time of day
Evening      27
Morning       3
Afternoon     3
Name: count, dtype: int64


## The day that most candles were burning


In [49]:
# Identify the row with the minumum number of Nans
min_nans_row = data.isna().sum(axis=1).idxmin()
print(f'\nRow with minimum NaNs:\n{data.loc[min_nans_row]}')

# How many non Nan values are there in that row within the numerical columns
non_nan_count = data.loc[min_nans_row, numerical_columns].count()
print(f'Number of non-NaN values in that row within numerical columns = How many candles were burning: {non_nan_count}')


Row with minimum NaNs:
Date                2025-11-15 00:00:00
Time of day                     Evening
What               Giulio's 30 birthday
Yvet                                0.0
Wybren                              0.0
Ada-Mats                            0.0
Jonas                               NaN
Valgeir                             NaN
Arriën                              0.0
Julian                              NaN
Sophia                              NaN
Jakub                               NaN
Rhea                                NaN
Augustin                            NaN
Akhil-Shaima                        0.0
Sayantan-Annali                     0.0
Sanser                              0.0
Kuba                                0.0
Simon salsa                         NaN
Arzu                                NaN
Weeyarut                            NaN
Sebastian                           0.0
Christian                           0.0
Alex                                NaN
Pauline         

## How many people have participated in this

In [50]:
# How many numberical columns are there in total and what are their names
total_numerical_columns = len(numerical_columns)
print(f'Total number of numerical columns: {total_numerical_columns}')
print(f'Names of numerical columns: {numerical_columns}')

Total number of numerical columns: 29
Names of numerical columns: Index(['Yvet', 'Wybren', 'Ada-Mats', 'Jonas', 'Valgeir', 'Arriën', 'Julian',
       'Sophia', 'Jakub', 'Rhea', 'Augustin', 'Akhil-Shaima',
       'Sayantan-Annali', 'Sanser', 'Kuba', 'Simon salsa', 'Arzu', 'Weeyarut',
       'Sebastian', 'Christian', 'Alex', 'Pauline', 'Eirikur', 'Helena',
       'Simon M', 'Victor', 'Ceci', 'Simone M', 'Bastien'],
      dtype='object')


In [51]:
# First row of the dataframe.
first_row = data.iloc[0]
print(f'\nFirst row of the dataframe:\n{first_row}')

# Last row of the dataframe.
last_row = data.iloc[-1]
print(f'\nLast row of the dataframe:\n{last_row}')


First row of the dataframe:
Date                        2025-01-10 00:00:00
Time of day                             Evening
What               Arriën visits sick Alejandra
Yvet                                        NaN
Wybren                                      NaN
Ada-Mats                                    NaN
Jonas                                       NaN
Valgeir                                     NaN
Arriën                                      0.0
Julian                                      NaN
Sophia                                      NaN
Jakub                                       NaN
Rhea                                        NaN
Augustin                                    NaN
Akhil-Shaima                                NaN
Sayantan-Annali                             NaN
Sanser                                      NaN
Kuba                                        NaN
Simon salsa                                 NaN
Arzu                                        NaN
Weeyarut   

## Person with most candles burnt. And who has been there only once

In [55]:
# from the numerical columns, find the column with the highest value and print its name and value
max_column = None
max_value = float('-inf')
for column in numerical_columns:
    column_max = data[column].max()
    if column_max > max_value:
        max_value = column_max
        max_column = column
print(f'\nColumn with highest value: {max_column} with value {max_value}')
print(f'\nCorresponding to {max_value*5.5} hours of candle burning.')


Column with highest value: Yvet with value 5.0

Corresponding to 27.5 hours of candle burning.


In [54]:
# Find the people (numerical columns) who have been only in one event (one row) and print their names.
# That means, find the numerical columns that have always Nan except for one row.
for column in numerical_columns:
    non_nan_rows = data[column].dropna()
    if len(non_nan_rows) == 1:
        print(f'{column} has been only in one event.')


Jonas has been only in one event.
Sophia has been only in one event.
Augustin has been only in one event.
Simon salsa has been only in one event.
Arzu has been only in one event.
Weeyarut has been only in one event.
Alex has been only in one event.
Victor has been only in one event.
Simone M has been only in one event.
Bastien has been only in one event.


## Person with most events

In [None]:
# In the numerical columns, how many non values per column are there?
for column in numerical_columns:
    number_of_events = data[column].count()
    print(f'Column {column} has {number_of_events} number of events.')


Time of day: Evening
  Column Yvet has 14 number of events.
  Column Wybren has 7 number of events.
  Column Ada-Mats has 5 number of events.
  Column Jonas has 1 number of events.
  Column Valgeir has 2 number of events.
  Column Arriën has 6 number of events.
  Column Julian has 0 number of events.
  Column Sophia has 0 number of events.
  Column Jakub has 7 number of events.
  Column Rhea has 2 number of events.
  Column Augustin has 1 number of events.
  Column Akhil-Shaima has 2 number of events.
  Column Sayantan-Annali has 2 number of events.
  Column Sanser has 2 number of events.
  Column Kuba has 2 number of events.
  Column Simon salsa has 1 number of events.
  Column Arzu has 1 number of events.
  Column Weeyarut has 1 number of events.
  Column Sebastian has 2 number of events.
  Column Christian has 2 number of events.
  Column Alex has 1 number of events.
  Column Pauline has 4 number of events.
  Column Eirikur has 3 number of events.
  Column Helena has 2 number of ev

In [None]:
# Do the same filtered by time of day
for time_of_day in data['Time of day'].unique():
    print(f'\nTime of day: {time_of_day}')
    filtered_data = data[data['Time of day'] == time_of_day]
    for column in numerical_columns:
        number_of_events = filtered_data[column].count()
        print(f'  Column {column} has {number_of_events} number of events.')


Time of day: Evening
  Column Yvet has 14 number of events.
  Column Wybren has 7 number of events.
  Column Ada-Mats has 5 number of events.
  Column Jonas has 1 number of events.
  Column Valgeir has 2 number of events.
  Column Arriën has 6 number of events.
  Column Julian has 0 number of events.
  Column Sophia has 0 number of events.
  Column Jakub has 7 number of events.
  Column Rhea has 2 number of events.
  Column Augustin has 1 number of events.
  Column Akhil-Shaima has 2 number of events.
  Column Sayantan-Annali has 2 number of events.
  Column Sanser has 2 number of events.
  Column Kuba has 2 number of events.
  Column Simon salsa has 1 number of events.
  Column Arzu has 1 number of events.
  Column Weeyarut has 1 number of events.
  Column Sebastian has 2 number of events.
  Column Christian has 2 number of events.
  Column Alex has 1 number of events.
  Column Pauline has 4 number of events.
  Column Eirikur has 3 number of events.
  Column Helena has 2 number of ev

## Longer stays

In [None]:
# For each numerical column, divide its maximum value by the total number of events in that column and print the result
for column in numerical_columns:
    max_value = data[column].max()
    event_count = data[column].count()
    if event_count > 0:
        ratio = max_value / event_count
        print(f'Column {column}: max value {max_value} / number of events {event_count} = {ratio}')
    else:
        print(f'Column {column} has no events.')

Column Yvet: max value 5.0 / number of events 16 = 0.3125
Column Wybren: max value 4.0 / number of events 9 = 0.4444444444444444
Column Ada-Mats: max value 1.0 / number of events 6 = 0.16666666666666666
Column Jonas: max value 0.0 / number of events 1 = 0.0
Column Valgeir: max value 1.0 / number of events 3 = 0.3333333333333333
Column Arriën: max value 2.0 / number of events 6 = 0.3333333333333333
Column Julian: max value 0.0 / number of events 4 = 0.0
Column Sophia: max value 0.0 / number of events 1 = 0.0
Column Jakub: max value 4.0 / number of events 10 = 0.4
Column Rhea: max value 1.0 / number of events 2 = 0.5
Column Augustin: max value 0.0 / number of events 1 = 0.0
Column Akhil-Shaima: max value 1.0 / number of events 3 = 0.3333333333333333
Column Sayantan-Annali: max value 1.0 / number of events 3 = 0.3333333333333333
Column Sanser: max value 1.0 / number of events 3 = 0.3333333333333333
Column Kuba: max value 0.0 / number of events 2 = 0.0
Column Simon salsa: max value 0.0 / n