In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import json
import os
print("Modules imported!")

Modules imported!


In [3]:
def read_json(file_path):
    """Load a UTF-8 encoded JSON file.

    Returns the parsed content, or an empty dict when `file_path` does not
    exist. A status line is printed in both cases.
    """
    if not os.path.exists(file_path):
        print(f"No data found at path: {file_path}")
        return {}

    with open(file_path, "r", encoding="utf8") as handle:
        content = json.load(handle)
    print(f"Data read from path: {file_path}")
    return content

def serialize_json(filename, data):
    """Write `data` to `filename` as UTF-8 JSON (4-space indent, non-ASCII kept as-is).

    A confirmation line is printed once the content has been written.
    """
    with open(filename, "w", encoding="utf8") as out_file:
        out_file.write(json.dumps(data, ensure_ascii=False, indent=4))
        print(f"Data serialized to path: {filename}")

In [4]:
# Data correction: there was a bug in the given framework causing timestamps in data_try_{n}.json to be wrong

# for workerDir in os.scandir('../Data/'):
#     data_try = read_json(workerDir.path + "/data_try_1.json")
#     for workerJson in os.scandir(workerDir.path):
#         if workerJson.name.startswith('quest') or workerJson.name.startswith('doc'):
#             sourceJson = read_json(workerJson.path)
#             if workerJson.name.startswith('quest'):
#                 data_try['timestamps_end'][0][0] = sourceJson['timestamps_end'][0]
#                 data_try['timestamps_elapsed'][0][0] = sourceJson['timestamps_elapsed']
#             else:
#                 workerJson_split = workerJson.name.split('_')
#                 docIndex = int(workerJson_split[1])
#                 docTry = int(workerJson_split[5])
#                 data_try['timestamps_end'][docIndex][docTry - 1] = sourceJson['timestamps_end'][docTry - 1]
#                 data_try['timestamps_elapsed'][docIndex][docTry - 1] = sourceJson['timestamps_elapsed']
#     serialize_json(workerDir.path + "/data_try_1.json", data_try)

In [5]:
# Worker analysis: one row per worker directory, holding the decoded
# questionnaire answers plus the device the worker used
workerInfoList = []
for workerDir in os.scandir('../Data/'):
    task_data = read_json(workerDir.path + "/task_data.json")
    data_try = read_json(workerDir.path + "/data_try_1.json")

    # Question name -> list of its possible answers
    questionnaire = task_data['questionnaires'][0]['questions']
    reformatQuest = {question['name']: question['answers'] for question in questionnaire}

    # Fall back to the operating system when the device is reported as 'Unknown'
    detector_props = task_data["worker"]['ngxDeviceDetectorProperties']
    device = detector_props["device"]
    if device == 'Unknown':
        device = detector_props["os"]

    # Stored answers are indices into the answer lists: decode them to their text
    workerInfo = {
        key: reformatQuest[key][int(value)]
        for key, value in data_try['questionnaires_answers'][0].items()
    }
    workerInfo['device'] = device
    workerInfoList.append(workerInfo)

df = pd.DataFrame.from_dict(workerInfoList)
df

Data read from path: ../Data/ZRXSP/task_data.json
Data read from path: ../Data/ZRXSP/data_try_1.json
Data read from path: ../Data/AGGBZ/task_data.json
Data read from path: ../Data/AGGBZ/data_try_1.json
Data read from path: ../Data/LZAEJ/task_data.json
Data read from path: ../Data/LZAEJ/data_try_1.json
Data read from path: ../Data/DUMDJ/task_data.json
Data read from path: ../Data/DUMDJ/data_try_1.json
Data read from path: ../Data/ZLAXX/task_data.json
Data read from path: ../Data/ZLAXX/data_try_1.json
Data read from path: ../Data/HQWLU/task_data.json
Data read from path: ../Data/HQWLU/data_try_1.json
Data read from path: ../Data/BOJHZ/task_data.json
Data read from path: ../Data/BOJHZ/data_try_1.json
Data read from path: ../Data/QMJTN/task_data.json
Data read from path: ../Data/QMJTN/data_try_1.json
Data read from path: ../Data/EUOBG/task_data.json
Data read from path: ../Data/EUOBG/data_try_1.json
Data read from path: ../Data/JHEEH/task_data.json
Data read from path: ../Data/JHEEH/data_t

Unnamed: 0,Età,Lavoro,Genere letterario preferito,kindle,Importanza,Quanti libri,device
0,36-45,Dipendente,Biografia,No,Scarsa,3-5,Mac
1,46-55,Freelancer,Formazione,No,Discreta,3-5,Windows
2,18-25,Studente,Rosa,Si,Fondamentale,0,iPhone
3,18-25,Studente,Psicologico,Si,Fondamentale,11+,Android
4,18-25,Studente,Giallo,Si,Discreta,1-2,Android
5,18-25,Studente,Giallo,Si,Discreta,6-10,Windows
6,18-25,Studente,Biografia,No,Fondamentale,6-10,Windows
7,18-25,Studente,Commedia,No,Fondamentale,6-10,iPhone
8,18-25,Studente,Sci-Fi,Si,Scarsa,1-2,Linux
9,18-25,Studente,Avventura,No,Fondamentale,6-10,Android


In [6]:
# Sunburst charts of the worker population (work on a copy so `df` stays untouched)
sunburst_df = df.copy()

# Replace the Si/No eReader answers with the labels shown in the chart
kindle_labels = {'Si': 'Ha un<br>eReader', 'No': 'Non ha un<br>eReader'}
sunburst_df["kindle"] = sunburst_df["kindle"].apply(kindle_labels.get)

# Age -> job -> device -> eReader ownership
sunburst_device = px.sunburst(sunburst_df, path=['Età', 'Lavoro', 'device', 'kindle'])
sunburst_device.show()
force = False  # set to True to overwrite files that were already exported
if force or not os.path.isfile('./plot/sunburst_device.html'):
    sunburst_device.write_html('./plot/sunburst_device.html')
    sunburst_device.write_image('./plot/sunburst_device.png')

# Age -> job -> favourite literary genre
sunburst_genre = px.sunburst(sunburst_df, path=['Età', 'Lavoro', 'Genere letterario preferito'])
sunburst_genre.show()
force = False
if force or not os.path.isfile('./plot/sunburst_genre.html'):
    sunburst_genre.write_html('./plot/sunburst_genre.html')
    sunburst_genre.write_image('./plot/sunburst_genre.png')


In [7]:
# -- ABSOLUTE AND RELATIVE FREQUENCIES FOR ALL QUESTIONNAIRE PARAMETERS IN JSON --

# Count occurrences of every value, column by column
counter = {column: dict(Counter(df[column])) for column in df}

# Absolute frequencies for each option of each parameter
# (options that never occur in df get a count of 0)
reformatQuestCount = {
    name: {option: counter[name].get(option, 0) for option in options}
    for name, options in reformatQuest.items()
}

# Relative frequencies for each option of each parameter
# (all zeros when a parameter has no counted answers at all)
reformatQuestCountRel = {}
for name, absolute in reformatQuestCount.items():
    total = sum(absolute.values())
    if total == 0:
        reformatQuestCountRel[name] = {option: 0 for option in absolute}
    else:
        reformatQuestCountRel[name] = {option: count / total for option, count in absolute.items()}

# You can call this function to save the counts in JSON format in the current directory
def serialize_count():
    """Save the questionnaire frequencies as JSON files.

    Writes `reformatQuestCount` (absolute counts) to ./quest_count.json and
    `reformatQuestCountRel` (relative frequencies) to ./quest_count_rel.json,
    both through `serialize_json`.
    """
    serialize_json("./quest_count.json", reformatQuestCount)
    # The "_rel" file must hold the relative frequencies; it used to be a
    # second copy of the absolute counts.
    serialize_json("./quest_count_rel.json", reformatQuestCountRel)
    
            
# Builds a dataframe from the frequency dicts (according to the given parameter)
def to_dataframe(param, rel=None):
    """Return a two-column frame for one questionnaire parameter.

    The first column is named after `param` and lists its options; the
    'Frequenza' column holds their frequencies. Relative frequencies are
    used when `rel` is truthy, absolute counts otherwise.
    """
    frequencies = reformatQuestCountRel[param] if rel else reformatQuestCount[param]
    return pd.DataFrame({param: frequencies.keys(),
                         'Frequenza': frequencies.values()})

# Debugging test
# genre = to_dataframe("Genere letterario preferito")
# plt.xticks(rotation=70)
# plt.bar(genre["Genere letterario preferito"], genre["Genere letterario preferito"])
    

In [79]:
def get_freq(df, column):
    """Count the rows of `df` for each value of `column`, most frequent first.

    `column` is passed straight to `groupby`, so it can be one column name or
    a list of names. The counts come from the non-null entries of the
    'device' column and are returned in a column named 'f'.
    """
    per_group = df.groupby(by=column).count()
    freq = per_group[['device']].reset_index()
    freq = freq.rename(columns={'device': 'f'})
    return freq.sort_values(by=['f'], ascending=False)

def to_relative_freq(df, column='f'):
    """Turn the absolute frequencies stored in `column` into relative ones.

    Every value of `column` is divided by the column's sum. `df` is modified
    in place and also returned, so `df = to_relative_freq(df)` call sites keep
    working. No special handling is done for a column that sums to 0.
    """
    # Honour the `column` argument: it used to be ignored in favour of a
    # hard-coded 'f', so any other column name raised a KeyError.
    total_amount = df[column].sum()
    df[column] = df[column] / total_amount

    return df

def save_plot(figure, name, force=False):
    """Export `figure` to ./plot/{name}.html and ./plot/{name}.png.

    Nothing is written when ./plot/{name}.html already exists, unless `force`
    is true.
    """
    if not os.path.isfile(f'./plot/{name}.html') or force:
        # Export the figure that was passed in. The global `hist_age` used to
        # be written here, so every file contained the age histogram.
        figure.write_html(f'./plot/{name}.html')
        figure.write_image(f'./plot/{name}.png')

In [80]:
# Convert the genre list into a df
df_genre = pd.DataFrame(reformatQuest['Genere letterario preferito'], columns=['Genere letterario preferito'])

# Count occurrences and store them in a one-column frame named 'f'.
# The Series is renamed explicitly because the name of the value_counts()
# result depends on the pandas version, so renaming a column by the original
# column name is not reliable.
counts = df['Genere letterario preferito']\
    .value_counts()\
    .rename('f')\
    .to_frame()

# Join the previous two, replace NaN with 0s (genres nobody picked) and sort everything for better view
df_genre = df_genre\
    .join(counts, on='Genere letterario preferito')\
    .fillna(0)\
    .sort_values(by='f', ascending=False)

# From float to int (the join introduces floats because of the NaNs)
df_genre['f'] = df_genre['f'].astype(int)

# Relative freq
total_amount = df_genre['f'].sum()
df_genre['f'] = df_genre['f'] / total_amount

# Plot everything
bar_genre = px.bar(df_genre, x = 'Genere letterario preferito', y = 'f', labels={'f': 'Frequenza'}, color='Genere letterario preferito')

# Add % sign to the ticks
bar_genre.update_layout(yaxis_tickformat = '%')
bar_genre.show()
save_plot(bar_genre, 'bar_genre')

In [81]:
# Plot based on genre and age: relative frequency of every (genre, age range) pair
df_genre_age = to_relative_freq(get_freq(df, ['Genere letterario preferito', 'Età']))

# One bar per genre, coloured by age range
bar_genre_age = px.bar(
    df_genre_age,
    x='Genere letterario preferito',
    y='f',
    color='Età',
    labels={'f': 'Frequenza'},
)

# Show the y ticks as percentages
bar_genre_age.update_layout(yaxis_tickformat='%')

bar_genre_age.show()
save_plot(bar_genre_age, 'bar_genre_age')

In [82]:
# Create dataframe
df_age = df[['Età']]

# Normalize every age range to its lower bound so it can be binned numerically;
# any range that is not listed below is mapped to 56
age_lower_bound = {'18-25': 18, '26-35': 26, '36-45': 36, '46-55': 46}
age_normalized = [age_lower_bound.get(age_range, 56) for age_range in df_age['Età']]

df_age_normalized = pd.DataFrame(age_normalized)

# matplotlib alternative:
# plt.hist(df_age_normalized, bins=[18,26,36,46,56,100], density=True)
hist_age = px.histogram(
    df_age_normalized,
    histnorm='probability',
    range_x=[18, 66],
    labels={'value': 'Età'},
)

# Bins used for the histogram
hist_age.update_traces(xbins={'start': 16, 'end': 80, 'size': 10})

hist_age.update_layout(showlegend=False, yaxis_title='Frequenza')

hist_age.show()

save_plot(hist_age, 'hist_age')

In [83]:
# Kindle: relative frequency of each answer to the eReader ownership question
df_kindle = to_relative_freq(get_freq(df, 'kindle'))

bar_kindle = px.bar(
    df_kindle,
    x='kindle',
    y='f',
    color='kindle',
    labels={'kindle': 'Possessori di kindle', 'f': 'Frequenza'},
)

# The x axis already names each bar, so the legend is hidden
bar_kindle.update_layout(showlegend=False)

bar_kindle.show()

save_plot(bar_kindle, 'bar_kindle')

In [103]:
# Importance of reading ("Importanza della lettura")
# Rows are put in a fixed order through hard-coded index labels before the
# counts are turned into relative frequencies.
# NOTE(review): presumably this is the Likert order of the answers; the labels
# depend on which answers occur in df - confirm when the data changes.
df_reading = to_relative_freq(get_freq(df, 'Importanza').reindex([2, 3, 0, 1]))

bar_reading = px.bar(
    df_reading,
    x='Importanza',
    y='f',
    color='Importanza',
    labels={'Importanza': 'Importanza della lettura', 'f': 'Frequenza'},
)

bar_reading.update_layout(showlegend=False)

bar_reading.show()

save_plot(bar_reading, 'bar_reading')

In [119]:
# Number of books read per year
df_books_read = get_freq(df, 'Quanti libri')

# Put the rows in a fixed order through hard-coded index labels.
# NOTE(review): this yields ascending number of books only while all five
# answers occur in df - confirm when the data changes.
df_books_read = df_books_read.reindex([0,1,3,4,2])

# Relative freq
df_books_read = to_relative_freq(df_books_read)

# Address the plotly bug which ignores the dtype: give the row labelled 0 a
# textual category. A single .loc assignment is used because chained indexing
# (df[col][row] = ...) may write to a temporary copy.
df_books_read.loc[0, 'Quanti libri'] = 'Nessuno'

bar_books_read = px.bar(df_books_read, x='Quanti libri', y='f', color='Quanti libri', labels={'Quanti libri': 'Numero di libri letti in un anno', 'f': 'Frequenza'})

bar_books_read.update_layout(
    showlegend=False
)

bar_books_read.show()

# Save this cell's figure under its own name (it used to re-save `bar_reading`)
save_plot(bar_books_read, 'bar_books_read')

In [55]:
# Collect the non-empty comments left by the workers, keyed by worker directory name
commentDict = {}
for workerDir in os.scandir('../Data/'):
    comment_path = f'{workerDir.path}/comment_try_1.json'
    # Directories without a comment file are skipped
    if not os.path.isfile(comment_path):
        continue
    comment = read_json(comment_path)
    if comment['comment'] != '':
        commentDict[workerDir.name] = comment['comment']
serialize_json('comment_summary.json', commentDict)
commentDict


Data read from path: ../Data/ZRXSP/comment_try_1.json
Data read from path: ../Data/AGGBZ/comment_try_1.json
Data read from path: ../Data/LZAEJ/comment_try_1.json
Data read from path: ../Data/ZLAXX/comment_try_1.json
Data read from path: ../Data/HQWLU/comment_try_1.json
Data read from path: ../Data/BOJHZ/comment_try_1.json
Data read from path: ../Data/QMJTN/comment_try_1.json
Data read from path: ../Data/EUOBG/comment_try_1.json
Data read from path: ../Data/JHEEH/comment_try_1.json
Data read from path: ../Data/EVDDT/comment_try_1.json
Data read from path: ../Data/RYYES/comment_try_1.json
Data read from path: ../Data/HEWCH/comment_try_1.json
Data read from path: ../Data/SEYYG/comment_try_1.json
Data read from path: ../Data/VXCGS/comment_try_1.json
Data read from path: ../Data/UPVVU/comment_try_1.json
Data read from path: ../Data/CFRLM/comment_try_1.json
Data read from path: ../Data/DJTWC/comment_try_1.json
Data read from path: ../Data/SARZI/comment_try_1.json
Data read from path: ../Data

{'AGGBZ': "Mi ha fatto piacere partecipare. \nSpero di esservi stata d'aiuto nel vostro progetto universitario. \n",
 'ZLAXX': 'Il test è strutturato molto bene, non modificherei nulla',
 'EVDDT': 'Alla grandine, bel lavoro',
 'RYYES': "sono contento di essere stato utile per un'attività che servirà per la formazione di giovani studenti  ",
 'CFRLM': 'Grazie',
 'SARZI': 'Devo ancora capire il senso di qst test... ',
 'THRZI': 'Tutto perfetto! Davvero complimenti ',
 'EOTDA': 'Spero di essere stato utile',
 'IMHHT': 'Organizzazione ottima anche nella versione per telefono',
 'ITCXH': 'Ciao!! ',
 'HBIRW': 'Va bene così',
 'ACHTQ': 'La parte della giustificazione è troppo lunga. Obbligare ad inserire minimo 10 parole mi ha quasi fatto passar la voglia di fare questo questionario. ',
 'KAHOD': 'Grazie.',
 'HDLGN': 'Dal telefono quando passo da un libro all’altro preferirei ricaricasse la pagina dall’inizio senza costringermi a scorrere in alto.'}