<a href="https://colab.research.google.com/github/casset-org/app/blob/main/io.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from google.colab import drive
import requests
import io

# --- Constants and Configurations ---
# Mount point handed to drive.mount() in the main section.
DRIVE_MOUNT_PATH = "/content/drive"
# Output folder on the mounted drive; save_fig() and
# generate_summary_statistics() both write underneath it.
DATA_PATH = os.path.join(DRIVE_MOUNT_PATH, "MyDrive", "data")  # Shared folder for data and graphs
# Palette name used by the box, scatter and bar plots, and as the heatmap colormap.
DEFAULT_PALETTE = "Spectral"
# Figure size passed to plt.figure() by create_figure() (matplotlib measures it in inches).
FIGSIZE = (10, 6)
# CSV export URL of the Google Sheets document fetched by load_data_from_url().
DATASET_URL = "https://docs.google.com/spreadsheets/d/1CukfW6MJBt0HoLYikhP46GsnzZva5K_ynKwPlAkVwQw/export?format=csv"

# Style configuration.
# The seaborn theme is applied first and the matplotlib rcParams are updated
# afterwards, so the explicit values below are the ones left in effect.
sns.set(style="darkgrid")
plt.rcParams.update({
    # Figure, axes and legend all share the same dark background color.
    'figure.facecolor': '#282c34',
    'axes.facecolor': '#282c34',
    'axes.edgecolor': 'white',
    'axes.labelcolor': 'lime',
    'xtick.color': 'white',
    'ytick.color': 'white',
    'text.color': 'white',
    'grid.color': 'gray',
    'grid.linestyle': '--',
    'legend.facecolor': '#282c34',
    'legend.edgecolor': 'white',
    # Font sizes for the figure title, axes titles and axis labels.
    'figure.titlesize': 18,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
})

# --- Auxiliary Functions ---
def save_fig(filename):
    """Save the current matplotlib figure under ``DATA_PATH`` and close it.

    Args:
        filename: File name (optionally with sub-folders) relative to
            ``DATA_PATH``. The parent directory is created when missing.
    """
    target = os.path.join(DATA_PATH, filename)
    parent_dir, _ = os.path.split(target)
    os.makedirs(parent_dir, exist_ok=True)
    plt.savefig(target, dpi=300, bbox_inches='tight')
    # Close the figure so the next plot starts from a clean state.
    plt.close()

def create_figure():
    """Open a new matplotlib figure sized ``FIGSIZE`` and return it."""
    fig = plt.figure(figsize=FIGSIZE)
    return fig

def load_data_from_url(url, *, timeout=30):
    """Download a CSV document and parse it into a DataFrame.

    Args:
        url: Address of the CSV resource.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so an unresponsive
            server would hang the whole script.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the download,
        the UTF-8 decoding or the CSV parsing fails. The error is printed
        rather than raised, so callers must check for ``None``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn HTTP 4xx/5xx responses into exceptions instead of parsing
        # an error page as CSV.
        response.raise_for_status()
        csv_content = response.content.decode('utf-8')
        return pd.read_csv(io.StringIO(csv_content))
    except Exception as e:
        # Deliberately broad: this is a best-effort loader whose contract
        # is "DataFrame or None"; the caller decides how to abort.
        print(f"Error loading data: {e}")
        return None

# --- Visualization Functions ---
def generate_boxplot(df, x, y, title="", filename="", hue=None):
    """Draw a box plot of ``y`` grouped by ``x`` and save it via ``save_fig``.

    Args:
        df: DataFrame holding the columns to plot.
        x: Column used for the categories on the x axis.
        y: Column whose distribution is drawn.
        title: Title placed on the plot.
        filename: Output file name handed to ``save_fig``.
        hue: Optional column used for color grouping. When omitted, the
            boxes are colored by ``x`` and the legend is suppressed.
    """
    create_figure()
    if hue is None:
        # Color by the x variable itself; its legend would be redundant.
        color_opts = {"hue": x, "legend": False}
    else:
        color_opts = {"hue": hue}
    sns.boxplot(x=x, y=y, data=df, palette=DEFAULT_PALETTE, **color_opts)
    plt.title(title)
    plt.xticks(rotation=45, ha="right")
    save_fig(filename)

def generate_scatterplot(df, x, y, title="", filename="", hue=None, style=None):
    """Draw a scatter plot of ``y`` against ``x`` and save it via ``save_fig``.

    Args:
        df: DataFrame holding the columns to plot.
        x: Column plotted on the x axis.
        y: Column plotted on the y axis.
        title: Title placed on the plot.
        filename: Output file name handed to ``save_fig``.
        hue: Optional column mapped to marker color.
        style: Optional column mapped to marker style.
    """
    create_figure()
    # seaborn draws on the current axes and returns them.
    axes = sns.scatterplot(
        data=df,
        x=x,
        y=y,
        hue=hue,
        style=style,
        palette=DEFAULT_PALETTE,
        alpha=0.8,
    )
    axes.set_title(title)
    save_fig(filename)

def generate_histogram(df, column, title="", filename="", bins=20, kde=True):
    """Draw a histogram of one column and save it via ``save_fig``.

    Args:
        df: DataFrame holding the column to plot.
        column: Name of the column whose distribution is drawn.
        title: Title placed on the plot.
        filename: Output file name handed to ``save_fig``.
        bins: Number of histogram bins.
        kde: Whether to overlay a kernel density estimate.
    """
    create_figure()
    # seaborn draws on the current axes and returns them.
    axes = sns.histplot(data=df, x=column, bins=bins, kde=kde, color='skyblue')
    axes.set_title(title)
    save_fig(filename)

def generate_heatmap(df, title="", filename="", annot=True):
    """Draw a heatmap of the numeric-column correlations and save it.

    Args:
        df: DataFrame to correlate; non-numeric columns are ignored.
        title: Title placed on the plot.
        filename: Output file name handed to ``save_fig``.
        annot: Whether to print each coefficient inside its cell.
    """
    create_figure()
    correlations = df.corr(numeric_only=True)
    sns.heatmap(
        correlations,
        annot=annot,
        cmap=DEFAULT_PALETTE,
        fmt=".2f",
        linewidths=.5,
    )
    plt.title(title)
    save_fig(filename)

def generate_barplot(df, x, y, title="", filename="", hue=None):
    """Draw a bar plot of ``y`` per ``x`` category and save it via ``save_fig``.

    Error bars are disabled. The axis labels are set to the column names
    passed as ``x`` and ``y``.

    Args:
        df: DataFrame holding the columns to plot.
        x: Column used for the categories on the x axis.
        y: Column aggregated into the bar heights.
        title: Title placed on the plot.
        filename: Output file name handed to ``save_fig``.
        hue: Optional column used for color grouping.
    """
    create_figure()
    # seaborn draws on the current axes and returns them.
    axes = sns.barplot(
        data=df,
        x=x,
        y=y,
        hue=hue,
        palette=DEFAULT_PALETTE,
        errorbar=None,
    )
    axes.set_title(title)
    axes.set_xlabel(x)
    axes.set_ylabel(y)
    save_fig(filename)

# --- Statistical Analysis Functions ---
def perform_t_test(df, group_col, value_col, print_results=True):
    """Compare ``value_col`` between the True and False groups of ``group_col``.

    Runs an independent two-sample t-test without assuming equal variances
    (``equal_var=False``, i.e. Welch's t-test). Missing values in
    ``value_col`` are dropped before testing.

    Args:
        df: DataFrame holding both columns.
        group_col: Column whose values equal to ``True`` / ``False`` define
            the two groups.
        value_col: Numeric column to compare between the groups.
        print_results: Whether to print the statistic and p-value.

    Returns:
        ``(t_stat, p_value)``, or ``(None, None)`` when either group has no
        non-missing values.
    """
    # The file's import section never imports scipy.stats, so the original
    # reference to ``stats`` raised NameError unless an earlier notebook
    # cell happened to define it. Import it here so the function is
    # self-contained.
    from scipy import stats

    flags = df[group_col]
    # Element-wise comparison on purpose: it also matches 1/0 columns,
    # which a plain boolean mask would not accept.
    group1 = df.loc[flags == True, value_col].dropna()  # noqa: E712
    group2 = df.loc[flags == False, value_col].dropna()  # noqa: E712
    if group1.empty or group2.empty:
        print(f"Not enough data for t-test on {value_col}")
        return None, None

    t_stat, p_value = stats.ttest_ind(group1, group2, equal_var=False)
    if print_results:
        print(f"T-test ({value_col} - Casset vs. Non-Casset): t={t_stat:.3f}, p={p_value:.3f}")
    return t_stat, p_value

def generate_summary_statistics(df, filename="sumario.txt"):
    """Write ``df.describe(include='all')`` to a text file under ``DATA_PATH``.

    Args:
        df: DataFrame to summarize.
        filename: Output file name relative to ``DATA_PATH``; its parent
            directory is created when missing.
    """
    summary = df.describe(include='all').to_string()
    filepath = os.path.join(DATA_PATH, filename)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # Explicit UTF-8: the default encoding depends on the platform locale
    # and could fail on (or mangle) non-ASCII column names and values.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write("Statistical Summary:\n")
        f.write(summary)
    print(f"Summary saved to {filepath}")

# --- Main Function ---
if __name__ == "__main__":
    # Script entry point: mount Google Drive, download the dataset, clean
    # it, write every plot plus the summary file to DATA_PATH, and print
    # the t-test results.

    # Mount Google Drive
    try:
        drive.mount(DRIVE_MOUNT_PATH, force_remount=True)
    except Exception as e:
        print(f"Error mounting Google Drive: {e}")
        # Raise SystemExit instead of calling exit(): exit() is an
        # interactive helper that reports status 0 and is not guaranteed to
        # stop a notebook cell immediately.
        raise SystemExit(1)
    else:
        print("Google Drive mounted successfully.")

    # Create directories
    os.makedirs(DATA_PATH, exist_ok=True)

    # Load data
    df = load_data_from_url(DATASET_URL)
    if df is None:
        print("Failed to load data. Exiting the script.")
        raise SystemExit(1)

    # Rename columns from the sheet's snake_case headers to the names used
    # throughout the analysis.
    df.rename(columns={
        'projeto_id': 'ProjetoID',
        'usa_casset': 'UsaCasset',
        'tempo_etapa1': 'TempoEtapa1',
        'tempo_etapa2': 'TempoEtapa2',
        'tempo_etapa3': 'TempoEtapa3',
        'interacoes_docs': 'InteracoesDocs',
        'interacoes_email': 'InteracoesEmail',
        'interacoes_reunioes': 'InteracoesReunioes',
        'qualidade_produto': 'QualidadeProduto',
        'satisfacao_participantes': 'SatisfacaoParticipantes',
        'num_participantes': 'NumParticipantes',
        'centralidade_comunicacao': 'CentralidadeComunicacao',
        'diversidade_equipe': 'DiversidadeEquipe',
        'eficiencia_processual': 'EficienciaProcessual',
        'aderencia_objetivos': 'AderenciaObjetivos'
    }, inplace=True)

    # Convert 'UsaCasset' to boolean
    # NOTE(review): astype(bool) maps every non-empty string and NaN to
    # True; this assumes the column was already parsed as bool or 0/1 --
    # confirm against the sheet.
    df['UsaCasset'] = df['UsaCasset'].astype(bool)

    # Data cleaning
    numeric_columns = [
        'ProjetoID', 'TempoEtapa1', 'TempoEtapa2', 'TempoEtapa3',
        'InteracoesDocs', 'InteracoesEmail', 'InteracoesReunioes',
        'QualidadeProduto', 'SatisfacaoParticipantes', 'NumParticipantes',
        'CentralidadeComunicacao', 'DiversidadeEquipe',
        'EficienciaProcessual', 'AderenciaObjetivos'
    ]
    for col in numeric_columns:
        # Columns that read_csv already parsed as numbers are left alone.
        # Round-tripping them through str and the regex below would delete
        # minus signs and the "e" of scientific notation ("1e-05" -> "105").
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        # Text columns: keep only digits and dots, then coerce to numbers.
        # Raw string avoids the invalid-escape warning for "\d" and "\.".
        scrubbed = df[col].astype(str).str.replace(r'[^\d\.]', '', regex=True)
        df[col] = pd.to_numeric(scrubbed, errors='coerce')
    # Drop every row in which any expected numeric column is missing.
    df.dropna(subset=numeric_columns, inplace=True)

    # Analyses and visualizations
    df['TempoTotal'] = df['TempoEtapa1'] + df['TempoEtapa2'] + df['TempoEtapa3']
    generate_boxplot(df, 'UsaCasset', 'TempoTotal',
                     title="Total Project Time (Casset vs. Non-Casset)",
                     filename="1_boxplot_tempo_total.png")
    generate_boxplot(df, 'UsaCasset', 'QualidadeProduto',
                     title="Final Product Quality (Casset vs. Non-Casset)",
                     filename="2_boxplot_qualidade.png")
    generate_boxplot(df, 'UsaCasset', 'SatisfacaoParticipantes',
                     title="Participant Satisfaction (Casset vs. Non-Casset)",
                     filename="3_boxplot_satisfacao.png")
    # Not plotted on its own; it is included in the correlation heatmap
    # and in the summary statistics below.
    df['InteracoesTotais'] = df['InteracoesDocs'] + df['InteracoesEmail'] + df['InteracoesReunioes']
    generate_barplot(df, 'UsaCasset', 'QualidadeProduto',
                     title="Average Product Quality by Casset Usage",
                     filename="4_barplot_qualidade.png", hue='UsaCasset')
    generate_scatterplot(df, 'CentralidadeComunicacao', 'TempoTotal',
                         title="Communication Centrality vs. Total Time",
                         filename="5_scatter_centralidade_tempo.png", hue='UsaCasset')
    generate_histogram(df, 'QualidadeProduto',
                       title="Distribution of Final Product Quality",
                       filename="6_hist_qualidade.png")
    generate_histogram(df, 'SatisfacaoParticipantes',
                       title="Distribution of Participant Satisfaction",
                       filename="7_hist_satisfacao.png")
    generate_heatmap(df, title="Correlation Heatmap",
                     filename="9_heatmap.png")

    # T-tests
    perform_t_test(df, 'UsaCasset', 'TempoTotal')
    perform_t_test(df, 'UsaCasset', 'QualidadeProduto')
    perform_t_test(df, 'UsaCasset', 'SatisfacaoParticipantes')

    # Generate summary statistics
    generate_summary_statistics(df)

    print("Casset Analysis Complete!")

Mounted at /content/drive
Google Drive mounted successfully.
T-test (TempoTotal - Casset vs. Non-Casset): t=7.676, p=0.000
T-test (QualidadeProduto - Casset vs. Non-Casset): t=16.616, p=0.000
T-test (SatisfacaoParticipantes - Casset vs. Non-Casset): t=16.105, p=0.000
Summary saved to /content/drive/MyDrive/data/sumario.txt
Casset Analysis Complete!
