<a href="https://colab.research.google.com/github/aShYousef/Freecode2/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from google.colab import drive

MOUNT_PATH = '/content/drive'

# Attach Google Drive at MOUNT_PATH (forcing a remount). A failure is printed
# rather than raised, so the rest of the notebook can still be executed.
try:
    print(f"Mounting Google Drive to {MOUNT_PATH}...")
    drive.mount(MOUNT_PATH, force_remount=True)
except Exception as mount_error:
    print(f"Failed to mount Drive: {mount_error}")
else:
    print("Drive mounted successfully.")

Mounting Google Drive to /content/drive...
Mounted at /content/drive
Drive mounted successfully.


In [None]:
!pip install -q pyspark streamlit pyngrok

import os
import time
import json
import shutil
import pandas as pd
import numpy as np
from datetime import datetime
from pyngrok import ngrok
from google.colab import drive, userdata

MOUNT_POINT = '/content/drive'

# Mount Google Drive. When the mount directory already exists a forced remount
# is requested; otherwise the plain first-time mount call is used.
try:
    mount_kwargs = {"force_remount": True} if os.path.exists(MOUNT_POINT) else {}
    drive.mount(MOUNT_POINT, **mount_kwargs)
    print(f"Drive mounted successfully at {MOUNT_POINT}")
except Exception as mount_error:
    print(f"Drive mount failed: {mount_error}")

# Register the ngrok auth token read from the Colab secret NGROK_TOKEN.
# A missing/empty token only produces a notice; errors are printed, not raised.
try:
    AUTH_TOKEN = userdata.get('NGROK_TOKEN')
    if not AUTH_TOKEN:
        print("Notice: NGROK_TOKEN not found in secrets.")
    else:
        ngrok.set_auth_token(AUTH_TOKEN)
        print("Ngrok authenticated.")
except Exception as auth_error:
    print(f"Ngrok authentication error: {auth_error}")

streamlit_app_code = r'''
import streamlit as st
import time
import pandas as pd
import json
import os
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor
from pyspark.ml.clustering import KMeans

# Set the browser page title and use the wide layout for the whole app.
st.set_page_config(page_title="Distributed Data System", layout="wide")

# Root folder (under the Drive mount) that receives dataset backups and report folders.
BASE_OUTPUT_PATH = "/content/drive/MyDrive/University_Project_Results"

def save_dataframe_to_drive(df, output_dir, filename):
    """Write ``df`` as a CSV file (index omitted) inside ``output_dir``.

    The directory is created on demand. Failures are reported through the
    return value instead of being raised, so callers decide how to surface
    them.

    Args:
        df: Object exposing ``to_csv`` (a pandas DataFrame in this app).
        output_dir: Directory that receives the file.
        filename: Name of the CSV file inside ``output_dir``.

    Returns:
        ``(True, full_path)`` on success, ``(False, error_message)`` otherwise.
    """
    try:
        # exist_ok=True removes the check-then-create race of a separate
        # os.path.exists() test when two saves target the same new folder.
        os.makedirs(output_dir, exist_ok=True)

        full_path = os.path.join(output_dir, filename)
        df.to_csv(full_path, index=False)
        return True, full_path
    except Exception as e:
        # Best-effort contract: hand the message back instead of crashing the UI.
        return False, str(e)

def initialize_spark_session(app_name="CloudCluster"):
    """Return the Spark session for this app, creating it when none exists.

    Args:
        app_name: Application name handed to the session builder.

    Returns:
        The result of ``getOrCreate()`` on a builder configured with the
        ``local[*]`` master and ``app_name``.
    """
    builder = SparkSession.builder
    builder = builder.master("local[*]")
    builder = builder.appName(app_name)
    return builder.getOrCreate()

# Session shared by the sidebar report export and the analysis views.
spark = initialize_spark_session()

with st.sidebar:
    st.header("System Information")
    st.info("Module: Cloud & Distributed Systems")

    st.markdown("""
    <div style="background-color: #e2e3e5; padding: 10px; border-radius: 5px; border: 1px solid #d6d8db;">
        <h4 style="color: #383d41; margin:0;">Developed by:</h4>
        <p style="color: #383d41; font-weight: bold; margin:0;">Alaa Yousef & Misk Ashour</p>
    </div>
    """, unsafe_allow_html=True)

    st.write("---")
    st.success("Storage Status: Online")

    # Read by the main page to switch between the tabbed UI and the report layout.
    report_view_enabled = st.checkbox("Enable Report View", value=False)

    st.write("---")

    if st.button("Save Full Report to Cloud"):
        report_source = st.session_state.get("current_dataframe")
        if report_source is None:
            st.error("No data available to persist.")
        else:
            current_ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
            report_dir = os.path.join(BASE_OUTPUT_PATH, f"Report_{current_ts}")

            # (label, csv file name, frame) triples waiting to be written.
            pending_exports = []
            saved_items = []
            failed_items = []

            try:
                stats_df = spark.createDataFrame(report_source).describe().toPandas()
                pending_exports.append(("Statistics", "1_statistics.csv", stats_df))
            except Exception as stats_error:
                # Keep exporting the other artefacts, but tell the user why this one is missing.
                failed_items.append(f"Statistics ({stats_error})")

            if "ml_results" in st.session_state:
                pending_exports.append(("ML Results", "2_ml_results.csv", st.session_state.ml_results))

            if "scalability_data" in st.session_state:
                pending_exports.append(("Scalability Metrics", "3_scalability_test.csv", st.session_state.scalability_data))

            for export_label, csv_name, export_frame in pending_exports:
                # save_dataframe_to_drive creates report_dir on demand and reports
                # failure through its return value, so only real writes are listed.
                export_ok, export_detail = save_dataframe_to_drive(export_frame, report_dir, csv_name)
                if export_ok:
                    saved_items.append(export_label)
                else:
                    failed_items.append(f"{export_label} ({export_detail})")

            if saved_items:
                st.success(f"Report Directory Created: Report_{current_ts}")
                st.info(f"Archived: {', '.join(saved_items)}")
            else:
                st.error("Report could not be saved: nothing was written.")

            if failed_items:
                st.warning(f"Not archived: {'; '.join(failed_items)}")

# Page heading: the interactive mode shows the service title, the report mode
# shows a centred report title followed by the developers' names.
if not report_view_enabled:
    st.title("Cloud-Based Distributed Data Processing Service")
else:
    st.markdown("<h1 style='text-align: center;'>Project Final Report</h1>", unsafe_allow_html=True)
    st.markdown("<h3 style='text-align: center;'>Developers: Alaa Yousef & Misk Ashour</h3>", unsafe_allow_html=True)

# Make sure the dataset slot exists in the session before anything reads it.
if "current_dataframe" not in st.session_state:
    st.session_state["current_dataframe"] = None

# Upload section (interactive mode only): parse the file, drop rows with
# missing values, keep the result in the session and back it up to Drive.
if not report_view_enabled:
    st.subheader("1. Dataset Ingestion")
    uploaded_file = st.file_uploader("Select Data Source", type=["csv", "json", "txt"])

    if uploaded_file:
        try:
            filename = uploaded_file.name
            # Match the extension case-insensitively so e.g. "DATA.CSV" is not
            # sent to the tab-separated fallback parser.
            lowered_name = filename.lower()
            if lowered_name.endswith('.csv'):
                raw_df = pd.read_csv(uploaded_file)
            elif lowered_name.endswith('.json'):
                json_data = json.load(uploaded_file)
                # A single JSON object becomes a one-row frame.
                raw_df = pd.DataFrame(json_data if isinstance(json_data, list) else [json_data])
            else:
                raw_df = pd.read_csv(uploaded_file, sep="\t")

            clean_df = raw_df.dropna()

            if clean_df.empty:
                # An empty frame cannot be analysed downstream; keep any previous dataset.
                st.error("Ingestion Failure: no complete rows remain after removing missing values.")
            else:
                st.session_state.current_dataframe = clean_df

                backup_ok, backup_detail = save_dataframe_to_drive(clean_df, BASE_OUTPUT_PATH, f"backup_{filename}")
                if not backup_ok:
                    st.warning(f"Drive backup failed: {backup_detail}")

                # Report the number of rows actually kept, not the raw row count.
                st.success(f"Ingestion Successful. Records: {len(clean_df)}.")
                dropped_rows = len(raw_df) - len(clean_df)
                if dropped_rows:
                    st.info(f"Rows removed because of missing values: {dropped_rows}.")

        except Exception as e:
            st.error(f"Ingestion Failure: {e}")

if st.session_state.current_dataframe is not None:
    pd_df = st.session_state.current_dataframe

    # Build the Spark frame used by all views. If the current session cannot
    # create it (for example because it was stopped), rebuild the session and
    # retry once; a second failure propagates.
    try:
        spark_df = spark.createDataFrame(pd_df)
    except Exception:
        # `except Exception` instead of a bare `except` so BaseException-derived
        # signals (KeyboardInterrupt, SystemExit, ...) are not intercepted.
        spark = initialize_spark_session()
        spark_df = spark.createDataFrame(pd_df)

    def view_statistics():
        """Show row and column counts plus the describe() summary of spark_df."""
        st.header("Statistical Analysis")

        rows_slot, features_slot = st.columns(2)
        rows_slot.metric("Total Observations", spark_df.count())
        features_slot.metric("Feature Count", len(spark_df.columns))

        summary = spark_df.describe()
        st.dataframe(summary.toPandas(), use_container_width=True)

    def view_machine_learning():
        """Let the user choose a target and predictors, then time three Spark ML fits.

        The selected column names are stored in ``st.session_state.target`` and
        ``st.session_state.features`` so the scalability view can reuse them.
        One row per algorithm (name, execution time, status) is stored in
        ``st.session_state.ml_results`` and rendered as a table.
        """
        # Function-scope import keeps this view self-contained.
        from pyspark.sql.types import NumericType

        st.header("Machine Learning Execution")
        # isinstance() covers every Spark numeric type (tinyint, smallint,
        # decimal, ...) instead of a hand-maintained list of type names.
        numeric_fields = [f.name for f in spark_df.schema.fields if isinstance(f.dataType, NumericType)]

        if not numeric_fields:
            # Nothing can be modelled; clear any selection left from an earlier dataset.
            st.session_state.target = None
            st.session_state.features = []
            st.warning("No numeric columns are available for modelling.")
            return

        c1, c2 = st.columns([1, 2])
        target_col = c1.selectbox("Target Variable (Y)", numeric_fields, key="target_sel")
        feature_cols = c2.multiselect("Predictors (X)", [n for n in numeric_fields if n != target_col], key="feat_sel")

        st.session_state.target = target_col
        st.session_state.features = feature_cols

        if st.button("Execute Models"):
            if not feature_cols:
                st.error("Please select at least one feature.")
            else:
                assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
                final_data = assembler.transform(spark_df).select("features", target_col).withColumnRenamed(target_col, "label")

                models = {
                    "Linear Regression": LinearRegression(featuresCol="features", labelCol="label"),
                    "Decision Tree": DecisionTreeRegressor(featuresCol="features", labelCol="label"),
                    "K-Means Clustering": KMeans(featuresCol="features", k=3)
                }

                logs = []
                p_bar = st.progress(0)

                for i, (name, model) in enumerate(models.items()):
                    start_t = time.time()
                    try:
                        model.fit(final_data)
                    except Exception as fit_error:
                        logs.append({"Algorithm": name, "Execution Time (s)": 0, "Status": "Failed"})
                        # Show only the first line: Spark errors can carry long stack traces.
                        reason_lines = str(fit_error).strip().splitlines() or [type(fit_error).__name__]
                        st.warning(f"{name} failed: {reason_lines[0]}")
                    else:
                        exec_time = time.time() - start_t
                        logs.append({"Algorithm": name, "Execution Time (s)": round(exec_time, 4), "Status": "Success"})

                    p_bar.progress((i + 1) / len(models))

                st.session_state.ml_results = pd.DataFrame(logs)

        if "ml_results" in st.session_state:
            st.table(st.session_state.ml_results)

    def view_scalability():
        """Time a LinearRegression fit on local[1], local[2], local[4] and local[8] sessions.

        Uses the target/predictor selection stored in ``st.session_state`` by
        the ML view. The shared ``spark`` session is stopped for the duration
        of the run and always re-created afterwards. Results (time, speedup
        relative to the first run, efficiency = speedup / nodes) are stored in
        ``st.session_state.scalability_data`` and plotted.
        """
        global spark

        st.header("Scalability & Performance Testing")

        feats = st.session_state.get('features', [])
        target = st.session_state.get('target', None)

        if st.button("Run Cluster Simulation (1, 2, 4, 8 Nodes)"):
            if not feats or target is None:
                st.error("Configuration missing in ML tab.")
            else:
                st.info("Starting simulation...")
                results = []
                nodes = [1, 2, 4, 8]
                baseline = 0

                s_bar = st.progress(0)

                # Stop the shared session so each run can start one with its own master setting.
                spark.stop()

                try:
                    for i, n in enumerate(nodes):
                        tmp_spark = SparkSession.builder.master(f"local[{n}]").appName(f"Sim_{n}").getOrCreate()
                        try:
                            tmp_df = tmp_spark.createDataFrame(pd_df)

                            vec = VectorAssembler(inputCols=feats, outputCol="features")
                            train = vec.transform(tmp_df).select("features", target).withColumnRenamed(target, "label")

                            t0 = time.time()
                            LinearRegression(featuresCol="features", labelCol="label").fit(train)
                            t_diff = time.time() - t0
                        finally:
                            # Never leave a temporary session running, even when the fit fails.
                            tmp_spark.stop()

                        # The first configuration is the reference for the speedup.
                        if i == 0:
                            baseline = t_diff

                        speedup = baseline / t_diff if t_diff > 0 else 0
                        eff = speedup / n if n > 0 else 0

                        results.append({
                            "Cluster Nodes": n,
                            "Processing Time (s)": t_diff,
                            "Speedup Factor": speedup,
                            "Efficiency": eff
                        })

                        s_bar.progress((i + 1) / len(nodes))
                except Exception as sim_error:
                    st.error(f"Simulation failed: {sim_error}")
                else:
                    st.session_state.scalability_data = pd.DataFrame(results)
                    st.success("Simulation Complete.")
                finally:
                    # Always restore the shared session for the rest of the app.
                    spark = initialize_spark_session()

        if 'scalability_data' in st.session_state:
            res_df = st.session_state.scalability_data
            st.dataframe(res_df)

            g1, g2 = st.columns(2)
            with g1:
                st.subheader("Speedup Analysis")
                st.line_chart(res_df.set_index("Cluster Nodes")["Speedup Factor"])
            with g2:
                st.subheader("Efficiency Analysis")
                st.line_chart(res_df.set_index("Cluster Nodes")["Efficiency"])

            st.subheader("Execution Latency")
            st.bar_chart(res_df.set_index("Cluster Nodes")["Processing Time (s)"])

    if report_view_enabled:
        view_statistics()
        st.markdown("---")
        view_machine_learning()
        st.markdown("---")
        view_scalability()
    else:
        t1, t2, t3 = st.tabs(["Data Statistics", "ML Models", "Scalability Test"])
        with t1: view_statistics()
        with t2: view_machine_learning()
        with t3: view_scalability()
else:
    st.info("Awaiting dataset upload to initialize pipeline.")
'''

# Write the embedded Streamlit source to app.py in the working directory.
with open("app.py", "w") as f:
    f.write(streamlit_app_code)

# Kill any earlier Streamlit process, wait two seconds, then start the app
# in the background on port 8501.
shell = get_ipython()
shell.system_raw('pkill -9 streamlit')
time.sleep(2)
shell.system_raw('nohup streamlit run app.py --server.port 8501 &')

# Reset ngrok and open a tunnel to the app port; errors are printed, not raised.
try:
    ngrok.kill()
    tunnel = ngrok.connect(8501)
    url = tunnel.public_url
    print(f"Service Available at: {url}")
except Exception as tunnel_error:
    print(f"Tunneling Error: {tunnel_error}")

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.0/9.0 MB 68.2 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.9/6.9 MB 94.3 MB/s eta 0:00:00
Mounted at /content/drive
Drive mounted successfully at /content/drive
Ngrok authenticated.
Service Available at: https://miss-centered-rustlingly.ngrok-free.dev
