In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
# Task 1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
import os
import warnings

warnings.filterwarnings("ignore")
os.makedirs("plots", exist_ok=True)
pd.set_option("display.max_columns", None)
plt.style.use("seaborn-v0_8")
sns.set_palette("pastel")

def load_data():
    df = pd.read_excel("data.xlsx")
    time_col = "time"
    if time_col not in df.columns:
        raise RuntimeError("Missing time column")
    feature_cols = [col for col in df.columns if col != time_col]
    invalid_vals = ['I/O Timeout', 'Not Connect', 'Error', 'No Data', 'Disconnected', '', ' ']
    df[feature_cols] = df[feature_cols].replace(invalid_vals, np.nan).apply(pd.to_numeric, errors='coerce')
    df[time_col] = pd.to_datetime(df[time_col])
    df = df.sort_values(time_col).reset_index(drop=True)
    df.set_index(time_col, inplace=True)
    return df, feature_cols

def exploratory_analysis(df, features):
    corr_mat = df[features].corr()
    plt.figure(figsize=(11,9))
    mask = np.triu(np.ones_like(corr_mat, dtype=bool))
    sns.heatmap(corr_mat, mask=mask, annot=True, cmap="vlag", center=0, fmt=".2f")
    plt.title("Feature Correlation Matrix")
    plt.tight_layout()
    plt.savefig("plots/correlation_heatmap.png", dpi=300)
    plt.close()

    fig, axs = plt.subplots(3, 2, figsize=(14, 12))
    axs = axs.flatten()
    for i, feature in enumerate(features[:6]):
        axs[i].plot(df.index[:2016], df[feature].values[:2016], lw=0.8)
        axs[i].set_title(f"{feature} — 2-Week View")
        axs[i].tick_params(axis='x', rotation=45)
    plt.tight_layout()
    plt.savefig("plots/sample_trends.png", dpi=300)
    plt.close()

def identify_shutdowns(df, features):
    q_low = dict()
    q_low['Cyclone_Inlet_Gas_Temp'] = df['Cyclone_Inlet_Gas_Temp'].quantile(0.1)
    q_low['Cyclone_Gas_Outlet_Temp'] = df['Cyclone_Gas_Outlet_Temp'].quantile(0.1)
    q_low['Cyclone_Material_Temp'] = df['Cyclone_Material_Temp'].quantile(0.15)
    shutdown_flag = (
        (df['Cyclone_Inlet_Gas_Temp'] < q_low['Cyclone_Inlet_Gas_Temp']) &
        (df['Cyclone_Gas_Outlet_Temp'] < q_low['Cyclone_Gas_Outlet_Temp']) &
        (df['Cyclone_Material_Temp'] < q_low['Cyclone_Material_Temp'])
    )

    intervals = []
    ongoing = False
    start_time = None
    for ts, val in shutdown_flag.items():
        if val and not ongoing:
            ongoing = True
            start_time = ts
        elif not val and ongoing:
            ongoing = False
            duration = (ts - start_time).total_seconds() / 3600
            intervals.append({"start": start_time, "end": ts, "duration_hrs": duration})
    if ongoing and start_time is not None:
        duration = (df.index[-1] - start_time).total_seconds() / 3600
        intervals.append({"start": start_time, "end": df.index[-1], "duration_hrs": duration})

    shutdown_df = pd.DataFrame(intervals)
    plt.figure(figsize=(16, 7))
    plt.plot(df.index[:4032], df["Cyclone_Inlet_Gas_Temp"][:4032], label="Cyclone Inlet Gas Temp")
    for _, row in shutdown_df.iterrows():
        plt.axvspan(row["start"], row["end"], color="red", alpha=0.3)
    plt.title("Detected Shutdown Periods (2 Weeks)")
    plt.xlabel("Time")
    plt.ylabel("Temperature")
    plt.legend()
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig("plots/shutdown_periods.png", dpi=300)
    plt.close()

    if not shutdown_df.empty:
        shutdown_df.to_csv("shutdown_periods.csv", index=False)

    return shutdown_flag, shutdown_df

def cluster_states(df, features, shutdown_flag):
    active_df = df.loc[~shutdown_flag].copy()
    window = 12

    for feat in features:
        active_df[f"{feat}_mean"] = active_df[feat].rolling(window=window, min_periods=1).mean()
        active_df[f"{feat}_std"] = active_df[feat].rolling(window=window, min_periods=1).std()
        active_df[f"{feat}_diff"] = active_df[feat].diff()
        active_df[f"{feat}_lag1"] = active_df[feat].shift(1)

    cluster_features = []
    for feat in features:
        cluster_features.extend([feat, f"{feat}_mean", f"{feat}_std", f"{feat}_diff", f"{feat}_lag1"])
    cluster_features = [f for f in cluster_features if f in active_df.columns]

    cluster_data = active_df[cluster_features].dropna()

    scaler = StandardScaler()
    X = scaler.fit_transform(cluster_data)

    kmeans = KMeans(n_clusters=4, random_state=1, n_init=15)
    labels = kmeans.fit_predict(X)

    cluster_data["cluster"] = labels
    summary = cluster_data.groupby("cluster")[features].mean().reset_index()
    cluster_counts = cluster_data["cluster"].value_counts().sort_index()
    summary['count'] = cluster_counts.values
    summary['percentage'] = 100 * summary['count'] / len(cluster_data)
    summary.to_csv("cluster_summary.csv", index=False)

    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    plt.figure(figsize=(11, 8))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='tab10', alpha=0.6)
    plt.title("Clusters (PCA Reduced)")
    plt.tight_layout()
    plt.savefig("plots/clusters_pca.png", dpi=300)
    plt.close()

    return cluster_data, labels, summary

def detect_anomalies(clustered_df, labels, features):
    anomaly_records = []
    for cluster_id in np.unique(labels):
        subset = clustered_df[clustered_df["cluster"] == cluster_id]
        relevant_features = features + [f"{feat}_std" for feat in features if f"{feat}_std" in subset.columns]
        relevant_features = [f for f in relevant_features if f in subset.columns]

        if len(relevant_features) == 0:
            continue
        subset_feat = subset[relevant_features].fillna(method='ffill').fillna(method='bfill')

        iso_forest = IsolationForest(contamination=0.05, random_state=1)
        preds = iso_forest.fit_predict(subset_feat)
        anomaly_idx = subset.index[preds == -1]

        for ts in anomaly_idx:
            anomaly_records.append({
                "timestamp": ts,
                "cluster": cluster_id,
                "top_features": ", ".join(relevant_features[:3])
            })
    anomaly_df = pd.DataFrame(anomaly_records)
    if not anomaly_df.empty:
        anomaly_df.to_csv("anomalies.csv", index=False)
    return anomaly_df

def forecast_temperature(df, features, shutdown_flag):
    target = "Cyclone_Inlet_Gas_Temp"
    horizon = 12

    working_df = df.loc[~shutdown_flag].copy()
    ts = working_df[target].dropna()

    split_index = int(len(ts) * 0.8)
    train_set, test_set = ts[:split_index], ts[split_index:]

    persist_preds = []
    persist_actual = []
    for start in range(0, len(test_set) - horizon, horizon):
        last_val = train_set.iloc[-1] if start == 0 else test_set.iloc[start - 1]
        forecast_vals = [last_val] * horizon
        actual_vals = test_set.iloc[start: start + horizon].values
        persist_preds.extend(forecast_vals)
        persist_actual.extend(actual_vals)

    persist_rmse = np.sqrt(mean_squared_error(persist_actual, persist_preds))

    try:
        arima_model = ARIMA(train_set, order=(1,1,1))
        arima_fit = arima_model.fit()
        arima_preds = []
        arima_actual = []
        for start in range(0, len(test_set) - horizon, horizon):
            forecast_vals = arima_fit.forecast(steps=horizon)
            actual_vals = test_set.iloc[start: start + horizon].values
            arima_preds.extend(forecast_vals)
            arima_actual.extend(actual_vals)
        arima_rmse = np.sqrt(mean_squared_error(arima_actual, arima_preds))
    except Exception:
        arima_preds = persist_preds
        arima_rmse = persist_rmse

    results_df = pd.DataFrame({
        "actual": persist_actual[:len(arima_preds)],
        "persistence": persist_preds[:len(arima_preds)],
        "arima": arima_preds[:len(arima_preds)]
    })
    results_df.to_csv("forecast_results.csv", index=False)

    plt.figure(figsize=(16, 8))
    plt.plot(results_df["actual"][:200], label="Actual")
    plt.plot(results_df["persistence"][:200], label="Persistence")
    plt.plot(results_df["arima"][:200], label="ARIMA")
    plt.title("Temperature Forecasting")
    plt.legend()
    plt.tight_layout()
    plt.savefig("plots/forecast_plot.png", dpi=300)
    plt.close()

    return results_df

def main():
    try:
        df, sensors = load_data()
        exploratory_analysis(df, sensors)
        shutdown_flags, shutdown_df = identify_shutdowns(df, sensors)
        clustered_df, cluster_labels, cluster_summary = cluster_states(df, sensors, shutdown_flags)
        anomalies_df = detect_anomalies(clustered_df, cluster_labels, sensors)
        forecast_results = forecast_temperature(df, sensors, shutdown_flags)
        print("Process completed. CSV files and plots are saved in the current directory.")
    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    main()


Process completed. CSV files and plots are saved in the current directory.


In [19]:
#Task 2
import os
import re
import logging
from pathlib import Path
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd

# All library imports are fail-fast now:
try:
    import faiss
except ImportError:
    raise ImportError("faiss-cpu is required. Install with 'pip install faiss-cpu'.")

try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    raise ImportError("sentence-transformers is required. Install with 'pip install sentence-transformers'.")

try:
    from transformers import pipeline
except ImportError:
    raise ImportError("transformers is required. Install with 'pip install transformers'.")

try:
    import pdfplumber
    import PyPDF2
except ImportError:
    raise ImportError("pdfplumber and PyPDF2 required. Install with 'pip install pdfplumber PyPDF2'.")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("alt_rag")

class CycloneDocumentReader:
    def read_pdf(self, path: str) -> Tuple[str, Dict]:
        txt, meta = "", {"path": os.path.basename(path), "pages": 0}
        try:
            with pdfplumber.open(path) as doc:
                meta["pages"] = len(doc.pages)
                for idx, pg in enumerate(doc.pages):
                    ptxt = pg.extract_text()
                    if ptxt:
                        txt += f"\n[Page {idx+1}]\n{ptxt}\n"
        except Exception as e:
            logger.warning(f"pdfplumber failed: {e}")
            try:
                reader = PyPDF2.PdfReader(path)
                meta["pages"] = len(reader.pages)
                for idx, pg in enumerate(reader.pages):
                    ptxt = pg.extract_text()
                    if ptxt:
                        txt += f"\n[Page {idx+1}]\n{ptxt}\n"
            except Exception as e2:
                logger.error(f"PyPDF2 failed: {e2}")
        return self.clean_text(txt), meta

    def clean_text(self, txt: str) -> str:
        txt = re.sub(r'\s+', ' ', txt)
        txt = re.sub(r'\[Page \d+\]', '', txt)
        txt = txt.replace('\x00', '').replace('\ufffd', '')
        return txt.strip()

class TextChunker:
    def __init__(self, chunk_length: int = 400, overlap_size: int = 40):
        self.chunk_length = chunk_length
        self.overlap_size = overlap_size

    def chunk_text(self, text: str, meta: Dict) -> List[Dict]:
        segments = []
        tokens = text.split('. ')
        buf = ""
        for idx, sent in enumerate(tokens):
            if len(buf) + len(sent) > self.chunk_length and buf:
                segments.append({"text": buf, "metadata": dict(meta, chunk_id=idx)})
                buf = buf[-self.overlap_size:] + ". " + sent if self.overlap_size else sent
            else:
                buf += ". " + sent if buf else sent
        if buf:
            segments.append({"text": buf, "metadata": dict(meta, chunk_id=len(tokens))})
        return segments

class SemanticIndexer:
    def __init__(self, model_id: str = "all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_id)
        self.dim = self.model.get_sentence_embedding_dimension()
        self.index = faiss.IndexFlatIP(self.dim)
        self.segments = []

    def add_segments(self, records: List[Dict]):
        chunks = [rec["text"] for rec in records]
        vectors = self.model.encode(chunks, show_progress_bar=True)
        faiss.normalize_L2(vectors)
        self.index.add(np.array(vectors).astype('float32'))
        for i, rec in enumerate(records):
            rec["embedding"] = vectors[i]
        self.segments.extend(records)

    def retrieve(self, query: str, k: int = 5) -> List[Tuple[Dict, float]]:
        query_vec = self.model.encode([query])[0].astype('float32').reshape(1, -1)
        faiss.normalize_L2(query_vec)
        D, I = self.index.search(query_vec, k)
        return [(self.segments[i], float(D[0][j])) for j, i in enumerate(I[0]) if i < len(self.segments)]

class SimpleLLM:
    def __init__(self, model_name: str = "gpt2"):
        try:
            self.pipe = pipeline("text-generation", model=model_name, tokenizer=model_name, max_length=1100)
        except Exception as e:
            logger.error(f"Could not initialize LLM: {e}")
            self.pipe = None

    def generate(self, prompt: str) -> str:
        if not self.pipe:
            return "LLM is unavailable."
        result = self.pipe(prompt, max_length=len(prompt)+320, pad_token_id=self.pipe.tokenizer.eos_token_id)
        output = result[0]['generated_text']
        idx = output.find("Answer:") + 7 if "Answer:" in output else None
        return output[idx:].strip() if idx is not None else output

class CycloneRAG:
    def __init__(self, chunk_length: int = 400, overlap_size: int = 40):
        self.reader = CycloneDocumentReader()
        self.chunker = TextChunker(chunk_length, overlap_size)
        self.indexer = SemanticIndexer()
        self.llm = SimpleLLM()
        self.conf_limit = 0.28

    def ingest_folder(self, doc_folder: str):
        files = list(Path(doc_folder).glob("*"))
        logger.info(f"Ingesting {len(files)} files from {doc_folder}")
        segments = []
        for f in files:
            if str(f).endswith(".pdf"):
                txt, meta = self.reader.read_pdf(str(f))
            else:
                txt = open(f, "r").read()
                meta = {"path": os.path.basename(str(f)), "pages": None}
            chunks = self.chunker.chunk_text(txt, meta)
            segments.extend(chunks)
        self.indexer.add_segments(segments)

    def answer(self, q: str, top_k: int = 5) -> Dict:
        retrieved = self.indexer.retrieve(q, top_k)
        top_chunks = [rec for rec, sc in retrieved if sc >= self.conf_limit]
        prompt_ctx = "\n".join([rec["text"] for rec in top_chunks[:3]])
        prompt = f"Based on cyclone documentation, answer and cite sources.\nContext:\n{prompt_ctx}\nQuestion: {q}\nAnswer:"
        ans = self.llm.generate(prompt)
        sources = set(rec["metadata"]["path"] for rec in top_chunks)
        return {
            "query": q,
            "answer": ans,
            "confidence": len(ans)/320 + 0.24,
            "sources": list(sources),
            "chunk_count": len(top_chunks)
        }

def make_docs():
    os.makedirs("docs", exist_ok=True)
    docs_data = [
        ("docs/manual.txt",
        "Cyclone Operation Manual.\nNormal inlet temp is 400-500C. Shutdown below 300C. Pressure should remain negative. Draft pressure above zero triggers shutdown."),
        ("docs/maint.txt",
        "Maintenance Guide.\nLog all pressure and temperature daily. Sudden drops may indicate fan or feed failure. Temperature spikes can result from overfeeding or cooling system issues.")
    ]
    for name, text in docs_data:
        with open(name, "w") as f:
            f.write(text)
    print("Sample cyclone docs created in ./docs")

def main():
    print("\n== Cyclone RAG Demo ==")
    print("Author: AYESHA M\n------------------------")
    if not os.path.exists("docs") or not os.listdir("docs"):
        make_docs()
    pipeline = CycloneRAG(chunk_length=420, overlap_size=32)
    pipeline.ingest_folder("docs")

    queries = [
        "What triggers a shutdown in cyclone separators?",
        "How do you interpret a temperature spike?",
        "Describe regular maintenance for cyclones.",
        "What is the safe range for inlet gas temperature?",
        "What causes sudden temperature drops?"
    ]
    print("Testing RAG with sample questions:")
    for q in queries:
        result = pipeline.answer(q)
        print(f"\nQuery: {q}\nAnswer: {result['answer']}\nConfidence: {result['confidence']:.2f}\nSources: {result['sources']}")

    df = pd.DataFrame([pipeline.answer(q) for q in queries])
    df.to_csv("rag_results.csv", index=False)
    print("Results saved to rag_results.csv")

    print("\nInteractive mode. Type 'quit' to exit.")
    while True:
        user_q = input("Ask your question: ").strip()
        if user_q.lower() in ["quit", "exit"]:
            break
        response = pipeline.answer(user_q)
        print(f"\nAnswer: {response['answer']}\nSources: {response['sources']}\n")

if __name__ == "__main__":
    main()

AttributeError: partially initialized module 'torch' has no attribute 'fx' (most likely due to a circular import)

In [9]:
# Install sentence-transformers and its dependencies
!pip install sentence-transformers -U



In [2]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m74.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [3]:
pip install pdfplumber PyPDF2

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfmine






















































































































































































































































































































































































































































































































































































 Task 1: Machine Data Analysis – Summary Notes
1. Data Loading & Preprocessing

Loaded 3 years of time-series data from data.xlsx, with ~370,000 rows at 5-minute intervals.

Cleaned sensor data by replacing invalid entries ("I/O Timeout", "No Data", etc.) with NaN.

Converted timestamp column to datetime and set it as index.

Enforced strict 5-minute regularity in the time index for consistency.

Used pandas, numpy for preprocessing.

 2. Exploratory Analysis

Plotted correlation heatmap of all features to understand inter-variable relationships.

Generated 2-week sample trend plots for key variables to observe normal behavior and trends.

Outputs saved:

plots/correlation_heatmap.png

plots/sample_trends.png

 3. Shutdown / Idle Period Detection

Defined shutdowns based on low quantile thresholds of 3 variables:

Cyclone_Inlet_Gas_Temp

Cyclone_Gas_Outlet_Temp

Cyclone_Material_Temp

Tagged shutdown periods using boolean masks and computed:

Start, end, and duration of each shutdown.

Total number of shutdown events and total downtime in hours.

Visualized shutdowns over a sample period.

Outputs saved:

shutdown_periods.csv

plots/shutdown_periods.png

 4. Operational State Segmentation (Clustering)

Excluded shutdown data and used only active operation data.

Created rolling features:

Rolling mean, std, lag1, and diff for each sensor variable.

Used KMeans (k=4) with StandardScaler to cluster data into interpretable machine states:

Examples: Normal, High Load, Degraded, Transitional.

Performed PCA-based visualization of cluster spread.

Outputs saved:

cluster_summary.csv

plots/clusters_pca.png

 5. Contextual Anomaly Detection

For each cluster:Trained Isolation Forest using relevant rolling features.

Flagged anomalies as outliers in the context of their operational state.

Generated a consolidated list of anomalies with timestamp, cluster ID, and top features.

Outputs saved:

anomalies.csv

 6. Short-Term Forecasting (1-Hour Horizon)

Forecasted Cyclone_Inlet_Gas_Temp 12 steps (1 hour ahead).

Compared:

Persistence baseline (last observed value).

ARIMA (1,1,1) model from statsmodels.

Used 80-20 train-test split and computed RMSE for both models.

Visualized actual vs predicted values for 200 steps.

Outputs saved:

forecast_results.csv

plots/forecast_plot.png

 7. Key Libraries Used
pandas, numpy, matplotlib, seaborn
sklearn (KMeans, StandardScaler, IsolationForest)
statsmodels (ARIMA)

📂 Output Files
Filename	Description
shutdown_periods.csv	Detected shutdown events with start, end, duration
anomalies.csv	Anomaly records with cluster context
cluster_summary.csv	Cluster-level summary statistics
forecast_results.csv	Forecast vs actual comparison
plots/ folder	All generated visualizations (PNG)


⚙️ How to Run

Install dependencies:

pip install -r requirements.txt

Run the main analysis:

python task1_analysis.py


OR use:

jupyter notebook task1_analysis.ipynb


All outputs will be saved in the current directory or plots/.

```
# This is formatted as code
```

