<a href="https://colab.research.google.com/github/ced-sys/AI-N-ML/blob/main/GeoThermoAI(Draft).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install graphviz pydotplus



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import graphviz
from IPython.display import display, Image
import pydotplus
import io

In [3]:
# Seed NumPy's global random generator so later np.random draws are repeatable across runs.
np.random.seed(42)

In [None]:
def generate_geothermal_data(n_samples: int = 500):
  """
  Generate a synthetic, rule-labelled dataset for geothermal site classification.

  Ten features are drawn independently from NumPy's global random generator
  (seed it with ``np.random.seed`` beforehand for reproducible output). Each
  sample is then labelled Low (0), Moderate (1) or High (2) by a fixed set of
  threshold rules on those features; High takes precedence over Moderate, and
  Low is the default when no rule matches.

  Parameters:
    n_samples (int): Number of samples to generate.

  Returns:
    X (DataFrame): ``n_samples`` rows with the columns heat_flow, rock_type,
      fault_distance, alteration, resistivity, temp_anomaly, hot_springs,
      seismic_activity, gravity_anomaly and geochemical.
    y (ndarray): Integer labels (0, 1 or 2), row-aligned with X.
    y_names (Series): The same labels as "Low" / "Moderate" / "High" strings,
      row-aligned with X.
  """
  data = pd.DataFrame()

  # Heat flow (mW/m**2)
  data['heat_flow'] = np.random.gamma(shape=3, scale=20, size=n_samples) + 20

  # Rock type (0: Sedimentary, 1: Plutonic, 2: Volcanic)
  data['rock_type'] = np.random.choice([0, 1, 2], size=n_samples, p=[0.5, 0.3, 0.2])

  # Fault proximity (km) - distance to nearest fault
  data['fault_distance'] = np.random.exponential(scale=10, size=n_samples)

  # Hydrothermal alteration (0: None, 1: Weak, 2: Moderate, 3: Strong)
  # The probabilities must sum to 1; the earlier [0.6, 0.3, 0.15, 0.05] summed
  # to 1.10, which makes np.random.choice raise ValueError.
  data['alteration'] = np.random.choice(
      [0, 1, 2, 3], size=n_samples, p=[0.5, 0.3, 0.15, 0.05]
  )

  # Resistivity (ohm-m) - low values can indicate hydrothermal fluids.
  # Bimodal mix: ~70% resistive samples and ~30% conductive samples.
  n_resistive = int(n_samples * 0.7)
  resistivity = np.concatenate([
      np.random.lognormal(mean=5, sigma=1, size=n_resistive),
      np.random.lognormal(mean=2, sigma=0.5, size=n_samples - n_resistive),
  ])
  # Shuffle the plain ndarray *before* storing it: shuffling the DataFrame
  # column in place is not reliable (np.random.shuffle targets arrays and
  # mutable sequences, not pandas Series).
  np.random.shuffle(resistivity)
  data['resistivity'] = resistivity

  # Surface temperature anomaly
  data['temp_anomaly'] = np.random.exponential(scale=2, size=n_samples)

  # Presence of hot springs or fumaroles (0: None, 1: Present)
  data['hot_springs'] = np.random.choice([0, 1], size=n_samples, p=[0.85, 0.15])

  # Seismic activity (events per year in the vicinity)
  data['seismic_activity'] = np.random.poisson(lam=2, size=n_samples)

  # Gravity anomaly (mGal)
  data['gravity_anomaly'] = np.random.normal(loc=0, scale=5, size=n_samples)

  # Geochemical signatures (0: None, 1: Weak, 2: Strong)
  data['geochemical'] = np.random.choice([0, 1, 2], size=n_samples, p=[0.7, 0.2, 0.1])

  # Short aliases keep the rule tables below readable.
  heat_flow = data['heat_flow']
  fault_distance = data['fault_distance']
  hot_springs = data['hot_springs']
  alteration = data['alteration']

  # Create target variable based on the geological rules; Low (0) is the default.
  y = np.zeros(n_samples, dtype=int)

  # High potential (2) criteria
  high_potential = (
      ((heat_flow > 80) & (fault_distance < 5) & (data['resistivity'] < 20)) |
      ((heat_flow > 80) & (fault_distance < 5) & (data['seismic_activity'] > 5)) |
      # Was `hot_springs > 80`, which can never hold for a 0/1 flag; the
      # sibling rules show the intended threshold is on heat flow.
      ((heat_flow > 80) & (data['rock_type'] == 2) & (alteration >= 2)) |
      ((hot_springs == 1) & (alteration >= 2) & (data['temp_anomaly'] > 3)) |
      ((hot_springs == 1) & (data['geochemical'] == 2) & (data['resistivity'] < 30))
  ).to_numpy()
  y[high_potential] = 2

  # Moderate potential (1) criteria
  moderate_potential = (
      ((heat_flow > 60) & (heat_flow <= 80) & (fault_distance < 10)) |
      ((heat_flow > 80) & (fault_distance >= 5) & (fault_distance < 10)) |
      ((hot_springs == 1) & (alteration == 1)) |
      ((heat_flow > 60) & (data['rock_type'] == 1) & (fault_distance < 15)) |
      ((heat_flow <= 60) & (hot_springs == 1) & (alteration < 2)) |
      ((heat_flow > 60) & (heat_flow <= 80) & (data['gravity_anomaly'].abs() > 5))
  ).to_numpy()

  # Only set to moderate if not already set to high
  y[moderate_potential & (y != 2)] = 1

  # Human-readable target names for clarity
  y_names = pd.Series(y).map({0: "Low", 1: "Moderate", 2: "High"})

  # Shuffle rows, applying the same permutation to features and both label forms
  shuffled_indices = np.random.permutation(n_samples)
  return (
      data.iloc[shuffled_indices].reset_index(drop=True),
      y[shuffled_indices],
      y_names.iloc[shuffled_indices].reset_index(drop=True),
  )


