In [None]:
import pandas as pd
import numpy as np

# Environmental lapse rate in degrees Celsius per kilometre of elevation gain.
# The value has to be expressed per kilometre (not per metre) because
# fill_missing_with_elevation divides the elevation difference by 1000 before
# multiplying by this rate; the previous -0.0065 was a per-metre figure and made
# the elevation correction 1000 times too small.
LAPSE_RATE = -6.5  # typical lapse rate in °C/km (negative: cooler with height)

# Expects a DataFrame `df` with columns 'date', 'station', 'elevation', 'Tmin', 'Tmax'.
# NOTE(review): `df` is not created in this cell, so it must already exist in
# the kernel when this runs — confirm which cell is meant to define it.
# Parse the 'date' column with pandas and overwrite it in place.
df['date'] = pd.to_datetime(df['date'])

def fill_missing_with_elevation(df, lapse_rate):
    """Fill missing 'Tmin'/'Tmax' values from same-date stations, adjusted for elevation.

    For each date, and separately for 'Tmin' and 'Tmax', every missing value is
    estimated as::

        mean(known temperatures that date)
            + lapse_rate * (row elevation - mean elevation of known rows) / 1000.0

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'date', 'elevation', 'Tmin' and 'Tmax'.
        The frame passed in is left unmodified.
    lapse_rate : float
        Temperature change per 1000 units of elevation (i.e. °C per km when
        'elevation' is in metres); negative when temperature falls with height.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, rows ordered group by group (dates
        ascending, original order within a date) and a fresh 0..n-1 index.
        Values stay NaN for a date/column that has no known value to estimate
        from. Rows whose 'date' is missing are skipped by ``groupby`` and are
        therefore absent from the result.
    """
    filled_groups = []

    for _, day in df.groupby('date'):
        # groupby hands out a separate frame per date; copy it explicitly so the
        # estimates are written into the data that is actually returned (the
        # previous version wrote them into the caller's frame and returned the
        # untouched groups, i.e. the NaNs were still present in the result).
        day = day.copy()

        for column in ('Tmin', 'Tmax'):
            missing = day[column].isna()
            # Nothing to fill, or nothing to estimate from on this date.
            if not missing.any() or missing.all():
                continue

            known = ~missing
            mean_temp = day.loc[known, column].mean()
            mean_elevation = day.loc[known, 'elevation'].mean()

            # Lapse-rate adjustment relative to the mean elevation of the
            # stations that reported a value on this date.
            elevation_offset = day.loc[missing, 'elevation'] - mean_elevation
            day.loc[missing, column] = mean_temp + lapse_rate * elevation_offset / 1000.0

        filled_groups.append(day)

    if not filled_groups:
        # pd.concat raises on an empty list; return an empty frame that keeps
        # the input's columns instead.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(filled_groups, ignore_index=True)

# Estimate the missing Tmin/Tmax values per date from the other stations on the
# same date, using the lapse-rate/elevation adjustment in fill_missing_with_elevation.
df_filled = fill_missing_with_elevation(df, LAPSE_RATE)

# Preview the first rows of the returned frame.
print(df_filled.head())


In [5]:
import pandas as pd
import openpyxl 
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from pykrige.ok import OrdinaryKriging
from shapely.geometry import Point
from geopandas.tools import sjoin

In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [13]:
def calculate_temp(row, temp_column, df, lapse_rate=-6.5 / 1000):
    """Return the row's temperature, estimating it when it is missing.

    A missing value is estimated from the rows of ``df`` that share the row's
    date and have a value in ``temp_column``::

        mean(known temperatures) + lapse_rate * (row elevation - mean known elevation)

    Parameters
    ----------
    row : pandas.Series or mapping
        Must provide 'date', 'elevation' and ``temp_column``.
    temp_column : str
        Name of the temperature column to read/estimate (e.g. 'Tmin').
    df : pandas.DataFrame
        Frame with 'date', 'elevation' and ``temp_column`` columns that supplies
        the same-date reference stations.
    lapse_rate : float, optional
        Temperature change per unit of elevation. Defaults to -6.5 / 1000, i.e.
        -6.5 °C per km expressed per metre.

    Returns
    -------
    The original value when it is present; the elevation-adjusted estimate when
    it is missing and same-date data exists; otherwise the original (missing)
    value.
    """
    current = row[temp_column]
    if not pd.isna(current):
        return current

    # Stations reporting a value for this column on the same date.
    same_date_known = df[(df['date'] == row['date']) & df[temp_column].notna()]
    if same_date_known.empty:
        return current  # nothing to estimate from; stays missing

    reference_temp = same_date_known[temp_column].mean()
    reference_elevation = same_date_known['elevation'].mean()

    # Offset is (target - reference): with a negative lapse rate a station above
    # the reference elevation must come out colder. The earlier version used
    # (reference - target), which warmed higher stations instead.
    return reference_temp + lapse_rate * (row['elevation'] - reference_elevation)

In [32]:
# Load the station records from CSV into `play_1df`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# fails anywhere else — consider a path relative to a configurable data directory.
play_1df = pd.read_csv(r'G:\fresh_start\paper\code_paper\main_data\raw_data\df33333333_Nan.csv')

In [35]:

# Group the records by 'date' and print only the first group, as a quick look
# at what one date's set of station rows contains.
grouped = play_1df.groupby('date')
for  date, group in grouped:
    print(date, group)
    break  # stop after the first group; the remaining dates are not printed


1962-01-01               date  station          regions        lat       long  elevation  \
0       1962-01-01     1316            Tarai  26.820440  87.159170      105.0   
22280   1962-01-01     1201    High Mountain  27.816670  86.716670     3450.0   
44560   1962-01-01     1401    High Mountain  27.683330  87.783330     3119.0   
66840   1962-01-01     1225    High Mountain  27.816670  86.716670     3700.0   
89120   1962-01-01     1218    High Mountain  27.833330  86.766670     3857.0   
111400  1962-01-01     1206             Hill  27.308121  86.504225     1731.0   
133687  1962-01-01     1405             Hill  27.358611  87.670000     1744.0   
155974  1962-01-01     1103             Hill  27.630447  86.232114     1877.0   
178280  1962-01-01     1036             Hill  27.645134  85.620881      857.0   
200560  1962-01-01     1016  Middle Mountain  27.944561  85.595136     2574.0   
222840  1962-01-01     1123             Hill  27.394703  86.061233      497.0   
245120  1962-01-0

In [29]:
# Work on an independent copy: the imputation cell edits `df` in place (date
# parsing, an added 'days_since_start' column, filled Tmin/Tmax), and a plain
# alias would overwrite the freshly loaded `play_1df` as well. With a copy,
# re-running this cell restores the raw values.
df = play_1df.copy()

In [30]:
# Rows whose 'Tmin' is missing. The column name is spelled out because the
# `target_Tmin` variable is only assigned in a cell further down the notebook,
# so relying on it raises NameError on a top-to-bottom run.
df_missing_Tmin = df[df['Tmin'].isna()]
print(df_missing_Tmin)

              date  station        regions        lat       long  elevation  \
0       1962-01-01     1316          Tarai  26.820440  87.159170      105.0   
1       1962-01-02     1316          Tarai  26.820440  87.159170      105.0   
2       1962-01-03     1316          Tarai  26.820440  87.159170      105.0   
3       1962-01-04     1316          Tarai  26.820440  87.159170      105.0   
4       1962-01-05     1316          Tarai  26.820440  87.159170      105.0   
...            ...      ...            ...        ...        ...        ...   
512367  2022-07-13     9999  High Mountain  27.961111  86.808889     5200.0   
512376  2022-07-22     9999  High Mountain  27.961111  86.808889     5200.0   
512380  2022-07-26     9999  High Mountain  27.961111  86.808889     5200.0   
512382  2022-07-28     9999  High Mountain  27.961111  86.808889     5200.0   
512398  2022-08-13     9999  High Mountain  27.961111  86.808889     5200.0   

        Tmin  Tmax  
0        NaN   NaN  
1        

In [31]:
# Rows whose 'Tmax' is missing. The column name is spelled out because the
# `target_Tmax` variable is only assigned in a cell further down the notebook,
# so relying on it raises NameError on a top-to-bottom run.
df_missing_Tmax = df[df['Tmax'].isna()]
print(df_missing_Tmax)

              date  station        regions        lat       long  elevation  \
0       1962-01-01     1316          Tarai  26.820440  87.159170      105.0   
1       1962-01-02     1316          Tarai  26.820440  87.159170      105.0   
2       1962-01-03     1316          Tarai  26.820440  87.159170      105.0   
3       1962-01-04     1316          Tarai  26.820440  87.159170      105.0   
4       1962-01-05     1316          Tarai  26.820440  87.159170      105.0   
...            ...      ...            ...        ...        ...        ...   
512367  2022-07-13     9999  High Mountain  27.961111  86.808889     5200.0   
512376  2022-07-22     9999  High Mountain  27.961111  86.808889     5200.0   
512380  2022-07-26     9999  High Mountain  27.961111  86.808889     5200.0   
512382  2022-07-28     9999  High Mountain  27.961111  86.808889     5200.0   
512398  2022-08-13     9999  High Mountain  27.961111  86.808889     5200.0   

        Tmin  Tmax  
0        NaN   NaN  
1        

In [23]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Load your data
# df = pd.read_csv('your_data.csv')  # Replace with your data loading method

# Convert 'date' to a numeric feature: whole days elapsed since the earliest date.
df['date'] = pd.to_datetime(df['date'])
df['days_since_start'] = (df['date'] - df['date'].min()).dt.days

# Define features and target variables
# NOTE(review): 'station' is fed to the regression as a plain number; if it is
# an identifier rather than a quantity, confirm that this is intended.
features = ['days_since_start', 'station', 'lat', 'long', 'elevation']
target_Tmin = 'Tmin'
target_Tmax = 'Tmax'

# Separate data with missing values from data without missing values.
# The masks are computed once so the rows used for prediction and the rows
# written back are guaranteed to be the same.
missing_Tmin_mask = df[target_Tmin].isna()
missing_Tmax_mask = df[target_Tmax].isna()
df_missing_Tmin = df[missing_Tmin_mask]
df_missing_Tmax = df[missing_Tmax_mask]
# Training rows need both targets present.
df_non_missing = df.dropna(subset=[target_Tmin, target_Tmax])

# Prepare the training data
X_train = df_non_missing[features]
y_train_Tmin = df_non_missing[target_Tmin]
y_train_Tmax = df_non_missing[target_Tmax]

# Train one linear model per target
model_Tmin = LinearRegression()
model_Tmax = LinearRegression()

model_Tmin.fit(X_train, y_train_Tmin)
model_Tmax.fit(X_train, y_train_Tmax)

# Prepare data for prediction
X_missing_Tmin = df_missing_Tmin[features]
X_missing_Tmax = df_missing_Tmax[features]

# Predict missing values and write them back in place.
# predict() rejects an input with zero rows, which is exactly the situation
# when this cell is re-run after the gaps have already been filled, so each
# prediction is skipped when there is nothing left to fill.
if not X_missing_Tmin.empty:
    df.loc[missing_Tmin_mask, target_Tmin] = model_Tmin.predict(X_missing_Tmin)
if not X_missing_Tmax.empty:
    df.loc[missing_Tmax_mask, target_Tmax] = model_Tmax.predict(X_missing_Tmax)

# Check results
print(df)


             date  station        regions        lat       long  elevation  \
0      1962-01-01     1316          Tarai  26.820440  87.159170      105.0   
1      1962-01-02     1316          Tarai  26.820440  87.159170      105.0   
2      1962-01-03     1316          Tarai  26.820440  87.159170      105.0   
3      1962-01-04     1316          Tarai  26.820440  87.159170      105.0   
4      1962-01-05     1316          Tarai  26.820440  87.159170      105.0   
...           ...      ...            ...        ...        ...        ...   
512534 2022-12-27     9999  High Mountain  27.961111  86.808889     5200.0   
512535 2022-12-28     9999  High Mountain  27.961111  86.808889     5200.0   
512536 2022-12-29     9999  High Mountain  27.961111  86.808889     5200.0   
512537 2022-12-30     9999  High Mountain  27.961111  86.808889     5200.0   
512538 2022-12-31     9999  High Mountain  27.961111  86.808889     5200.0   

             Tmin       Tmax  days_since_start  
0       19.482