# In [None]:  (notebook cell marker left over from the export; commented out so the file parses as Python)
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import xgboost as xgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, OrdinalEncoder



# Calendar windows (first day, last day) for each restriction level.
# NOTE(review): the names suggest lockdown ("conf"), eased lockdown
# ("conf_a") and curfew ("c_v") periods -- confirm with the author.
conf = [
    (datetime(2020, 3, 17), datetime(2020, 5, 10)),
    (datetime(2020, 10, 30), datetime(2020, 11, 27)),
    (datetime(2021, 4, 3), datetime(2021, 5, 2)),
]
conf_a = [
    (datetime(2020, 5, 11), datetime(2020, 6, 1)),
    (datetime(2020, 11, 28), datetime(2020, 12, 15)),
]
c_v = [
    (datetime(2020, 12, 16), datetime(2021, 4, 2)),
    (datetime(2021, 5, 3), datetime(2021, 6, 20)),
]


def is_in_range(date):
    """Return the restriction level (3, 2, 1 or 0) in force at *date*.

    The windows in ``conf``, ``conf_a`` and ``c_v`` map to levels 3, 2 and 1
    respectively; a date outside every window yields 0.

    Each window covers its last day entirely.  The bounds are stored as
    midnight datetimes, so a plain ``date <= end`` comparison would exclude
    every timestamp after 00:00 on the final day even though the next window
    only starts on the following day.

    Parameters
    ----------
    date : datetime.datetime or pandas.Timestamp
        Timestamp to classify.

    Returns
    -------
    int
        3, 2, 1 or 0.
    """
    one_day = timedelta(days=1)
    # Levels are checked in the order 3, 2, 1; the first match wins.
    for level, windows in ((3, conf), (2, conf_a), (1, c_v)):
        for start, end in windows:
            # Half-open upper bound so the whole last day is included.
            if start <= date < end + one_day:
                return level
    return 0

def apply_confinement_status(X):
    """Return a copy of *X* with a ``Confinement`` column.

    The new column holds the value of ``is_in_range`` for each entry of the
    ``date`` column.  The input DataFrame is left untouched.
    """
    status = X['date'].apply(is_in_range)
    out = X.copy()
    out['Confinement'] = status
    return out


def _encode_dates(X):
    """Return a copy of *X* with ``date`` parsed and split into parts.

    ``date`` is converted with ``pd.to_datetime`` and the columns ``year``,
    ``month``, ``day`` and ``hour`` are derived from it.  The input
    DataFrame is not modified.
    """
    stamps = pd.to_datetime(X['date'])
    return X.assign(
        date=stamps,
        year=stamps.dt.year,
        month=stamps.dt.month,
        day=stamps.dt.day,
        hour=stamps.dt.hour,
    )

def encode_week_end(X):
    """Return a copy of *X* with a ``Week_day`` label column.

    ``X['date']`` must already be of datetime dtype (the ``.dt`` accessor is
    used).  Rows whose ``dayofweek`` is 0-4 are labelled ``'Week_day'``,
    every other row ``'Weekend'``.
    """
    working_days = (0, 1, 2, 3, 4)

    def label(day_number):
        if day_number in working_days:
            return 'Week_day'
        return 'Weekend'

    out = X.copy()
    out['Week_day'] = out['date'].dt.dayofweek.apply(label)
    return out

def encode_season(X):
    """Return a copy of *X* with a ``season`` column derived from ``month``.

    Months 12, 1, 2 map to ``'Winter'``; 3-5 to ``'Spring'``; 6-8 to
    ``'Summer'``; any other value to ``'Autumn'``.
    """
    def season_of(month):
        if month in (12, 1, 2):
            return 'Winter'
        if month in (3, 4, 5):
            return 'Spring'
        if month in (6, 7, 8):
            return 'Summer'
        return 'Autumn'

    out = X.copy()
    out['season'] = out['month'].apply(season_of)
    return out

def split_address_and_map_direction(X):
    """Split ``counter_name`` into ``Address`` and a signed ``Direction``.

    The name is split on its last space: everything before it becomes
    ``Address`` and the trailing token is translated to +1 / -1 through the
    direction table below.  Names without a recognised direction token
    (including names that contain no space at all) get ``Direction`` = 1.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a string column ``counter_name``.

    Returns
    -------
    pandas.DataFrame
        A copy of *X* with the two extra columns; the input is not modified.
    """
    X = X.copy()  # To avoid modifying the original DataFrame

    # Each axis has one direction mapped to +1 and its opposite to -1.
    direction_mapping = {
        'E-O': 1, 'O-E': -1,
        'NO-SE': 1, 'SE-NO': -1,
        'SO-NE': 1, 'NE-SO': -1,
        'N-S': 1, 'S-N': -1,
    }

    # Split without ``expand=True`` so that a frame in which no name
    # contains a space still yields both columns instead of raising.
    parts = X['counter_name'].str.rsplit(' ', n=1)
    X['Address'] = parts.str[0]
    # Assign the filled result back explicitly; an in-place fillna on
    # ``X['Direction']`` is a chained operation that may act on a temporary.
    X['Direction'] = parts.str[1].map(direction_mapping).fillna(1)

    return X

def calculate_time_since_installation(X):
    """Return a copy of *X* with the counter's age in whole calendar days.

    ``counter_installation_date`` is parsed with ``pd.to_datetime`` and
    ``Time_since_installation`` is set to the number of calendar days between
    it and ``date`` (time of day is ignored on both sides).

    Parameters
    ----------
    X : pandas.DataFrame
        Needs a datetime column ``date`` and a ``counter_installation_date``
        column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        Copy of *X* with the extra integer column.
    """
    X = X.copy()  # To avoid modifying the original DataFrame
    X['counter_installation_date'] = pd.to_datetime(X['counter_installation_date'])
    # Truncate both sides to midnight and subtract as datetimes: same result
    # as subtracting ``.dt.date`` objects row by row, but vectorised.
    elapsed = X['date'].dt.normalize() - X['counter_installation_date'].dt.normalize()
    X['Time_since_installation'] = elapsed.dt.days
    return X


def get_elevation_for_dataframe(X, elevation_lookup=None):
    """Return a copy of *X* with an ``Elevation`` column.

    The elevation is looked up once per distinct (latitude, longitude) pair
    and then broadcast to every row with those coordinates.  The lookup is
    keyed on the full pair, so two sites that share a latitude but differ in
    longitude keep separate elevations.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``latitude`` and ``longitude`` columns.
    elevation_lookup : callable, optional
        ``f(lat, lon) -> elevation or None``.  Defaults to an HTTP query
        against api.open-elevation.com; pass a local function to run
        offline or to reuse cached values.

    Returns
    -------
    pandas.DataFrame
        Copy of *X*; ``Elevation`` is numeric, with NaN wherever the lookup
        returned ``None`` or a non-numeric value.
    """
    def get_elevation(lat, lon):
        url = f"https://api.open-elevation.com/api/v1/lookup?locations={lat},{lon}"
        try:
            # A timeout keeps a stalled connection from hanging the pipeline.
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None
        try:
            return response.json()['results'][0]['elevation']
        except (ValueError, KeyError, IndexError):
            # Malformed or unexpected payload: treat like a failed call.
            return None

    if elevation_lookup is None:
        elevation_lookup = get_elevation

    X = X.copy()  # To avoid modifying the original DataFrame

    # One lookup per distinct coordinate pair.
    unique_lat_lon = X[['latitude', 'longitude']].drop_duplicates()
    elevation_by_site = {
        (lat, lon): elevation_lookup(lat, lon)
        for lat, lon in zip(unique_lat_lon['latitude'], unique_lat_lon['longitude'])
    }

    elevations = pd.Series(
        [elevation_by_site.get(site) for site in zip(X['latitude'], X['longitude'])],
        index=X.index,
        dtype=object,
    )
    # Failed lookups become NaN so the column stays numeric.
    X['Elevation'] = pd.to_numeric(elevations, errors='coerce')

    return X

def column_to_drop(X):
    """Return *X* without the raw identifier, location and date columns.

    Every listed column must be present; ``DataFrame.drop`` raises
    ``KeyError`` otherwise.
    """
    unused_columns = [
        'date',
        'longitude',
        'latitude',
        'counter_technical_id',
        'counter_id',
        'site_id',
        'counter_installation_date',
        'coordinates',
        'counter_name',
        'site_name',
    ]
    return X.drop(columns=unused_columns)

def comprehensive_preprocessing(X):
    """Run every feature-engineering step on *X* and return the result.

    Steps, in order: date parts, weekday/weekend label, season, confinement
    status, address/direction split, time since installation, elevation
    lookup (performs HTTP requests), and finally removal of the raw columns.
    The order matters: later steps read columns created by earlier ones and
    the last step drops columns the others need.

    A summary of the resulting frame is written to stdout on every call.
    """
    steps = (
        _encode_dates,
        encode_week_end,
        encode_season,
        apply_confinement_status,
        split_address_and_map_direction,
        calculate_time_since_installation,
        get_elevation_for_dataframe,
        column_to_drop,
    )
    for step in steps:
        X = step(X)

    # ``DataFrame.info`` prints the summary itself and returns None, so it
    # must not be wrapped in print() (that would emit an extra "None").
    X.info()

    return X



class DataCaptureTransformer(BaseEstimator, TransformerMixin):
    """Pass-through step that keeps a copy of the last data it was given.

    Placing it inside a pipeline makes the intermediate data available for
    inspection through the ``data`` attribute.
    """

    def __init__(self):
        # Copy of the most recent input to transform(); None until then.
        self.data = None

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Store a copy of *X* on ``self.data`` and return *X* unchanged."""
        snapshot = X.copy()
        self.data = snapshot
        return X


def get_estimator():
    """Build the full, unfitted modelling pipeline.

    Stages, in order:
      1. ``comprehensive_preprocessing`` wrapped in a ``FunctionTransformer``;
      2. a ``ColumnTransformer`` that one-hot encodes the categorical columns,
         standardises the listed numeric columns and passes the rest through;
      3. a ``DataCaptureTransformer`` that keeps a copy of the encoded data;
      4. an ``XGBRegressor``.
    """
    categorical_cols = ['season', 'Address', 'Week_day']
    standardize_cols = ['Temperature', 'average_wind_speed']

    column_steps = [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("stand", StandardScaler(), standardize_cols),
    ]
    preprocessor = ColumnTransformer(column_steps, remainder='passthrough')

    regressor = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
    )

    return make_pipeline(
        FunctionTransformer(comprehensive_preprocessing),
        preprocessor,
        DataCaptureTransformer(),
        regressor,
    )

# Load the train / test sets and the external weather table.
df_train = pd.read_parquet("../input/mdsb-2023/train.parquet")
df_test = pd.read_parquet("../input/mdsb-2023/final_test.parquet")

# NOTE(review): the separator is a regular expression matching the literal
# text "or" or a run of commas -- confirm this is what the CSV requires.
weather = pd.read_csv(
    "../input/mdsb-2023/external_data.csv",
    sep='or|,+',
    engine='python',
)

# Keep only the weather variables that are used and give them readable names.
new_name = {
    't': 'Temperature',
    'dd': 'average_wind_direction',
    'ff': 'average_wind_speed',
    'vv': 'horizontal_visibility',
    'u': 'humidity',
}
weather = weather[['date', 't', 'dd', 'ff', 'vv', 'u']].rename(columns=new_name)


## ADD MERGE OF PONT


# Parse the weather timestamps so they can be compared and resampled.
weather['date'] = pd.to_datetime(weather['date'])

# Bounds of the training period (both inclusive).
start_date = datetime(2020, 9, 1, 1, 0, 0)
end_date = datetime(2021, 9, 9, 23, 0, 0)
in_training_period = (weather['date'] >= start_date) & (weather['date'] <= end_date)

# NOTE(review): the row with index label 2018 is removed without explanation
# in the original -- presumably a duplicated or bad record; confirm.
weather_train = weather[in_training_period].drop(index=2018).set_index('date')

# Upsample to one row per hour, repeating the last known observation.
df_hourly = weather_train.resample('H').ffill()

# Bring 'date' back as a regular column for the merge.
weather_hourly = df_hourly.reset_index()

# Keep the original row labels in an 'index' column so that the original row
# order can be restored after the merge.
df_train = df_train.reset_index()
df_train = df_train.merge(weather_hourly, on='date')
df_train = df_train.set_index('index').sort_index()

# Features are everything except the two target columns.
X_train = df_train.drop(columns=['log_bike_count', 'bike_count'])
y_train = df_train['log_bike_count']

pipeline = get_estimator()
pipeline.fit(X_train, y_train)


df_test = pd.read_parquet("../input/mdsb-2023/final_test.parquet")

# Store the converted column: calling pd.to_datetime without assigning the
# result leaves df_test['date'] unchanged.
df_test['date'] = pd.to_datetime(df_test['date'])

# Period covered by the test set (both bounds inclusive).
start_date = df_test['date'].min()
end_date = df_test['date'].max()
# Filter the DataFrame
weather_test = weather[(start_date <= weather['date']) & (weather['date'] <= end_date)]

weather_test = weather_test.set_index('date')
row_to_copy = weather_test.loc['2021-09-10 03:00:00']

# Create new rows with the same values but different indices.
# NOTE(review): presumably this back-fills the first hours of the test
# period, for which no earlier weather observation falls inside the
# filtered window -- confirm.
new_rows = pd.DataFrame([row_to_copy, row_to_copy],
                        index=pd.to_datetime(['2021-09-10 02:00:00', '2021-09-10 01:00:00']))

weather_test = pd.concat([weather_test, new_rows])

# Sort the DataFrame by index
weather_test.sort_index(inplace=True)
# Resample to hourly and forward fill the missing values
df_hourly = weather_test.resample('H').ffill()

# After the concat the index has no name, so reset_index() produces a
# column called 'index'; rename it back to 'date' for the merge.
weather_hourly = df_hourly.reset_index().rename(columns={'index': 'date'})

# Keep the original row labels so the original order can be restored.
df_test.reset_index(inplace=True)

# Debug output: row labels of one probe timestamp before the merge.
print(df_test[df_test['date']==datetime.strptime('2021-09-12 14:00:00', "%Y-%m-%d %H:%M:%S")].index)

# NOTE(review): this is an inner join, so test rows without a matching
# weather hour are dropped silently -- verify the row count is preserved.
df_test = pd.merge(df_test, weather_hourly, on='date')

df_test.set_index('index', inplace=True)
df_test = df_test.sort_index()

# Debug output: the same probe after the merge and re-sorting.
print(df_test[df_test['date']==datetime.strptime('2021-09-12 14:00:00', "%Y-%m-%d %H:%M:%S")].index)
X_test = df_test



# Predict on the prepared test features.
y_pred = pipeline.predict(X_test)

# Create submission file: one row per prediction, with Ids 0..n-1 assigned
# in the row order of X_test.
results = pd.DataFrame(
    {
        'Id': np.arange(len(y_pred)),
        'log_bike_count': y_pred,
    }
)
results.to_csv("submission.csv", index=False)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session