<a href="https://colab.research.google.com/github/ayu-shiirathore/API_Prediction_Model/blob/main/API_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas scikit-learn openpyxl




In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error

# Load the dataset
# Single source of truth for the input location; update this to the actual file path.
file_path = "/content/API Call Dataset.csv"
# latin1 is used to tolerate bytes that are not valid UTF-8 in the CSV.
data = pd.read_csv(file_path, encoding='latin1')

# Step 1: Identify the top four most frequently called APIs
# value_counts() sorts by descending frequency, so head(4) keeps the four most common codes.
top_apis = data['API Code'].value_counts().head(4).index.tolist()

# Step 2: Prepare data for modeling
def prepare_data(api_data, time_span_minutes):
    """Build the feature/target table used to train the call-count models.

    Args:
        api_data: DataFrame with a 'Time of Call' column holding timestamps
            formatted as '%d-%m-%Y %H:%M'. The input frame is not modified.
        time_span_minutes: Length, in minutes, of the look-ahead window used
            for the target column.

    Returns:
        A new DataFrame with the columns 'Hour', 'Minute',
        'Time Since Last Call' (minutes between a row and the row before it,
        0 for the first row) and 'Next {time_span_minutes} Min Calls' (number
        of rows whose timestamp lies in the half-open window
        (t, t + time_span_minutes] after the row's own timestamp t). Rows
        whose timestamp cannot be parsed are dropped.
    """
    target_col = f'Next {time_span_minutes} Min Calls'

    # Work on a private copy so the caller's frame is never mutated and pandas
    # does not emit SettingWithCopyWarning when a filtered slice is passed in.
    api_data = api_data.copy()

    # Convert 'Time of Call' to datetime; unparseable values become NaT.
    api_data['Time of Call'] = pd.to_datetime(
        api_data['Time of Call'],
        format='%d-%m-%Y %H:%M',  # Adjust format if necessary
        errors='coerce'
    )
    api_data = api_data.dropna(subset=['Time of Call'])  # Drop invalid timestamps

    call_times = api_data['Time of Call']
    api_data['Hour'] = call_times.dt.hour
    api_data['Minute'] = call_times.dt.minute
    # Gap to the previous row in the frame's existing order.
    # NOTE(review): this is only a true "time since last call" if rows are
    # already in chronological order - confirm against the dataset.
    api_data['Time Since Last Call'] = (
        call_times.diff().dt.total_seconds().div(60).fillna(0)
    )

    # Target: number of calls in the user-defined time span.
    # Counting timestamps in (t, t + span] is done with two binary searches on
    # the sorted timestamps instead of rescanning the whole column per row.
    sorted_times = call_times.sort_values().to_numpy()
    window_end = (call_times + pd.Timedelta(minutes=time_span_minutes)).to_numpy()
    upper = sorted_times.searchsorted(window_end, side='right')
    lower = sorted_times.searchsorted(call_times.to_numpy(), side='right')
    # A non-positive span describes an empty window, so never go below zero.
    api_data[target_col] = (upper - lower).clip(min=0)

    # Keep only relevant columns
    return api_data[['Hour', 'Minute', 'Time Since Last Call', target_col]]

# Step 3: Train models for each API
def train_models(api_data, time_span_minutes):
    """Fit several regressors on the prepared data and pick the most accurate.

    Args:
        api_data: Prepared DataFrame containing the feature columns plus the
            target column 'Next {time_span_minutes} Min Calls'.
        time_span_minutes: Time span used to name the target column.

    Returns:
        A tuple (best_model, api_results): the fitted estimator with the
        lowest mean absolute error on the held-out 20% split (the earliest
        candidate wins ties), and a dict mapping each model name to its MAE.
    """
    label = f'Next {time_span_minutes} Min Calls'
    features = api_data.drop(columns=[label])
    labels = api_data[label]

    # Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # Candidate estimators, keyed by the display name reported in the results.
    candidates = {
        "Random Forest": RandomForestRegressor(random_state=42),
        "Gradient Boosting": GradientBoostingRegressor(random_state=42),
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=42),
        "Support Vector Regressor": SVR()
    }

    # Fit every candidate and record its test-set MAE.
    api_results = {}
    for name, estimator in candidates.items():
        estimator.fit(X_train, y_train)
        api_results[name] = mean_absolute_error(y_test, estimator.predict(X_test))

    # Select the candidate with the strictly lowest MAE, in declaration order.
    best_model, best_mae = None, float('inf')
    for name, score in api_results.items():
        if score < best_mae:
            best_model, best_mae = candidates[name], score

    return best_model, api_results

# Step 4: User input for time span prediction (in minutes)
time_span = int(input("Enter the time span after which you want to predict the number of API calls (in minutes): "))
# A zero or negative window contains no future calls, so training would be meaningless.
if time_span <= 0:
    raise ValueError("Time span must be a positive number of minutes.")

# Step 5: Create separate Excel files for each top API and train models
best_models = {}
for api in top_apis:
    # Filter the data for the API and save to an Excel file.
    # .copy() gives an independent frame, so later column assignments do not
    # trigger pandas' SettingWithCopyWarning.
    api_data = data[data['API Code'] == api].copy()
    api_data_file = f"{api}_API_Data.xlsx"
    api_data.to_excel(api_data_file, index=False)
    print(f"Excel file created for API {api}: {api_data_file}")

    # Prepare the data for training
    api_data_prepared = prepare_data(api_data, time_span)

    # Train models and choose the best one for this API
    best_model, api_results = train_models(api_data_prepared, time_span)
    best_models[api] = best_model
    print(f"\nAPI {api} - Model Performance (MAE): {api_results}")
    print(f"Best model for API {api}: {best_model}")

# Step 6: Prediction based on user-defined time span
# Take a single reference time so every API is predicted for the same moment.
current_time = pd.Timestamp.now()

# Calculate the prediction time by adding the user-defined time span to the current time
prediction_time = current_time + pd.Timedelta(minutes=time_span)

for api, model in best_models.items():
    print(f"\nPrediction for API {api}:")
    print(f"Prediction time: {prediction_time}")

    # Parse this API's timestamps into a standalone Series instead of assigning
    # back into a filtered slice of `data` (which raises SettingWithCopyWarning).
    call_times = pd.to_datetime(
        data.loc[data['API Code'] == api, 'Time of Call'],
        format='%d-%m-%Y %H:%M',
        errors='coerce'
    )
    last_call_time = call_times.max()
    if pd.isna(last_call_time):
        # No parseable timestamp: the feature would be NaN and predict() would fail.
        print(f"No valid call timestamps for API {api}; skipping prediction.")
        continue

    # Minutes between the most recent recorded call and the prediction time.
    # NOTE(review): in training this feature is the gap between consecutive
    # rows, so a gap measured from "now" may lie far outside the training
    # range - confirm the resulting predictions are meaningful.
    time_since_last_call = (prediction_time - last_call_time).total_seconds() / 60

    # Prepare the input feature vector (same column names as the training features)
    input_data = pd.DataFrame({
        'Hour': [prediction_time.hour],
        'Minute': [prediction_time.minute],
        'Time Since Last Call': [time_since_last_call]
    })

    # Predict using the best model
    prediction = model.predict(input_data)

    # Round to the nearest integer; a call count cannot be negative.
    prediction_rounded = max(0, round(prediction[0]))
    print(f"Predicted number of calls for API {api} in the next {time_span} minutes: {prediction_rounded}")
    print(f"Model used for prediction: {type(model).__name__}")


Enter the time span after which you want to predict the number of API calls (in minutes): 780
Excel file created for API A9: A9_API_Data.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  api_data['Time of Call'] = pd.to_datetime(



API A9 - Model Performance (MAE): {'Random Forest': 2.3981072640868972, 'Gradient Boosting': 2.355084581716622, 'Linear Regression': 2.337941702716507, 'Decision Tree': 3.0814663951120163, 'Support Vector Regressor': 2.3410577407520416}
Best model for API A9: LinearRegression()
Excel file created for API A2: A2_API_Data.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  api_data['Time of Call'] = pd.to_datetime(



API A2 - Model Performance (MAE): {'Random Forest': 2.4673237704918036, 'Gradient Boosting': 2.3846203013992953, 'Linear Regression': 2.3331176673301264, 'Decision Tree': 3.2315573770491803, 'Support Vector Regressor': 2.3361262880002225}
Best model for API A2: LinearRegression()
Excel file created for API A7: A7_API_Data.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  api_data['Time of Call'] = pd.to_datetime(



API A7 - Model Performance (MAE): {'Random Forest': 2.1858924619640385, 'Gradient Boosting': 2.0676712607885817, 'Linear Regression': 2.0428876393612714, 'Decision Tree': 2.8568464730290457, 'Support Vector Regressor': 2.034609333792581}
Best model for API A7: SVR()
Excel file created for API A4: A4_API_Data.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  api_data['Time of Call'] = pd.to_datetime(



API A4 - Model Performance (MAE): {'Random Forest': 2.4948236313236314, 'Gradient Boosting': 2.3749759474996077, 'Linear Regression': 2.360240037141744, 'Decision Tree': 3.3762993762993765, 'Support Vector Regressor': 2.352841960386524}
Best model for API A4: SVR()

Prediction for API A9:
Prediction time: 2024-11-24 18:01:37.665963
Predicted number of calls for API A9 in the next 780 minutes: 307
Model used for prediction: LinearRegression

Prediction for API A2:
Prediction time: 2024-11-24 18:01:37.680691
Predicted number of calls for API A2 in the next 780 minutes: 556
Model used for prediction: LinearRegression

Prediction for API A7:
Prediction time: 2024-11-24 18:01:37.696195
Predicted number of calls for API A7 in the next 780 minutes: 8
Model used for prediction: SVR

Prediction for API A4:
Prediction time: 2024-11-24 18:01:37.710827
Predicted number of calls for API A4 in the next 780 minutes: 8
Model used for prediction: SVR


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  api_data['Time of Call'] = pd.to_datetime(api_data['Time of Call'], format='%d-%m-%Y %H:%M', errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  api_data['Time of Call'] = pd.to_datetime(api_data['Time of Call'], format='%d-%m-%Y %H:%M', errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v