In [1]:
import os

import pandas as pd
from joblib import dump

from logistic_regression_model.data_prep.data_processing import (
    data_processing,
    rename_class,
    split_dataset,
)
from logistic_regression_model.evaluation.model_evaluation import (
    confusion_matrix,
    cross_validation,
)
from logistic_regression_model.modelling_evaluation.build_models import (
    logistic_reg_model,
)
from logistic_regression_model.parameters.project_parameters import (
    training_dataset_path,
    save_models,
)


### Build Model

In [2]:
# Load the training dataset from the CSV located at training_dataset_path
# (the data is read from disk here, not scraped).
print("Loading data")
dataset_combined = pd.read_csv(training_dataset_path)

# Prepare the data: rename_class relabels the frame, data_processing turns it
# into a feature matrix and target.
# NOTE(review): both helpers are project-local; their exact behaviour is not
# visible from this notebook — confirm against data_prep.data_processing.
print("Processing data for machine learning...")
dataset_combined = rename_class(dataset_combined)
X_vec, y = data_processing(dataset_combined)

# Split into training and testing (0.3 for testing)
X_train, X_test, y_train, y_test = split_dataset(X_vec, y, 0.3)
print("Data processing complete")

# Build machine learning models
print("Building machine learning model...")
log_reg = logistic_reg_model(X_train, y_train)

# Cross-validate on the training split only.
# NOTE(review): the final argument (10) is presumably the number of folds, and
# `models` is a single (name, estimator) tuple rather than a list of them —
# confirm both against cross_validation's signature.
print("Performing stratified cross-validation...")
models = ("Logistic Reg (baseline)", log_reg)
cross_validation(models, X_train, y_train, 10)

# confusion matrix
# NOTE(review): the message below is still printed although the
# confusion_matrix call is commented out, so nothing is actually saved.
print("Saving confusion matrix...")
#confusion_matrix(log_reg, X_train, y_train, "logistic_regression")

# Save model to saved_models folder
# (currently disabled; re-enable to persist the fitted model with joblib)
#print("Saving model")
#pkl_filename = "final_model.joblib"
#with open(os.path.join(save_models, pkl_filename), "wb") as file:
 #   dump(log_reg, file)


Loading data
Processing data for machine learning...
Data processing complete
Building machine learning model...
Performing stratified cross-validation...
['Logistic Reg (baseline)']: mean: 0.788971 std dev: (0.040347)
Saving confusion matrix...


### Predict New Instances 

In [1]:
import pickle
import typing as t
from typing import List, Optional, Tuple

import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

#from logistic_regression_model.data_prep.validation import validate_inputs
from logistic_regression_model.parameters.project_parameters import (
    file_path_model,
    file_path_tfidf,
    file_path_vec,
)

In [2]:
# Load the trained model and the fitted text-processing artifacts used at
# prediction time.
# SECURITY: joblib.load and pickle.load execute arbitrary code embedded in the
# file, so these paths must only ever point at artifacts produced by this
# project.
trained_model = joblib.load(filename=file_path_model)

# Context managers close the file handles deterministically; the previous
# bare open(...) calls left both files open until garbage collection.
with open(file_path_vec, "rb") as vocab_file:
    loaded_vec = CountVectorizer(vocabulary=pickle.load(vocab_file))

with open(file_path_tfidf, "rb") as tfidf_file:
    loaded_tfidf = pickle.load(tfidf_file)

In [37]:
def make_prediction(
    *,
    input_data: t.Union[pd.DataFrame, dict],
) -> dict:
    """Predict a label for every headline in ``input_data``.

    Args:
        input_data: A DataFrame, or any mapping ``pd.DataFrame`` accepts,
            that provides a "Headline" column.

    Returns:
        A dict with two keys:
          * "predictions": list with one predicted label per headline, or
            ``None`` when validation reported errors.
          * "errors": whatever ``validate_inputs`` reported (falsy on success).
    """
    # Column labels are passed as a list: a set is unordered and newer pandas
    # releases refuse it outright in the DataFrame constructor.
    data = pd.DataFrame(input_data, columns=["Headline"])

    validated_data, errors = validate_inputs(input_data=data)

    # Stays None unless validation succeeded.
    predictions = None

    if not errors:
        headlines = validated_data["Headline"].values.tolist()
        # Same two-step transform as the loaded artifacts: raw counts first,
        # then the fitted tf-idf transformer.
        X_new_counts = loaded_vec.transform(headlines)
        X_new_tfidf = loaded_tfidf.transform(X_new_counts)
        predictions = list(trained_model.predict(X_new_tfidf))

    return {"predictions": predictions, "errors": errors}

In [38]:
def validate_inputs(*, input_data: pd.DataFrame) -> Tuple[pd.DataFrame, Optional[dict]]:
    """Check the incoming frame against the pydantic request schema.

    The frame itself is returned untouched; only a NaN-free copy of its
    records is handed to the schema.

    Returns:
        ``(input_data, errors)`` where ``errors`` is ``None`` when validation
        passes and the value of ``error.json()`` when it does not.
    """
    # pydantic cannot validate numpy NaN, so missing values are sent as None.
    records = input_data.replace({np.nan: None}).to_dict(orient="records")

    try:
        MultipleHouseDataInputs(inputs=records)
    except ValidationError as error:
        return input_data, error.json()

    return input_data, None

In [39]:
from pydantic import BaseModel, ValidationError
from typing import List, Optional, Tuple

In [40]:
# Schema for a single record of a prediction request.
# NOTE(review): the class name mentions "House" although the only field is a
# headline — presumably carried over from a template; renaming would affect
# every caller, so it is left as is.
class HouseDataInputSchema(BaseModel):
    # Headline text to classify; typed Optional so None is an accepted value.
    Headline: Optional[str]

In [41]:
# Request-body wrapper: a list of headline records under the "inputs" key.
class MultipleHouseDataInputs(BaseModel):
    inputs: List[HouseDataInputSchema]

    class Config:
        # Sample payload attached to the model's schema.
        # NOTE(review): presumably surfaced as the example body in the
        # generated API docs — confirm.
        schema_extra = {
            "example": {
                "inputs": [
                    {"Headline": "Englands Alexander-Arnold out of Euro 2020"},
                ]
            }
        }

In [42]:
# One sample headline used to smoke-test make_prediction in the cells below.
input_data_example = ["England win the Euros"]

In [43]:
from fastapi.encoders import jsonable_encoder

In [44]:
# Wrap the encoded sample in a one-column frame. The column labels are given
# as a list: a set is unordered and newer pandas releases reject it in the
# DataFrame constructor.
input_df = pd.DataFrame(jsonable_encoder(input_data_example), columns=["Headline"])

In [45]:
# Display the frame that will be passed to make_prediction.
input_df

Unnamed: 0,Headline
0,England win the Euros


In [46]:
# Run the in-notebook prediction path on the sample frame.
results = make_prediction(input_data=input_df)

In [47]:
# Show the returned dict ("predictions" and "errors" keys).
results

{'predictions': ['cricket'], 'errors': None}

In [48]:
# Sanity check: the predictions value should be a list.
type(results['predictions'])

list

In [49]:
# Sanity check: the overall return value should be a dict.
type(results)

dict

In [21]:
# NOTE(review): this cell's execution count (21) is lower than the cells
# above, and its recorded output shows a bare string instead of a list, so it
# appears to come from an earlier version of make_prediction. Re-run the
# notebook top to bottom or delete this cell.
results

{'predictions': 'cricket', 'errors': None}

### Predict for FastAPI 

In [2]:
import requests

In [1]:
# JSON body sent to the prediction endpoints below: one record under "inputs".
headline = {"inputs": [{"Headline": "f1 is good"}]}

In [153]:
# POST the sample payload to the locally running API. requests has no default
# timeout, so one is set explicitly to stop the cell hanging forever when the
# server is not up.
response = requests.post(
    "http://localhost:8001/api/v1/predict/", json=headline, timeout=10
)

In [154]:
# Decode the JSON body returned by the local API.
response.json()

{'errors': None, 'predictions': ['f1']}

### Predict for Heroku 

In [3]:
# POST the same payload to the deployed Heroku app. requests has no default
# timeout, so one is set explicitly; it is generous because a hosted app may
# be slow to answer its first request.
response = requests.post(
    "https://floating-anchorage-18633.herokuapp.com/api/v1/predict/",
    json=headline,
    timeout=30,
)

In [4]:
# Decode the JSON body returned by the deployed API.
response.json()

{'errors': None, 'predictions': ['f1']}