In [11]:
'''
pymongoarrow offers several advantages over using $project with the aggregate() function in pymongo when building a DataFrame:

    Performance: pymongoarrow can significantly improve the performance of transferring data from MongoDB to pandas by using the Apache Arrow format. Apache Arrow is a high-performance in-memory data format optimized for analytical processing. It minimizes the need for data serialization and deserialization, allowing for much faster data transfer between MongoDB and pandas.

    Memory efficiency: By using Apache Arrow, pymongoarrow can reduce the memory overhead when working with large datasets. Arrow's columnar memory format enables more efficient memory usage when transferring data between MongoDB and pandas. This can be particularly beneficial when working with large datasets, as it can reduce the overall memory footprint.

    Ease of use: When using $project with the aggregate() function, you need to manually convert the result into a pandas DataFrame. With pymongoarrow, the conversion is handled automatically, which simplifies the process and reduces the likelihood of errors.

    Flexibility: Although $project can be used for simple transformations and filtering, pymongoarrow offers a more flexible and efficient way of transferring data between MongoDB and pandas, especially when working with complex data structures or large datasets.

In summary, using pymongoarrow can provide better performance, memory efficiency, ease of use, and flexibility when working with pandas DataFrames, particularly when handling large datasets or complex data structures.


'''

"\npymongoarrow offers several advantages over using $project with the aggregate() function in pymongo when building a DataFrame:\n\n    Performance: pymongoarrow can significantly improve the performance of transferring data from MongoDB to pandas by using the Apache Arrow format. Apache Arrow is a high-performance in-memory data format optimized for analytical processing. It minimizes the need for data serialization and deserialization, allowing for much faster data transfer between MongoDB and pandas.\n\n    Memory efficiency: By using Apache Arrow, pymongoarrow can reduce the memory overhead when working with large datasets. Arrow's columnar memory format enables more efficient memory usage when transferring data between MongoDB and pandas. This can be particularly beneficial when working with large datasets, as it can reduce the overall memory footprint.\n\n    Ease of use: When using $project with the aggregate() function, you need to manually convert the result into a pandas Dat

In [12]:
import pyarrow
from pymongo import MongoClient
import pandas as pd
import pymongoarrow.monkey
import pymongoarrow.api as arrow_api
from pymongoarrow.api import Schema
from datetime import datetime
import pprint



# Replace 'your_connection_string' with your MongoDB connection string
client = MongoClient('mongodb+srv://mongodb:mongodb@cluster0.cycye.mongodb.net/?retryWrites=true&w=majority')
db = client.predictive_maintenance
collection = db.data_records_ts

# Show a sample document
pprint.pprint(collection.find_one())

pymongoarrow.monkey.patch_all()

# Specify the schema for the pandas DataFrame
schema = arrow_api.Schema({
    "time": pyarrow.timestamp('ms'),
    "temperature": float,
    "pressure": float,
    "vibration": float,
    "failure": int
})

# Define the aggregation pipeline to filter and sort the data based on the timestamp
pipeline = [
    {"$match": {"sensor_id": "sensor01"}},  # Filter data
    {"$sort": {"timestamp": 1}},  # Sort data by timestamp (ascending)
]

# Fetch data from MongoDB and load it into a pandas DataFrame
data = collection.aggregate_pandas_all(pipeline, schema=schema)

# Close the connection
client.close()

# Display the first 20 rows of the DataFrame
print(type(data))
print(data.head(20))


{'_id': ObjectId('6434741b1c905cf3c16638a6'),
 'failure': 0,
 'pressure': 991.9619681990291,
 'sensor_id': 'sensor01',
 'temperature': 187.49080237694724,
 'time': datetime.datetime(2020, 1, 1, 0, 0),
 'vibration': 52.848900060827184}
<class 'pandas.core.frame.DataFrame'>
                  time  temperature     pressure  vibration  failure
0  2020-01-01 00:00:00   187.490802   991.961968  52.848900        0
1  2020-01-01 00:01:00   199.014286  1005.859918  50.770524        0
2  2020-01-01 00:02:00   194.639879  1013.062805  52.740878        0
3  2020-01-01 00:03:00   191.973170   987.730843  50.326125        0
4  2020-01-01 00:04:00   183.120373  1017.123077  46.622408        0
5  2020-01-01 00:05:00   183.119890   998.714490  44.420324        0
6  2020-01-01 00:06:00   181.161672   990.249786  48.684986        0
7  2020-01-01 00:07:00   197.323523  1007.577548  46.168619        0
8  2020-01-01 00:08:00   192.022300   995.535455  49.401404        0
9  2020-01-01 00:09:00   194.161452  

In [13]:
import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt

def plot_variable(variable='temperature'):
    plt.figure(figsize=(12, 6))
    
    if variable == 'temperature':
        plt.plot(data['temperature'], color='red', label='Temperature', alpha=0.7)
        plt.ylabel('Temperature')
    elif variable == 'pressure':
        plt.plot(data['pressure'], color='yellow', label='Pressure', alpha=0.7)
        plt.ylabel('Pressure')
    elif variable == 'vibration':
        plt.plot(data['vibration'], color='blue', label='Vibration', alpha=0.7)
        plt.ylabel('Vibration')
    elif variable == 'failure':
        plt.plot(data['failure'], color='green', label='failure', alpha=0.7)
        plt.ylabel('Failure')
    
    plt.xlabel('Record Index')
    plt.legend()
    plt.title(f'{variable.capitalize()} Data')
    plt.show()

# Create a toggle switch for the three variables
variable_selector = widgets.Dropdown(
    options=['temperature', 'pressure', 'vibration', 'failure'],
    value='temperature',
    description='Variable:',
)

# Display the interactive chart with a toggle switch
widgets.interact(plot_variable, variable=variable_selector)


interactive(children=(Dropdown(description='Variable:', options=('temperature', 'pressure', 'vibration', 'fail…

<function __main__.plot_variable(variable='temperature')>

In [14]:
import plotly.graph_objects as go

# Create a function to generate the interactive chart with left-right scroll
def plot_scroll_chart():
    fig = go.Figure()

    # Add traces for temperature, pressure, and vibration
    fig.add_trace(go.Scatter(x=data.index, y=data['temperature'], name='Temperature', line=dict(color='red')))

    # Customize the layout
    fig.update_layout(
        title='Temperature Data',
        xaxis=dict(
            rangeslider=dict(visible=True),  # Add a rangeslider for scrolling
            type='linear'
        ),
        yaxis=dict(title='Value'),
        legend=dict(orientation='h', yanchor='bottom', xanchor='right', y=1.02, x=1)
    )

    # Set the initial range to display 1000 records per 100 pixels
    fig.update_xaxes(range=[0, 1000])

    # Show the chart
    fig.show()

# Call the function to create the chart
plot_scroll_chart()


In [15]:
import pandas as pd

# Calculate the correlation matrix
correlation_matrix = data.corr()

# Print the correlation matrix
print(correlation_matrix)

# Get the correlation coefficients for the 'failure' column
failure_correlation = correlation_matrix['failure']

# Rank the features based on the absolute value of their correlation with 'failure'
ranked_features = failure_correlation.reindex(failure_correlation.abs().sort_values(ascending=False).index)

# Print the ranked features
print("\nRanked features based on correlation with equipment failure:")
print(ranked_features)

                 time  temperature  pressure  vibration   failure
time         1.000000     0.025948 -0.015010  -0.004761  0.045763
temperature  0.025948     1.000000 -0.004897  -0.008275  0.114143
pressure    -0.015010    -0.004897  1.000000   0.005827  0.007140
vibration   -0.004761    -0.008275  0.005827   1.000000  0.006315
failure      0.045763     0.114143  0.007140   0.006315  1.000000

Ranked features based on correlation with equipment failure:
failure        1.000000
temperature    0.114143
time           0.045763
pressure       0.007140
vibration      0.006315
Name: failure, dtype: float64


In [16]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Prepare the feature matrix (X) and the target vector (y)
X = data.drop('failure', axis=1)
X = data.drop('time', axis=1)
y = data['failure']

# Create an estimator for RFE (using Logistic Regression in this example)
estimator = LogisticRegression()

# Perform RFE to rank and select the most informative features
selector = RFE(estimator, n_features_to_select=1, step=1)
selector = selector.fit(X, y)

# Create a Series to map feature names to their RFE rankings
ranked_features = pd.Series(selector.ranking_, index=X.columns)

# Sort the features based on their RFE rankings
ranked_features = ranked_features.sort_values()

# Print the ranked features
print("Ranked features based on recursive feature elimination:")
print(ranked_features)

Ranked features based on recursive feature elimination:
failure        1
temperature    2
pressure       3
vibration      4
dtype: int32


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# Assuming 'data_from_db' is the pandas DataFrame loaded from MongoDB

# Separate features and target variable
X = data[['temperature', 'pressure', 'vibration']]
y = data['failure']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model on the testing set
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=1))

# Save the model for deployment
joblib.dump(model, 'logistic_regression_model.pkl')

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5853
           1       1.00      0.00      0.00       147

    accuracy                           0.98      6000
   macro avg       0.99      0.50      0.49      6000
weighted avg       0.98      0.98      0.96      6000



['logistic_regression_model.pkl']

In [18]:
import numpy as np

# Input features for the model
input_data = np.array([[250, 1000, 50]])

# Predict the probability of equipment failure
loaded_model = joblib.load('logistic_regression_model.pkl')
probability = loaded_model.predict_proba(input_data)[:, 1]

print(f"Probability of equipment failure at 220 degrees: {probability[0]:.4f}")

pprint.pprint(probability)

Probability of equipment failure at 220 degrees: 0.9603
array([0.96032037])



X does not have valid feature names, but LogisticRegression was fitted with feature names

