In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
# Notebook-wide plot styling: seaborn's dark grid theme plus a wide default figure size
sns.set_style("darkgrid")
mpl.rcParams.update({'figure.figsize': (20, 5)})

In [None]:
# Load both pump datasets straight from their CSV exports
dataframe_raw, dataframe_stdev = (
    pd.read_csv(csv_path) for csv_path in ("DF_Raw_Data.csv", "DF_Rolling_Stdev.csv")
)

In [None]:
# Summary statistics and schema for each dataset.
# DataFrame.info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line after each report.
print(dataframe_raw.describe())
dataframe_raw.info()
print(dataframe_stdev.describe())
dataframe_stdev.info()

In [None]:
#DESCRIPTIVE STATISTICS

#Box plot and line plot of every column in the raw dataset
raw_box_axes = dataframe_raw.plot.box()
raw_box_axes.set_title("Raw Dataframe Box Plot")

raw_line_axes = dataframe_raw.plot.line()
raw_line_axes.set_title("Raw Dataframe Line Plot")
#Anchor the legend just outside the right edge of the line plot
raw_line_axes.legend(bbox_to_anchor=(1.15, 1), loc='upper right', borderaxespad=0)

plt.show()

In [None]:
#Box plot and line plot of every column in the rolling-stdev dataset
stdev_box_axes = dataframe_stdev.plot.box()
stdev_box_axes.set_title("Stdev Dataframe Box Plot")

stdev_line_axes = dataframe_stdev.plot.line()
stdev_line_axes.set_title("Stdev Dataframe Line Plot")
#Anchor the legend just outside the right edge of the line plot
stdev_line_axes.legend(bbox_to_anchor=(1.15, 1), loc='upper right', borderaxespad=0)

plt.show()

#dataframe_stdev has many more outliers than dataframe_raw, while dataframe_raw has a larger IQR. In the line plots,
#the two dataframes move in opposite directions: when the variables in dataframe_raw spike high, they drop low in
#dataframe_stdev.

In [None]:
#Box plots of each dataset split by pump state (flag 0 = working, flag 1 = failed)
FAILURE_FLAG = "PUMP FAILURE (1 or 0)"

d_raw0 = dataframe_raw[dataframe_raw[FAILURE_FLAG] == 0]
d_raw1 = dataframe_raw[dataframe_raw[FAILURE_FLAG] == 1]
d_stdev0 = dataframe_stdev[dataframe_stdev[FAILURE_FLAG] == 0]
d_stdev1 = dataframe_stdev[dataframe_stdev[FAILURE_FLAG] == 1]

#One figure per subset, drawn in the same order as the titles below
subsets_and_titles = (
    (d_raw0, "Raw Dataframe Pump Working"),
    (d_raw1, "Raw Dataframe Pump Failed"),
    (d_stdev0, "Standard Deviation Dataframe Pump Working"),
    (d_stdev1, "Standard Deviation Dataframe Pump Failed"),
)
for subset, chart_title in subsets_and_titles:
    subset.plot(kind='box').set_title(chart_title)

plt.show()

#While looking at the raw dataframe, when the pump fails the Pump Speed (RPM), Pump Torque and Ambient Temperature 
#values show a slight upwards shift. All variables display an increase in their IQR except Ambient Temperature and 
#Horse Power.

#While looking at the stdev dataframe, when the pump fails we see the values among all variables spread out more evenly 
#throughout their ranges, making for increased upper quartiles and displaying no noticeable outliers. When the pump is 
#working there are many outliers, and the upper quartiles are closer to the lower ones, leaving small IQRs.

In [None]:
#Create quartiles
#numeric_only=True keeps the text timestamp column out of the calculation. Without it, pandas 2.x
#(where numeric_only defaults to False) raises a TypeError on the non-numeric column; older versions
#dropped such columns silently, so the result is unchanged there.
Raw_Q1 = dataframe_raw.quantile(0.25, numeric_only=True)
Raw_Q3 = dataframe_raw.quantile(0.75, numeric_only=True)

#Interquartile range per column, displayed as the cell output
Raw_IQR = Raw_Q3 - Raw_Q1
Raw_IQR

In [None]:
#Identify Outliers
#Fences at 1.5 * IQR beyond the first and third quartiles
Lower_Limit = Raw_Q1 - 1.5*Raw_IQR
Upper_Limit = Raw_Q3 + 1.5*Raw_IQR

#Compare only the columns that actually have limits. Comparing the whole frame (which also carries the
#timestamp column) against the limit Series relies on automatic label alignment, which pandas 2.x
#rejects with "Operands are not aligned". Columns without a limit never matched anyway, so the
#flagged rows are the same as before.
measured_columns = dataframe_raw[Lower_Limit.index]
is_outlier_row = ((measured_columns < Lower_Limit) | (measured_columns > Upper_Limit)).any(axis=1)

#NOTE(review): the limits cover every numeric column, including 'PUMP FAILURE (1 or 0)' itself. If that
#flag's IQR is 0, every failure row is flagged as an outlier here - confirm this is intended.
Outliers = dataframe_raw[is_outlier_row]
print(len(Outliers))

#Share of rows that are NOT outliers, as a fraction between 0 and 1
percentage_of_nonoutliers = 1 - (len(Outliers)/len(dataframe_raw))
print(percentage_of_nonoutliers)

Outliers.describe()

#In this dataset, we see that there were a total of 95 outliers, leaving 96% of our data to work with. In this case it would
#be ok to remove the outliers, especially since they tend to skew analysis. By removing them, our analysis will become more 
#"significant". Of course, removing outliers will display more of a normal relationship within a variable's values. 

In [None]:
#Visualize without outliers
#Restrict the comparison to the columns that have limits: pandas 2.x refuses to compare a frame with a
#Series whose labels differ from the frame's columns, and the frame may still carry the timestamp column.
measured_columns = dataframe_raw[Lower_Limit.index]
outside_limits = ((measured_columns < Lower_Limit) | (measured_columns > Upper_Limit)).any(axis=1)
no_outliers = dataframe_raw[~outside_limits]

#One box plot per pump state (0 and 1)
for pump_state in (0, 1):
    state_rows = no_outliers[no_outliers['PUMP FAILURE (1 or 0)'] == pump_state]
    if state_rows.empty:
        #Nothing to draw for this state, so say so instead of plotting an empty frame.
        #NOTE(review): the limits include the failure flag itself, so all failure rows may have been
        #removed as outliers - confirm whether that is the intended outcome.
        print("No rows left for Pump Failure = {} after removing outliers; nothing to plot.".format(pump_state))
        continue
    state_rows.plot(kind='box')
    plt.title("All Outliers Removed (Pump Failure = {} )".format(pump_state))
    plt.show()

In [None]:
#Loop through raw dataset, plot each variable against pump failure
List_Of_Variables = ['Volumetric Flow Meter 1',
       'Volumetric Flow Meter 2', 'Pump Speed (RPM)', 'Pump Torque ',
       'Ambient Temperature', 'Horse Power', 'Pump Efficiency']

#Move the timestamp column into the index exactly once. The guard keeps this cell re-runnable: an
#unconditional set_index(..., inplace=True) raises KeyError on a second run because the column is gone.
if 'TIMEFRAME (DD/MM/YYYY)' in dataframe_raw.columns:
    dataframe_raw = dataframe_raw.set_index('TIMEFRAME (DD/MM/YYYY)')

for i in List_Of_Variables:
    #Explicit figure/axes per variable so the plot never lands on a leftover figure
    fig, first_axis = plt.subplots()
    dataframe_raw[i].plot(ax=first_axis)
    #Cap the number of x ticks so the timestamp labels stay readable
    first_axis.xaxis.set_major_locator(plt.MaxNLocator(10))
    #Second y-axis sharing the same x-axis, used for the pump-failure flag
    second_axis = first_axis.twinx()
    second_axis.plot(dataframe_raw['PUMP FAILURE (1 or 0)'], color='teal')
    second_axis.xaxis.set_major_locator(plt.MaxNLocator(10))
    plt.title(i)
    plt.show()
    
#The largest spikes and drops in each variable's values occur during pump failure. 
#The Pump Speed (RPM), Ambient Temperature and Horse Power variables seem to be most affected by the pump failure.

In [None]:
#Replot time period 10/12/2014 12:00 to 10/12/2014 14:30 and set TIMEFRAME

List_Of_Variables = ['Volumetric Flow Meter 1', 'Volumetric Flow Meter 2', 'Pump Speed (RPM)', 'Pump Torque ',
                     'Ambient Temperature', 'Horse Power', 'Pump Efficiency']

#Move the timestamp column into the index exactly once; the guard keeps the cell re-runnable
#(an unconditional in-place set_index raises KeyError on a second run).
if 'TIMEFRAME (DD/MM/YYYY)' in dataframe_stdev.columns:
    dataframe_stdev = dataframe_stdev.set_index('TIMEFRAME (DD/MM/YYYY)')

#Select the window chronologically. The CSV was read without date parsing, so comparing the index
#directly with string bounds orders values character by character rather than by time. The timestamps
#are parsed for the mask only (day first, per the column name); the index itself is left untouched.
timestamps = pd.to_datetime(dataframe_stdev.index, dayfirst=True)
window_start = pd.to_datetime("10/12/2014 12:00", dayfirst=True)
window_end = pd.to_datetime("10/12/2014 14:30", dayfirst=True)

#Keep the slice under its own name so dataframe_stdev still holds every row for the cells that follow
dataframe_stdev_window = dataframe_stdev[(timestamps >= window_start) & (timestamps <= window_end)]

for i in List_Of_Variables:
    #Explicit figure/axes per variable so the plot never lands on a leftover figure
    fig, first_axis = plt.subplots()
    dataframe_stdev_window[i].plot(ax=first_axis)
    #Cap the number of x ticks so the timestamp labels stay readable
    first_axis.xaxis.set_major_locator(plt.MaxNLocator(10))
    #Second y-axis sharing the same x-axis, used for the pump-failure flag
    second_axis = first_axis.twinx()
    second_axis.plot(dataframe_stdev_window['PUMP FAILURE (1 or 0)'], color='teal')
    second_axis.xaxis.set_major_locator(plt.MaxNLocator(10))
    plt.title(i)
    plt.show()

In [None]:
#INFERENTIAL STATISTICS

#Annotated heatmap of the pairwise correlations between the raw dataset's columns
raw_correlations = dataframe_raw.corr()
raw_heatmap_axes = sns.heatmap(raw_correlations, annot=True)
raw_heatmap_axes.set_title("Correlated Raw Heatmap")
plt.show()

#Upon observing the heatmap, Horse Power is most strongly correlated with pump failure.

In [None]:
#Correlation matrix of the raw dataset shown as a table, so exact coefficients can be read off
raw_correlation_table = dataframe_raw.corr()
raw_correlation_table

In [None]:
#Bar chart of each column's correlation with the pump-failure flag, sorted from highest to lowest
failure_correlations = dataframe_raw.corr()['PUMP FAILURE (1 or 0)']
ranked_correlations = failure_correlations.sort_values(ascending=False)
correlation_axes = ranked_correlations.plot.bar()
correlation_axes.set_title("Correlated Bar Plot(Raw Data)")
plt.show()

In [None]:
#Annotated heatmap of the pairwise correlations between the columns of dataframe_stdev

stdev_correlations = dataframe_stdev.corr()
stdev_heatmap_axes = sns.heatmap(stdev_correlations, annot=True)
stdev_heatmap_axes.set_title("Correlated Stdev Heatmap")
plt.show()

#Pump Speed (RPM), Pump Torque and Ambient Temperature appear to be very closely correlated to pump failure.

In [None]:
#Multivariate Regression Model

#Dataframe_Raw
#The columns are selected by name, which works whether or not the timestamp column has been moved into
#the index, so the CSV does not need to be read a second time.
#Every name here carries a "raw" marker so this fit cannot be mistaken for the stdev model that the
#later coefficient and prediction cells read from `regression_model`.
raw_independent_variables = dataframe_raw[['Volumetric Flow Meter 1', 'Volumetric Flow Meter 2', 'Pump Speed (RPM)', 
                                           'Pump Torque ', 'Ambient Temperature', 'Horse Power', 'Pump Efficiency']]

raw_dependent_variable = dataframe_raw['PUMP FAILURE (1 or 0)']

#Add the constant column so the OLS fit includes an intercept term
raw_independent_variables = sm.add_constant(raw_independent_variables)

regression_model_raw = sm.OLS(raw_dependent_variable, raw_independent_variables).fit()
print(regression_model_raw.summary())

In [None]:
#Multivariate Regression Model

#Dataframe_Stdev
#Start from a fresh read of the rolling-stdev CSV
dataframe_stdev = pd.read_csv("DF_Rolling_Stdev.csv")

predictor_columns = ['Volumetric Flow Meter 1', 'Volumetric Flow Meter 2', 'Pump Speed (RPM)',
                     'Pump Torque ', 'Ambient Temperature', 'Horse Power', 'Pump Efficiency']

dependent_variables = dataframe_stdev['PUMP FAILURE (1 or 0)']

#Predictors plus the constant column that gives the OLS fit its intercept
independent_variables = sm.add_constant(dataframe_stdev[predictor_columns])

regression_model = sm.OLS(endog=dependent_variables, exog=independent_variables).fit()
print(regression_model.summary())

In [None]:
#With the raw dataset having an R-squared of 0.362 and the stdev dataset having an R-squared of 0.778, we can see that the
#regression model of the stdev dataset is going to fit our data better. As a general rule, the higher the 
#R-squared, the better the model fits your data. Our data is closer to the fitted regression line in the stdev dataset.

In [None]:
#Pull the fitted coefficients out of regression_model and chart them from largest to smallest, to show
#which terms carry the most weight with respect to Pump Failure.

fitted_coefficients = regression_model.params
coefficients_descending = fitted_coefficients.sort_values(ascending=False)
coefficient_axes = coefficients_descending.plot(kind='bar')
coefficient_axes.set_title("Regressive Coefficients for Pump Failure")
plt.show()

In [None]:
#Validate Predictions

#Store the model's predictions next to the data. predict() on a DataFrame of regressors already returns a
#Series, so no DataFrame wrapper is needed. The values are assigned by position (to_numpy) rather than by
#index label, so the column is filled correctly even if dataframe_stdev is already indexed by timestamp
#from an earlier run of this cell.
dataframe_stdev['Prediction'] = regression_model.predict(independent_variables).to_numpy()

#Move the timestamp column into the index exactly once; the guard keeps the cell re-runnable
#(an unconditional in-place set_index raises KeyError on a second run).
if "TIMEFRAME (DD/MM/YYYY)" in dataframe_stdev.columns:
    dataframe_stdev = dataframe_stdev.set_index("TIMEFRAME (DD/MM/YYYY)")

#Three of the variables share the primary (left) axis
fig, primary_axes = plt.subplots()
for plotted_variable in ('Volumetric Flow Meter 2', 'Pump Efficiency', 'Horse Power'):
    dataframe_stdev[plotted_variable].plot(ax=primary_axes, alpha=0.7)
primary_axes.xaxis.set_major_locator(plt.MaxNLocator(10))

#Prediction and the actual failure flag go on a secondary (right) axis over the same x-axis
second_axes = primary_axes.twinx()
second_axes.plot(dataframe_stdev['Prediction'], color='purple', alpha=0.6, label='Prediction')
second_axes.plot(dataframe_stdev['PUMP FAILURE (1 or 0)'], color='Black', linewidth=2, label='Pump Failure')
second_axes.xaxis.set_major_locator(plt.MaxNLocator(10))
second_axes.legend()
#Anchor the primary-axis legend outside the plot area so it does not cover the lines
primary_axes.legend(bbox_to_anchor=(1.04,1), loc="upper left")
plt.title("Regressive Equation Plot")
plt.show()