In [1]:
# for reading in data
import pandas as pd

# data preprocessing
from pvops.text.preprocess import text_remove_numbers_stopwords
from nltk import corpus

# machine learning pipeline: vectorizer, model definitions, and scoring
from pvops.text.classify import classification_deployer
from pvops.text.defaults import supervised_classifier_defs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, make_scorer

### Read in data

First, we'll read in the data and take a look at it. We would like to identify inverter-related records, so our column of interest is `Asset`.

In [2]:
# load the example corrective-maintenance (CM) records into a dataframe
CM_df = pd.read_csv('example_data/example_CM_records.csv')

# preview the first ten values of the column we want to classify
asset_column = CM_df['Asset']
asset_column.head(10)

0                    Inverter
1            Central Inverter
2                         NaN
3                    Combiner
4                       Plant
5                    Inverter
6                    Combiner
7    Inverter/String Inverter
8                    Inverter
9      Ground-Mount PV System
Name: Asset, dtype: object

We can see that some records have a missing `Asset` field. Let's see how many records are missing it.

In [3]:
# tally the records whose Asset field is empty and report them as a share of all records
total_records = len(CM_df)
count_null = CM_df['Asset'].isna().sum()
percent_null = count_null / total_records * 100
print(f'{count_null}/{total_records} records have a missing asset ({percent_null:.1f}%).')

39/500 records have a missing asset (7.8%).


We would like to leverage as much of the data as we can for later analysis, so it would help to find a way to gap-fill these records that are missing the asset. We can do so by leveraging the plain-text event description in the `GeneralDesc` column. Let's pull out the data of interest.

In [4]:
# keep only the Asset and GeneralDesc columns, discarding every row
# that has a null in either of them
columns_of_interest = ['Asset', 'GeneralDesc']
CM_nonnull = CM_df.loc[:, columns_of_interest].dropna(how='any')
CM_nonnull.head(5)

Unnamed: 0,Asset,GeneralDesc
0,Inverter,Saint Albans - String inverter 3-1-18 - Forced...
1,Central Inverter,Mod 1.7 offline Update 10/16: Alarm on AlsoEne...
3,Combiner,WO-00006820 ; Sunfish ; Combiner Box - Underpe...
4,Plant,Pearl II - Site - Forced outage. 8:10am 8-Dec ...
5,Inverter,Adirondack - Inverter 3.10 - Forced Outage. 7/...


For good measure, we can also look to see how many of the non-null records have an inverter-related asset:

In [5]:
# count the complete records whose asset mentions 'Inverter' (case-sensitive match)
total_nonnull = len(CM_nonnull)
has_inverter_asset = CM_nonnull['Asset'].str.contains('Inverter')
count_inverter = has_inverter_asset.sum()
percent_inverter = count_inverter / total_nonnull * 100
print(f'{count_inverter}/{total_nonnull} records have an inverter-related asset ({percent_inverter:.1f}%).')

165/461 records have an inverter-related asset (35.8%).


### Naive approach: keyword method

As a first approach, we can perform a simple keyword-based method. We make a prediction of the asset based on whether the word "inverter" is present in the description.

In [6]:
# keyword prediction: does the lowercased event description mention 'inverter'?
description_text = CM_nonnull['GeneralDesc'].str.lower()
prediction = description_text.str.contains('inverter')

# true label: does the lowercased asset itself mention 'inverter'?
asset_text = CM_nonnull['Asset'].str.lower()
true_label = asset_text.str.contains('inverter')

# a record counts as correct when the prediction and the true label agree
total_nonnull = len(CM_nonnull)
naive_num_correct = prediction.eq(true_label).sum()
naive_percent_correct = naive_num_correct / total_nonnull * 100
print(f'{naive_num_correct}/{total_nonnull} records predicted correctly from keyword search '
      f'({naive_percent_correct:.1f}%).')

385/461 records predicted correctly from keyword search (83.5%).


This already does decently well. Let's see if we can improve our results via a more robust machine learning approach.

### Prepare data for supervised classification

First, we need to clean our text data. We can use `nltk`'s English stopwords list to remove common words that won't have much bearing on the true content of the records. All we need to do is pass this list into the `text_remove_numbers_stopwords()` function from `pvops.text.preprocess`, which removes both the stopwords and the numbers for us.

From there, we can make everything lowercase, and we're basically done. 

In [7]:
# clean the event description using nltk's English stopword list
stopwords = corpus.stopwords.words('english')

# apply the cleaner element-wise to the GeneralDesc column itself rather than
# row-by-row over the whole dataframe with DataFrame.apply(axis='columns');
# this is the same form used when the gap-fill records are cleaned later on
CM_nonnull['GeneralDescCleaned'] = CM_nonnull['GeneralDesc'].apply(
    lambda desc: text_remove_numbers_stopwords(desc, stopwords)
)

# set event description to be lowercase
CM_nonnull['GeneralDescLower'] = CM_nonnull['GeneralDescCleaned'].str.lower()

CM_nonnull.head(5)

Unnamed: 0,Asset,GeneralDesc,GeneralDescCleaned,GeneralDescLower
0,Inverter,Saint Albans - String inverter 3-1-18 - Forced...,Saint Albans String inverter Forced outage com...,saint albans string inverter forced outage com...
1,Central Inverter,Mod 1.7 offline Update 10/16: Alarm on AlsoEne...,Mod offline Update Alarm AlsoEnergy PM indicat...,mod offline update alarm alsoenergy pm indicat...
3,Combiner,WO-00006820 ; Sunfish ; Combiner Box - Underpe...,WO ; Sunfish ; Combiner Box Underperformance ;...,wo ; sunfish ; combiner box underperformance ;...
4,Plant,Pearl II - Site - Forced outage. 8:10am 8-Dec ...,Pearl II Site Forced outage Dec Pearl II tripp...,pearl ii site forced outage dec pearl ii tripp...
5,Inverter,Adirondack - Inverter 3.10 - Forced Outage. 7/...,Adirondack Inverter Forced Outage Adirondack I...,adirondack inverter forced outage adirondack i...


Finally, we would like to have our response be a binary value: whether the asset relates to inverters or not. We can use the `remappings_asset.csv` file to help us with this.

In [8]:
# load the lookup table of asset remappings
remappings_df = pd.read_csv('example_data/remappings_asset.csv')

# the last ten rows show most clearly what the table does
remappings_df.tail(10)

Unnamed: 0,in,out_
44,Point of Interconnection,other
45,Racking/Trackers,tracker
46,Rooftop PV System,other
47,Site,other
48,String,other
49,String Inverter,inverter
50,Subarray,other
51,Summary,other
52,Tracker control unit,tracker
53,Tracking System,tracker


All we need to do is put together a dictionary that will let us map the asset in our dataframe to a more general asset description.

In [9]:
# build a case-insensitive lookup from the raw asset label ('in') to the simplified
# asset category ('out_'); vectorized string ops replace the row-by-row iterrows() loop
remapping_dict = dict(zip(remappings_df['in'].str.lower(),
                          remappings_df['out_'].str.lower()))

# fail early and list every asset label that has no remapping, instead of
# stopping with a bare KeyError on the first one encountered
asset_lower = CM_nonnull['Asset'].str.lower()
unmapped_assets = sorted(set(asset_lower) - set(remapping_dict))
if unmapped_assets:
    raise KeyError(f'No remapping defined for asset(s): {unmapped_assets}')

CM_nonnull['SimpleAsset'] = asset_lower.map(remapping_dict)

# define our x as the general event description, and y as whether the asset is related to inverters
x = CM_nonnull['GeneralDescLower']
y = CM_nonnull['SimpleAsset'] == 'inverter'

### Model training and selection

Now, it's time to begin the model training. First, we need to turn our `x` into numerical data, which we can do via tf-idf vectorization. We'd like our vectorization to include the entire corpus of our event descriptions, even those where the asset is null, so that we can use the exact same pipeline to predict the missing asset labels in the final step. (Note that the vectorizer is also a step of the pipeline that gets fit below, so it may be refit on the training text during model selection.)

In [10]:
# fit the tf-idf vectorizer on the whole corpus, including the rows w/o asset that we plan to gapfill using our model
# select the column and drop its nulls, then cast to str *before* lowercasing so that any
# non-string description is kept as text instead of being turned into the literal 'nan'
all_descr = CM_df['GeneralDesc'].dropna().astype('str').str.lower()

# NOTE(review): this corpus is only lowercased, whereas the training text `x` was also passed
# through text_remove_numbers_stopwords() -- confirm whether the same cleaning is wanted here
vectorizer = TfidfVectorizer(min_df=1, stop_words=stopwords, ngram_range=(1,2), sublinear_tf=True)

# NOTE(review): if classification_deployer() clones/refits the pipeline steps (as scikit-learn's
# grid search does), this fit is superseded by a fit on the training text -- confirm
vectorizer.fit(all_descr);

Now, we can fit models across various parameters using the `classification_deployer()` function from `pvops.text.classify`. All we need to do is define our pipeline, our models, our parameters, and our scoring function.

The pipeline and scoring function are easy enough. We can get a good set of model parameters to search over from `pvops.text.defaults`, as shown below:

In [11]:
# the pipeline vectorizes first and classifies second; the 'clf' step is left as None here
# (presumably filled in by classification_deployer() for each candidate model)
pipeline_steps = [
    ('tfidf', vectorizer),
    ('clf', None),
]

# score candidates by plain accuracy
scorer = make_scorer(accuracy_score)

# the estimators we want to compare
model_names = ['LinearSVC', 'LogisticRegression', 'PassiveAggressiveClassifier']

# fetch the 'detailed' default parameter grids and estimator instances from pvops,
# then keep only the entries for the models named above
params, model_instances = supervised_classifier_defs('detailed')
classifiers = dict((name, model_instances[name]) for name in model_names)
param_grid = dict((name, params[name]) for name in model_names)

We can see below what our resulting `classifiers` and `param_grid` variables look like:

In [12]:
# print the mapping from model name to estimator instance
print(classifiers)
# leave the parameter grid as the cell's last expression so the notebook displays it
param_grid

{'LinearSVC': LinearSVC(), 'LogisticRegression': LogisticRegression(), 'PassiveAggressiveClassifier': PassiveAggressiveClassifier()}


{'LinearSVC': {'clf__C': [0.01, 0.1, 1, 10.0, 100.0, 1000.0],
  'clf__max_iter': [800, 1000, 1200, 1500, 2000]},
 'LogisticRegression': {'clf__solver': ['newton-cg', 'lbfgs', 'sag'],
  'clf__C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
         5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
         3.59381366e+03, 1.00000000e+04])},
 'PassiveAggressiveClassifier': {'clf__C': [0.001, 0.01, 0.1, 1.0],
  'clf__loss': ['hinge', 'squared_hinge']}}

Finally, all we need to do is pass everything into `classification_deployer()`:

In [13]:
# keyword arguments for the 5-split model search over all candidate classifiers
search_kwargs = dict(
    n_splits=5,
    classifiers=classifiers,
    search_space=param_grid,
    pipeline_steps=pipeline_steps,
    scoring=scorer,
)

# run the search; only the summary table and the best model are kept
summary_table, best_model, _ = classification_deployer(x, y, **search_kwargs)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Better score (0.894) found on classifier: LinearSVC
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Better score (0.902) found on classifier: PassiveAggressiveClassifier


And below, we can see a summary of how the models performed.

In [14]:
# order by mean score (ascending), flip so the best rows come first, and keep the top ten
ascending_by_score = summary_table.sort_values(by='mean_score')
sorted_table = ascending_by_score.iloc[::-1].iloc[:10]
sorted_table

Unnamed: 0,clf__C,clf__max_iter,mean_fit_time,estimator,min_score,max_score,mean_score,std_score,clf__solver,clf__loss
62,0.01,,0.118527,PassiveAggressiveClassifier,0.869565,0.956522,0.902361,0.030812,,hinge
11,1.0,1000.0,0.111411,LinearSVC,0.858696,0.956522,0.893712,0.033946,,
14,1.0,2000.0,0.082835,LinearSVC,0.858696,0.956522,0.893712,0.033946,,
13,1.0,1500.0,0.088053,LinearSVC,0.858696,0.956522,0.893712,0.033946,,
12,1.0,1200.0,0.070389,LinearSVC,0.858696,0.956522,0.893712,0.033946,,
10,1.0,800.0,0.083629,LinearSVC,0.858696,0.956522,0.893712,0.033946,,
51,1291.549665,,0.120467,LogisticRegression,0.847826,0.956522,0.891585,0.03686,newton-cg,
45,166.810054,,0.120091,LogisticRegression,0.847826,0.956522,0.891561,0.036948,newton-cg,
48,464.158883,,0.120651,LogisticRegression,0.847826,0.956522,0.891561,0.035646,newton-cg,
61,0.001,,0.15885,PassiveAggressiveClassifier,0.858696,0.934783,0.891561,0.025618,,squared_hinge


The `PassiveAggressiveClassifier` performed best, although many models performed about the same, at around 90% accuracy. It could therefore be argued that the increased complexity of a `PassiveAggressiveClassifier` isn't worth the very small improvement over the far simpler `LinearSVC` (a basic support vector machine).

Note that our best-performing models did perform notably better than our naive approach, so we do gain something by using machine learning methods.

`classification_deployer()` only returns the best model, but we can easily refit using just the parameters we want.

In [15]:
# the model we want to refit: the simpler LinearSVC rather than the top scorer
chosen_estimator = 'LinearSVC'

# get the best-ranked row for that model; selecting by estimator name rather than by a
# fixed position keeps this correct if the ranking of the table changes between runs
candidate_rows = sorted_table[sorted_table['estimator'] == chosen_estimator]
if candidate_rows.empty:
    raise ValueError(f'{chosen_estimator} does not appear in sorted_table')
model_row = candidate_rows.iloc[0, :]

# pull out the model name and the parameters that were searched for that model
model_name = model_row['estimator']
model_params = model_row[list(param_grid[model_name])].to_dict()

# parameter values have to be in a list, even if there is just one
model_params = {param_name: [param_val] for param_name, param_val in model_params.items()}

# get into the form used by classification_deployer()
single_classifier = {model_name : model_instances[model_name]}
single_param_grid = {model_name : model_params}
single_classifier, single_param_grid

({'LinearSVC': LinearSVC()},
 {'LinearSVC': {'clf__C': [1], 'clf__max_iter': [1000]}})

Now that we have what we need, we can quickly rerun the fit and save the final model.

In [16]:
# same call as the full search, restricted to the single chosen model and parameter set
refit_kwargs = dict(
    n_splits=5,
    classifiers=single_classifier,
    search_space=single_param_grid,
    pipeline_steps=pipeline_steps,
    scoring=scorer,
)

# only the fitted model (second return value) is kept
_, final_model, _ = classification_deployer(x, y, **refit_kwargs)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


Better score (0.894) found on classifier: LinearSVC


### Gap-filling the asset using our final model

Now, we can use our model to make predictions on the rows with a missing `Asset` (but with a `GeneralDesc`). First, we can pull out the data we need to feed into the model.

In [17]:
# records that lack an Asset but do have a GeneralDesc to predict from
asset_missing = CM_df['Asset'].isna()
has_description = CM_df['GeneralDesc'].notna()

# only need to keep around the GeneralDesc field
records_to_gapfill = CM_df.loc[asset_missing & has_description, 'GeneralDesc'].copy()

Now, we follow the same cleaning procedure as before, and finally use our model to predict the asset.

In [18]:
def _clean_description(description):
    """Run one description through text_remove_numbers_stopwords() with the notebook's stopword list."""
    return text_remove_numbers_stopwords(description, stopwords)

# repeat the cleaning used for the training text: run the cleaner, then lowercase
records_to_gapfill = records_to_gapfill.apply(_clean_description).str.lower()

# predict, for each cleaned description, whether the asset is an inverter
pred_is_inverter = final_model.predict(records_to_gapfill)

Now, we can use our prediction as a mask to pull out a list of indices where the asset is predicted to be an inverter.

In [19]:
# turn the model output into a boolean mask and select the matching row labels
predicted_inverter_mask = pred_is_inverter.astype(bool)
pred_inverter_indices = records_to_gapfill.index[predicted_inverter_mask]
pred_inverter_indices

Index([2, 18, 93, 98, 124, 254, 304, 332, 342, 418, 426], dtype='int64')

We can also collect the indices of the records whose known (non-null) asset is inverter-related. By combining these with the predicted indices, we get a final set of inverter-related records that can be used for other analysis.

In [20]:
# row labels of the records whose recorded (non-null) asset mentions 'Inverter'
asset_present = CM_df['Asset'].notna()
asset_mentions_inverter = CM_df['Asset'].str.contains('Inverter')
known_inverter_indices = CM_df.index[asset_present & asset_mentions_inverter]

# append the row labels that the model predicted to be inverter-related
inverter_indices = [*known_inverter_indices, *pred_inverter_indices]

# pull the full records for the combined set of row labels
inverter_CM_df = CM_df.loc[inverter_indices, :]
inverter_CM_df

Unnamed: 0,randid,WONumber,WOStatus,WOType,Date_EventStart,Date_EventEnd,GeneralDesc,CompletionDesc,ProductionImpact_kWh,WarrantyClaim,...,CompletionActivity,FailureCategories,ImpactLevel,WarrantyType,InsuranceClaim,LaborEffort_Hrs,Cause,OpEventID,states,Commissioning_Date
0,C3S399,WO-0240712,Completed,Corrective,2018-12-31 07:25:00,2019-01-01 08:25:00,Saint Albans - String inverter 3-1-18 - Forced...,Issue self resolved on 01/01/19. TechDispatche...,,No,...,14 - Self Resolved,Failure Class 5 - combiner(s) off-line,Underperformance,,,0.0000,019 - Unplanned outage/derate.,OE-13999474,VT,2018-09-21
1,C3S177,WO-00007036,Closed,Corrective Maintenance,2017-10-16 10:40:00,2017-10-17 10:01:00,Mod 1.7 offline Update 10/16: Alarm on AlsoEne...,Upon arrival I was greeted by autumn landis wi...,,,...,Other,Production Outage,Production Outage,,,0.7956,,,NC,2017-07-20
5,C3S289,WO-1068074,Completed,Corrective,,,Adirondack - Inverter 3.10 - Forced Outage. 7/...,7/18/2019 Tim Chaput - Verified tripped AC bre...,,,...,01 - Replace,Failure Class 3 - single inverter off-line,Underperformance,,,7.6500,010 - Over temperature,,NY,2018-07-25
7,C5S153,5921,Closed,Corrective,2018-11-02 14:25:23,2018-11-05 18:30:39,McKenzie - Inverter C2 - Offline (Hurricane Fl...,S. Hanrahan 2018-11-02 10:26\nInverter is offl...,,,...,,AC Issues,Underperformance,,,7.5000,,,NC,2013-12-23
8,C3S66,WO-1070705,Completed,Corrective,,,IS 46 - Inv W_B3_1 - SMPS Fail Alarm. 8-Nov SC...,11/11: Inspection shows no issues with the inv...,,,...,09 - Inspection,"Failure Class 10 - equipment not off-line, no ...",Underperformance,,,2.0000,0000 - Unknown,,NC,2016-12-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,C1S72,T3-963,Resolved,Corrective,2018-08-13,,8/13 14kW inverter stopped producing with fau...,8/16 Ben found 30A fuse in disconnect blown. ...,,,...,,,,,,,,,HI,2015-03-05
332,C3S43,WO-00003900,Closed,Corrective Maintenance,2017-05-05,2017-05-09 18:15:00,Inverter 5 Faulted State = 0x5500: Stop due to...,Inverter came back online 05/09.,,,...,,,,,,0.0000,,,MN,2016-12-15
342,C2S130,16_001439,Closed,Corrective,2015-06-12 17:10:00,2015-06-13 09:25:00,Plant offline due to grid disturbance,Reclose. Hardware Replaced:,3136.0,No,...,Reclose,Grid-induced Failure/Suspension,Full,,,,,,CA,2014-07-10
418,C3S311,WO-00003541,Closed,Corrective Maintenance,2017-04-17,2017-04-17 13:40:00,Inverter is down and not communicating.. 6422,Power cycled inverter 1.,,,...,,,,,,0.0000,,,OR,2016-10-12


In [21]:
# write the inverter-related records to CSV; index=False leaves the row index out of the file
inverter_CM_df.to_csv('example_data/example_CM_inverter_records.csv', index=False)