<a href="https://colab.research.google.com/github/Yolantele/ML-data-clasifier/blob/master/SpaCy_ML_Classifier_for_Waste_Data_Augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

####**SpaCy Data Classification POC**

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [23]:
# mount data from drive
from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/My Drive/data/'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -U spacy==2.2.2
!pip install pandas

In [24]:
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd
from spacy.lang.en import English
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin


# References:
# https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/

spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

In [34]:
materials = pd.read_csv(path + '/enMaterialData.csv')
# or use test data frame where material field is empty
materials_test = pd.read_csv(path + '/enWithoutMaterialData.csv')

df = materials
df.head()
# df.info()
# df.description + df.euralDescription
# df.description + df.euralDescription


Unnamed: 0,reason,origin,color,state,size,consistency,otherCode,material4,material3,material2,material,mType,composite2,composite1,cType,indirectProduct,directProduct,pType,mixedOrPure,cleanOrDirty,euralDescription,euralCode,description,/0
0,,,,,,slurry,,,,oil,organic material,Soy,,dry,,dry,,,1.0,,material unsuitable for consumption or processing,20304,Soyadroes technically,
1,,,,,,vast,,,,,wood,branches,,,,wood,,,0.0,,waste from forestry,20107,Branches,
2,,,,,,vast,,,,,cocoa,caps,,,,cocoa,,,0.0,,material unsuitable for consumption or processing,20304,Cocoa shells,
3,,,,debris,,vast,,,,,sand,,,debris,,debris,,,1.0,1.0,"mixtures of concrete, stones, tiles or ceramic...",170107,Debris with Sand,
4,,,,,,vast,,,,,organic material,Soy,,,,organic material,,,0.0,,material unsuitable for consumption or processing,20304,soya,


###Tokening the Data With spaCy

Now that we know what we’re working with, let’s create a custom tokenizer function using spaCy. We’ll use this function to automatically strip information we don’t need, like stopwords and punctuation, from each review.

In [115]:

parser = English()

def spacy_tokenize(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    sentence = sentence.strip().lower()
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens


###Vectorization Feature Engineering (TF-IDF) , Bag of Words and N-grams

Classifying text we end up with text snippets with their respective labels. But in machine learning model we need to convert into numeric representation

TF-IDF -Term Frequency-Inverse Document Frequency - simply a way of normalizing our Bag of Words(BoW) by looking at each word’s frequency in comparison to the document frequency.

N-grams - combinations of adjacent words in a given text. For example "who will win"
- when n = 1, becomes "who", "will", "win"
- when n = 2 , becomes "who will", "will win" etc. 

In [116]:
#bag of words vector
bow_vector = CountVectorizer(tokenizer=spacy_tokenize, ngram_range=(1,1))
print(bow_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function spacy_tokenize at 0x7f5ebe885e18>,
                vocabulary=None)


In [117]:
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenize)
print(tfidf_vector)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function spacy_tokenize at 0x7f5ebe885e18>,
                use_idf=True, vocabulary=None)


### Splitting The Data into Training and Validation Sets


In [118]:
from sklearn.model_selection import train_test_split

X = df['description'] # the features we want to analyze
ylabels = df['material'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

print(X)

0                                   Soyadroes technically
1                                                Branches
2                                            Cocoa shells
3                                        Debris with Sand
4                                                    soya
                              ...                        
2115                                          LDPE - 98:2
2116    hollow variegated glass, route collection fire...
2117                  Soil mixed with contaminated stones
2118    Stainless steel, Motors, Refiner, Tin / Lead, ...
2119     soil / gravel / stones / cloths oil contaminated
Name: description, Length: 2120, dtype: object


### Creating a Pipeline and Generating the Model

In [119]:
# Creating Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

classifier = LogisticRegression()

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x7f5eb771e080>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 t...\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenize at 0x7f5ebe885e18>,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
             

In [121]:
from sklearn import metrics
row = 33
# Predicting with a test dataset
predicted = pipe.predict(X_test)


print('material description was ----> ', X_test.iloc[row])
print('material predicted is ----->', predicted[row])

material description was ---->  BRAC ind reusable N building material
material predicted is -----> organic material


In [184]:

# Model Accuracy
print("Accuracy:",metrics.accuracy_score(y_test, predicted ))
print("Precision:",metrics.precision_score(y_test, predicted, average='weighted'))
print("Recall:",metrics.recall_score(y_test, predicted, average='weighted'))


print(metrics.classification_report(y_test, predicted))

Accuracy: 0.8286163522012578
Precision: 0.8213044253850307
Recall: 0.8286163522012578
                     precision    recall  f1-score   support

                EPS       1.00      0.83      0.91         6
              Glass       0.94      0.94      0.94        16
               HDPE       1.00      0.50      0.67         2
               LDPE       1.00      1.00      1.00         9
                 ON       1.00      1.00      1.00         2
                 PC       0.00      0.00      0.00         2
                PET       1.00      1.00      1.00         1
               PFOS       0.00      0.00      0.00         1
               PMMA       0.00      0.00      0.00         3
                 PP       0.75      1.00      0.86         3
                 PS       1.00      0.67      0.80         3
                PVC       1.00      0.67      0.80         3
            Perlite       0.00      0.00      0.00         1
                RVS       0.33      0.33      0.33         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Plot the description and material Outcomes

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# plot
chosen_material = 'ground'


data = df.loc[df.material ==chosen_material]
sns.set_style('ticks')
fig, ax = plt.subplots()
fig.set_size_inches(1, 85)
sns.regplot(x='material', y='description', data=data, ax=ax)
