<h1>Projet_OC_05 : Catégorisez automatiquement des questions (API)</h1>

# Sommaire :

**Partie 1 : Configuration du notebook**

 - <a href="#C11">P1.1 : Chargement des librairies </a>
 - <a href="#C12">P1.2 : Fonctions </a>
 - <a href="#C13">P1.3 : Chargement des données</a>
 
**Partie 2 : Représentation de données**

 - <a href="#C21">P2.1 : Structure de données </a>
 - <a href="#C22">P2.2 : NaN et doublons </a>
 - <a href="#C23">P2.3 : Inspection de données </a> 
 
**Partie 3 : Analyse de données**

 - <a href="#C31">P3.1 : Structure de données </a>
 - <a href="#C32">P3.2 : Nettoyage de données </a>
 - <a href="#C33">P3.3 : Visualisation par wordcloud </a>
 
  
**Partie 4 : Enregistrement de données**

 - <a href="#C41">P4.1 : nregistrement de données </a>

<h1>Partie 1 : Configuration du notebook</h1>

# <a name="C11"> P1.1 : Chargement des librairies </a>

In [10]:
import os, sys, time

import numpy as np
import pandas as pd

import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
from bs4 import BeautifulSoup
import string

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, LogisticRegression, Perceptron
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import hamming_loss, jaccard_score
import tensorflow_hub as hub

import joblib
from mlflow.models.signature import infer_signature
import mlflow.sklearn 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

**La liste des librairies ci-dessus sont chargées.**

# <a name="C12"> P1.2 : Fonctions </a>

In [11]:
def tags_process(sentence, most_freq_tags):
    
    sentence_process = (sentence.replace('<', ' ').replace('>', ' ').replace('/', ' ').strip()).split()
    
    sentence_filter = [word for word in sentence_process if word in most_freq_tags]                
    
    if sentence_filter:
        return ' '.join(sentence_filter)
    else:
        return np.nan

In [39]:
def tag_ponc_process(sentence):
    
    return sentence.replace('c#', 'csharp').replace('c++', 'cplusplus').replace('.net', 'dotnet').replace('objective-c', 'objectivec').replace('ruby-on-rails', 'rubyonrails')\
                .replace('sql-server', 'sqlserver').replace('node.js', 'nodedotjs').replace('aspdotnet-mvc', 'aspdotnetmvc').replace('visual-studio', 'visualstudio').replace('visual studio', 'visualstudio')\
                .replace('unit-testing', 'unittesting').replace('cocoa-touch', 'cocoatouch').replace('python-3.x', 'python3x').replace('entity-framework', 'entityframework')\
                .replace('language-agnostic', 'languageagnostic').replace('amazon-web-services', 'amazonwebservices').replace('google-chrome', 'googlechrome').replace('user-interface', 'userinterface')\
                .replace('design-patterns', 'designpatterns').replace('version-control', 'versioncontrol').strip()

In [47]:
class TXTProcesser(TransformerMixin, BaseEstimator):
    
    def __init__(self, stop_words, authorized_pos, no_pos_tag_list, no_lem_stem_list, nlp, embed):
        
        self.stop_words = stop_words
        self.authorized_pos = authorized_pos
        self.no_pos_tag_list = no_pos_tag_list
        self.no_lem_stem_list = no_lem_stem_list
        self.nlp = nlp
        self.embed = embed
    
    def fit(self, X, Y=None):

        return self
    
    def transform(self, X, Y=None):
        
        X_clean = []
        
        if isinstance(X, list):
            for x in X:
                X_clean.append(self.txt_clean(x))            
    
            return self.txt_use_feature(X_clean)
            
        else:        
            
            for x in [X]*10:
                X_clean.append(self.txt_clean(x))            

            return [self.txt_use_feature(X_clean)[0]]
    
    def txt_clean(self, X, Y=None):
  
        sentence_lower = X.lower()
    
        sentence_no_html_raw = BeautifulSoup(sentence_lower, "html.parser")

        for data in sentence_no_html_raw(['style', 'script', 'code', 'a']):
            # Remove tags
            data.decompose()

        sentence_no_html = ' '.join(sentence_no_html_raw.stripped_strings)

        sentence_no_abb = sentence_no_html.replace("what's", "what is ").replace("\'ve", " have ").replace("can't", "can not ").replace("n't", " not ").replace("i'm", "i am ")\
                           .replace("\'re", " are ").replace("\'d", " would ").replace("\'ll", " will ").replace("\'scuse", " excuse ").replace(' vs ', ' ').replace('difference between', ' ')

        sentence_no_abb_trans = sentence_no_abb.replace('c#', 'csharp').replace('c++', 'cplusplus').replace('.net', 'dotnet').replace('objective-c', 'objectivec').replace('ruby-on-rails', 'rubyonrails')\
                .replace('sql-server', 'sqlserver').replace('node.js', 'nodedotjs').replace('aspdotnet-mvc', 'aspdotnetmvc').replace('visual-studio', 'visualstudio').replace('visual studio', 'visualstudio')\
                .replace('unit-testing', 'unittesting').replace('cocoa-touch', 'cocoatouch').replace('python-3.x', 'python3x').replace('entity-framework', 'entityframework')\
                .replace('language-agnostic', 'languageagnostic').replace('amazon-web-services', 'amazonwebservices').replace('google-chrome', 'googlechrome').replace('user-interface', 'userinterface')\
                .replace('design-patterns', 'designpatterns').replace('version-control', 'versioncontrol').strip()

        sentence_no_new_line = re.sub(r'\n', ' ', sentence_no_abb_trans)

        translator = str.maketrans(dict.fromkeys(string.punctuation, ' '))
        sentence_no_caracter = sentence_no_new_line.translate(translator)

        sentence_no_stopwords = ' '.join([word for word in sentence_no_caracter.split() if word not in self.stop_words])

        sentence_tokens =  [token.text for token in self.nlp(sentence_no_stopwords) if token.tag_ in self.authorized_pos and len(token.text)>=3 or token.text in self.no_pos_tag_list] 


        lemmatizer = WordNetLemmatizer()
        lem_or_stem_tokens = [lemmatizer.lemmatize(word) if word not in self.no_lem_stem_list else word for word in sentence_tokens]


        final_sentence = ' '.join(sentence_tokens).replace('csharp', 'c#').replace('cplusplus', 'c++').replace('dotnet', '.net').replace('objectivec', 'objective-c').replace('rubyonrails', 'ruby-on-rails')\
                .replace('sqlserver', 'sql-server').replace('nodedotjs', 'node.js').replace('aspdotnetmvc', 'aspdotnet-mvc').replace('visualstudio', 'visual-studio')\
                .replace('unittesting', 'unit-testing').replace('cocoatouch', 'cocoa-touch').replace('python3x', 'python-3.x').replace('entityframework', 'entity-framework')\
                .replace('languageagnostic', 'language-agnostic').replace('amazonwebservices', 'amazon-web-services').replace('googlechrome', 'google-chrome').replace('userinterface', 'user-interface')\
                .replace('designpatterns', 'design-patterns').replace('versioncontrol', 'version-control').strip()

        return final_sentence  
    
    def txt_use_feature(self, X, Y=None):
        
        batch_size = 8
        
        for step in range(len(X)//batch_size) :
            idx = step*batch_size
            feat = self.embed(X[idx:idx+batch_size])

            if step ==0 :
                features = feat
            else :
                features = np.concatenate((features,feat))

        return features

In [54]:
class TXTModel(TransformerMixin, BaseEstimator):
    
    def __init__(self, clf, ml_binarizer, nmp):
        self.clf = clf
        self.ml_binarizer = ml_binarizer
        self.nmp = nmp
        
    def transform(self, Y):
        
        return self.ml_binarizer.transform(Y) 
    
    def fit(self, X, Y):
        self.ml_binarizer.fit(Y)
        self.clf.fit(X, self.ml_binarizer.transform(Y))
        
    def predict(self, X):
        
        return self.clf.predict(X)
    
    def decision_function(self, X):

        dfun = self.clf.decision_function(X)
        most_common_idx = self.nmp.argsort(dfun)[:, -5:]
        return self.classes_(most_common_idx)
        
    def inverse_transform(self, Yt):
        
        return self.ml_binarizer.inverse_transform(Yt)      
    
    def classes_(self, Y_idx):
        
        return self.ml_binarizer.classes_[Y_idx]
    

# <a name="C13"> P1.3 : Chargement des données </a>

In [35]:
raw_txt_data = pd.read_csv('data.csv')
raw_txt_data = raw_txt_data.select_dtypes(include=object)
raw_txt_data.dropna(inplace=True)

print('-'*150)
print('Data size:', raw_txt_data.shape)
print('-'*150)
raw_txt_data.head()

------------------------------------------------------------------------------------------------------------------------------------------------------
Data size: (99997, 3)
------------------------------------------------------------------------------------------------------------------------------------------------------


Unnamed: 0,Title,Body,Tags
0,Find Mime type of file or url using php for al...,<p>Hi I am looking for best way to find out mi...,<php><amazon-web-services><mime-types><content...
1,native zlib inflate/deflate for swift3 on iOS,<p>I'd like to be able to inflate/deflate Swif...,<ios><swift><swift3><zlib><swift-data>
2,`Sudo pip install matplotlib` fails to find fr...,<p>I already have <code>matplotlib-1.2.1</code...,<python><numpy><matplotlib><homebrew><osx-mave...
3,Serialization in C# without using file system,<p>I have a simple 2D array of strings and I w...,<c#><sharepoint><serialization><moss><wss>
4,How do I prevent IIS from compiling website?,<p>I have an ASP .NET web application which on...,<asp.net><performance><web-services><iis><asmx>


In [36]:
most_common_val = 50
all_tags = ' '.join(raw_txt_data.Tags.apply(lambda sentence: sentence.replace('<', ' ').replace('>', ' ')).tolist()).split()
unique_tags = list(set(all_tags))
keywords = nltk.FreqDist(all_tags)
most_common_tags = [word[0] for word in keywords.most_common(most_common_val)]

raw_txt_data['Tags'] = raw_txt_data.Tags.apply(lambda sentence: tags_process(sentence, most_common_tags))
raw_txt_data.dropna(inplace=True)

In [55]:
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
authorized_pos = ['NN', 'NNS', 'NNP', 'NNPS']
no_pos_tag_list = tag_ponc_process(' '.join(most_common_tags)).split()
no_lem_stem_list = tag_ponc_process(' '.join(most_common_tags)).split()
stop_words = list(set(stopwords.words('english'))) + \
                ['[', ']', ',', '.', ':', '?', '(', ')']
stop_words.extend(['good', 'idea', 'solution', 'issue', 'problem', 'way', 'example', 'case', 'question', 'questions', 'something', 'everything',
                   'anything', 'thing', 'things', 'answer', 'thank', 'thanks', 'none', 'end', 'anyone', 'test', 'lot', 'one', 'someone', 'help'])


clf = OneVsRestClassifier(LinearSVC())
ml_binarizer = MultiLabelBinarizer()
nmp = np

In [67]:
data = raw_txt_data.sample(frac=0.1)

X = (data.Title + ' ' + data.Body).tolist()

y = data.Tags
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1) 

idx_norm = 8

X_train = X_train[:(len(X_train)//idx_norm)*idx_norm]
X_test = X_test[:(len(X_test)//idx_norm)*idx_norm]
y_train = y_train[:(len(y_train)//idx_norm)*idx_norm]
y_test = y_test[:(len(y_test)//idx_norm)*idx_norm]

y_train_list = y_train.apply(lambda x: x.split()).tolist()
y_test_list = y_test.apply(lambda x: x.split()).tolist()

# <a name="C21"> P2.1 : Pipeline </a>

In [68]:
pipe = Pipeline([('transformer', TXTProcesser(stop_words, authorized_pos, no_pos_tag_list, no_lem_stem_list, nlp, embed)),
                 ('model', TXTModel(clf, ml_binarizer, nmp))])

In [69]:
pipe.fit(X_train, y_train_list)



# <a name="C22"> P2.2 : déploiement du pipeline </a>

In [25]:
joblib.dump('pipe', 'pipeline_housing.joblib')

['pipeline_housing.joblib']

In [33]:
signature = infer_signature(np.array(X_train[:(len(X_train)//10)*10]), np.array(y_train_list[:(len(X_train)//10)*10]))

  signature = infer_signature(np.array(X_train[:(len(X_train)//10)*10]), np.array(y_train_list[:(len(X_train)//10)*10]))


In [34]:
mlflow.sklearn.save_model(pipe, 'mlflow_model', signature)

PickleError: Can't pickle repeated scalar fields, convert to list first