In [72]:
# Bibliothèques
import os
import pickle
from pathlib import Path

import nltk
import pandas as pd
import scipy.sparse
from bs4 import BeautifulSoup
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [73]:
# Splitting function for the TF-IDF vectorizer
# NOTE(review): presumably referenced by name from the pickled Tvectorizer, so
# the function name must stay available before that pickle is loaded - confirm.
def text_splitter(text):
    """Split ``text`` on single space characters and return the pieces.

    The separator is passed explicitly, so runs of spaces produce empty
    strings and other whitespace (tabs, newlines) is left untouched.
    """
    separator = ' '
    pieces = text.split(separator)
    return pieces

In [74]:
# Models and data

# Directory holding the serialized artefacts. Defaults to the original
# location; set the OCP6_DATA_DIR environment variable to run the notebook
# on another machine without editing every path.
DATA_DIR = Path(os.environ.get('OCP6_DATA_DIR', 'C:/Users/ayoub/OCP6 Data'))


def _load_pickle(filename):
    """Return the object unpickled from the file ``filename`` inside DATA_DIR.

    SECURITY: ``pickle.load`` can execute arbitrary code while loading, so
    DATA_DIR must only contain files that you produced yourself or trust.
    """
    with open(DATA_DIR / filename, 'rb') as fp:
        return pickle.load(fp)


# tokenizer, lemmatizer and stemmer model
tokenizer = _load_pickle('tokenizer.pk')
lemmatizer = _load_pickle('lemmatizer.pk')
p_stemmer = _load_pickle('p_stemmer.pk')

# NLTK STOPWORDS
engstpw = nltk.corpus.stopwords.words('english')

# bigram and trigram models
bigram_mod = _load_pickle('bigram_mod.pk')
trigram_mod = _load_pickle('trigram_mod.pk')

# TF_IDF_vectorizer
Tvectorizer = _load_pickle('Tvectorizer.pk')

# MultiLabelBinarizer
mlb_object = _load_pickle('mlb_object.pk')

# Dimensions reduction
svdt = _load_pickle('svdt.sav')

# Classifier
sgd_clf = _load_pickle('sgd_clf.sav')


In [75]:
# main function with html input
def predict(question):
    """Predict tags for a question supplied as HTML markup.

    The markup is reduced to its text with BeautifulSoup, then the text is
    lower-cased, tokenized, filtered against the stop-word list, lemmatized,
    stemmed, passed through the bigram and trigram models, vectorized,
    dimension-reduced and classified.

    Returns whatever ``mlb_object.inverse_transform`` yields for the single
    input document, e.g. ``[('css', 'internet-explorer', 'internet-explorer-7')]``.
    """
    # Keep only the text content of the markup.
    plain_text = BeautifulSoup(question, 'lxml').text
    tokens = tokenizer.tokenize(plain_text.lower())

    # Stop words are removed before lemmatizing and stemming what is left.
    content_tokens = [tok for tok in tokens if tok not in engstpw]
    lemmas = list(map(lemmatizer.lemmatize, content_tokens))
    stems = list(map(p_stemmer.stem, lemmas))

    # The bigram model runs first; its output feeds the trigram model.
    phrases = trigram_mod[bigram_mod[stems]]

    # The vectorizer receives a list holding the single rebuilt document.
    document = TreebankWordDetokenizer().detokenize(phrases)
    tfidf = Tvectorizer.transform([document])

    reduced = scipy.sparse.csr_matrix(svdt.transform(tfidf))
    label_matrix = sgd_clf.predict(reduced)
    return mlb_object.inverse_transform(label_matrix)


In [112]:
import json

In [113]:
# Check how json.dumps serializes a list holding one tuple of tag strings
# (the shape shown by predict() further down): the tuple becomes a JSON array.
json.dumps([('css', 'internet-explorer', 'internet-explorer-7')])

'[["css", "internet-explorer", "internet-explorer-7"]]'

In [114]:
# main function with a string input
def predict2(question):
    """Predict tags for a plain-text question and return them as a JSON string.

    Unlike the HTML variant, the input is used as is (no markup stripping).
    The text is lower-cased, tokenized, filtered against the stop-word list,
    lemmatized, stemmed, passed through the bigram and trigram models,
    vectorized, dimension-reduced and classified.

    Returns ``json.dumps`` of the ``mlb_object.inverse_transform`` result.
    """
    tokens = tokenizer.tokenize(question.lower())

    # Stop words are removed before lemmatizing and stemming what is left.
    content_tokens = [tok for tok in tokens if tok not in engstpw]
    lemmas = list(map(lemmatizer.lemmatize, content_tokens))
    stems = list(map(p_stemmer.stem, lemmas))

    # The bigram model runs first; its output feeds the trigram model.
    phrases = trigram_mod[bigram_mod[stems]]

    # The vectorizer receives a list holding the single rebuilt document.
    document = TreebankWordDetokenizer().detokenize(phrases)
    tfidf = Tvectorizer.transform([document])

    reduced = scipy.sparse.csr_matrix(svdt.transform(tfidf))
    label_matrix = sgd_clf.predict(reduced)
    tags = mlb_object.inverse_transform(label_matrix)

    return json.dumps(tags)


In [None]:
import flask
from flask import request, jsonify

app = flask.Flask(__name__)


# HTTP entry point: GET /predict_tags/?question=<plain-text question>
@app.route('/predict_tags/', methods=['GET'])
def home():
    """Return the predicted tags for the ``question`` query parameter as JSON.

    The ``question`` value is read with a plain key lookup on ``request.args``;
    Flask answers a request that lacks the parameter with 400 Bad Request.
    """
    data = request.args
    question = data['question']
    # predict2() already returns a JSON-encoded string. Wrap it in a Response
    # so it is served as application/json instead of the default text/html
    # that Flask uses for bare string return values.
    return flask.Response(predict2(question), mimetype='application/json')


# Start Flask's built-in server; the call blocks until it is interrupted.
app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [20/Feb/2020 13:21:37] "GET /predict_tags/?question=Percentage%20width%20child%20element%20in%20absolutely%20positioned%20parent%20on%20Internet%20Explorer%207I%20have%20an%20absolutely%20positioned%20div%20containing%20several%20children,%20one%20of%20which%20is%20a%20relatively%20positioned%20div.%20When%20I%20use%20a%20percentage-based%20width%20on%20the%20child%20div,%20it%20collapses%20to%200%20width%20on%20IE7,%20but%20not%20on%20Firefox%20or%20Safari.\nIf%20I%20use%20pixel%20width,%20it%20works.%20If%20the%20parent%20is%20relatively%20positioned,%20the%20percentage%20width%20on%20the%20child%20works.\n\nIs%20there%20something%20I%27m%20missing%20here?\nIs%20there%20an%20easy%20fix%20for%20this%20besides%20the%20pixel-based%20width%20on%20the\nchild?\nIs%20there%20an%20area%20of%20the%20CSS%20specification%20that%20covers%20this?\n\n HTTP/1.1" 200 -


## Test: compare predicted tags with the tags stored in the dataset

In [76]:
# Load the CSV used for the manual checks below (its Title, Body and Tags
# columns are read in the following cells).
# NOTE(review): absolute local path - this cell only runs on the author's machine.
df1 = pd.read_csv('C:/Users/ayoub/OCP6 Data/ndf1.csv')

In [77]:
# Build the model input for row 1: the title directly followed by the body,
# with no separator inserted between them.
question = df1.iloc[1].Title + df1.iloc[1].Body

In [80]:
# Display the text BeautifulSoup extracts from the row-1 question.
BeautifulSoup(question, 'lxml').text

"Percentage width child element in absolutely positioned parent on Internet Explorer 7I have an absolutely positioned div containing several children, one of which is a relatively positioned div. When I use a percentage-based width on the child div, it collapses to 0 width on IE7, but not on Firefox or Safari.\nIf I use pixel width, it works. If the parent is relatively positioned, the percentage width on the child works.\n\nIs there something I'm missing here?\nIs there an easy fix for this besides the pixel-based width on the\nchild?\nIs there an area of the CSS specification that covers this?\n\n"

In [79]:
# Predicted tags for the row-1 question.
predict(question)

[('css', 'internet-explorer', 'internet-explorer-7')]

In [85]:
# Tags stored in the dataset for row 1, to compare with the prediction.
df1.iloc[1].Tags

'<html><css><internet-explorer-7>'

In [81]:
# Build the model input for row 500: the title directly followed by the body,
# with no separator inserted between them.
question2 = df1.iloc[500].Title + df1.iloc[500].Body

In [82]:
# Display the text BeautifulSoup extracts from the row-500 question.
BeautifulSoup(question2, 'lxml').text

'How to pass a comma separated list to a stored procedure?So I have a Sybase stored proc that takes 1 parameter that\'s a comma separated list of strings and runs a query with in in an IN() clause:\nCREATE PROCEDURE getSomething @keyList varchar(4096)\nAS\nSELECT * FROM mytbl WHERE name IN (@keyList)\n\nHow do I call my stored proc with more than 1 value in the list?\nSo far I\'ve tried \nexec getSomething \'John\'         -- works but only 1 value\nexec getSomething \'John\',\'Tom\'   -- doesn\'t work - expects two variables\nexec getSomething "\'John\',\'Tom\'" -- doesn\'t work - doesn\'t find anything\nexec getSomething \'"John","Tom"\' -- doesn\'t work - doesn\'t find anything\nexec getSomething \'\\\'John\\\',\\\'Tom\\\'\' -- doesn\'t work - syntax error\n\nEDIT: I actually found this page that has a great reference of the various ways to pas an array to a sproc\n'

In [83]:
# Predicted tags for the row-500 question.
predict(question2)

[('sql', 'sybase-ase')]

In [86]:
# Tags stored in the dataset for row 500, to compare with the prediction.
df1.iloc[500].Tags

'<sql><sybase-ase>'