In [96]:
import pandas as pd
import numpy as np
from ast import literal_eval

from sklearn.preprocessing import MultiLabelBinarizer
import gensim.corpora as corpora
import gensim
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics

In [97]:
data = pd.read_csv("../QueryResults_Clear.csv",
                   converters={"MainWordTitle": literal_eval,
                               "MainWordBody": literal_eval,
                               "MainTags": literal_eval})
data.head()

Unnamed: 0,Id,MainTags,MainWordBody,MainWordTitle
0,7477,"[javascript, html, css]","[sale, application, company, form, user, deliv...","[textarea, prototype]"
1,7492,[performance],"[past, microsoft, web, application, stress, to...","[stress, test, web, application]"
2,17806,[.net],"[application, project, project, others, projec...","[warning, conflict, version, assembly]"
3,8896,"[c, #, c++, windows]","[window, environment, solution, aspnet, mvc, s...","[window, environment]"
4,8948,[python],"[metadata, python]","[metadata, python]"


In [98]:
Xall = data['MainWordBody'] + data['MainWordTitle']

Y = data['MainTags']
# Binarise multilabel
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(Y)
Y_multi = multilabel_binarizer.transform(Y)

Xall

0        [sale, application, company, form, user, deliv...
1        [past, microsoft, web, application, stress, to...
2        [application, project, project, others, projec...
3        [window, environment, solution, aspnet, mvc, s...
4                     [metadata, python, metadata, python]
                               ...                        
19388    [python, ubuntu, lts, link, doubt, resource, c...
19389    [repository, commit, repository, github, diffe...
19390    [module, routing, state, user, search, documen...
19391    [question, target, rgb, color, formula, black,...
19392    [studio, type, class, library, class, library,...
Length: 19393, dtype: object

In [99]:
#Init TFIDF vector
vectorizer = TfidfVectorizer(analyzer="word",
                             max_df=.6,
                             min_df=0.005,
                             tokenizer=None,
                             preprocessor=' '.join,
                             stop_words=None,
                             lowercase=False)
vectorizer.fit(Xall)
X_tfidf = vectorizer.transform(Xall)

In [100]:
# Bag of words
bagOfWords = corpora.Dictionary(Xall)
bagOfWords.filter_extremes(no_below=4, no_above=0.6, keep_n=None)

# Term Document Frequency 
corpus = [bagOfWords.doc2bow(text) for text in Xall]  
# View 
[[(bagOfWords[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('address', 5),
  ('application', 1),
  ('area', 4),
  ('bit', 1),
  ('character', 1),
  ('chris', 1),
  ('code', 1),
  ('company', 1),
  ('consideration', 1),
  ('delivery', 1),
  ('detail', 1),
  ('field', 1),
  ('font', 1),
  ('form', 1),
  ('idea', 1),
  ('information', 1),
  ('iso', 1),
  ('keyup', 1),
  ('line', 2),
  ('number', 1),
  ('people', 1),
  ('point', 1),
  ('problem', 3),
  ('prototype', 1),
  ('reason', 1),
  ('resizing', 1),
  ('sale', 1),
  ('screen', 1),
  ('screenshot', 2),
  ('scrollbar', 1),
  ('shot', 1),
  ('size', 1),
  ('space', 2),
  ('text', 5),
  ('textarea', 3),
  ('user', 3),
  ('way', 1),
  ('width', 1),
  ('window', 1)]]

In [101]:
metricsEval = pd.DataFrame(columns=["nom","Accuracy","Jaccard","Precision"])

In [102]:
# calcul de metrics
def calculMetric(model,y,y_pred):
    locData = [model]
    acc = metrics.accuracy_score(y,y_pred)
    locData.append(acc)
    jac = metrics.jaccard_score(y,y_pred,average='weighted')
    locData.append(jac)
    prec = metrics.precision_score(y,y_pred,average='weighted')
    locData.append(prec)
    
    metricsEval.loc[len(metricsEval)]=locData

# LDA

In [103]:
lda_model = gensim.models.ldamulticore\
                    .LdaMulticore(corpus=corpus,
                                  id2word=bagOfWords,
                                  num_topics=20,
                                  random_state=8,
                                  per_word_topics=True,
                                  workers=4)

In [104]:
# Calculate Document/topic matrix with Gensim
doc_topic = pd.DataFrame(lda_model.get_document_topics(corpus,
                                                  minimum_probability=0))
for topic in doc_topic.columns:
    doc_topic[topic] = doc_topic[topic].apply(lambda x : x[1])

In [105]:
# Matricial multiplication with Document / Topics transpose
topic_tag = np.matmul(doc_topic.T, Y_multi)

In [106]:
y_results = pd.DataFrame(Y)
y_results["best_topic"] = doc_topic.idxmax(axis=1).values
y_results["nb_tags"] = y_results["MainTags"].apply(lambda x : len(x))

df_y_bin = pd.DataFrame(Y_multi)
df_dict = dict(
    list(
        df_y_bin.groupby(df_y_bin.index)
    )
)

tags_num = []
for k, v in df_dict.items():
    check = v.columns[(v == 1).any()]
    tags_num.append(check.to_list())

y_results["y_true"] = tags_num

In [107]:
# Select predicted tags in Topics / Tags matrix
list_tag = []
for row in y_results.itertuples():
    nb_tags = row.nb_tags
    best_topic = row.best_topic
    row_tags = list(topic_tag.iloc[best_topic]\
                    .sort_values(ascending=False)[0:nb_tags].index)
    list_tag.append(row_tags)
    
y_results["y_pred"] = list_tag

In [108]:
# Create matrix for pred and true y LDA
lda_y_pred = np.zeros(Y_multi.shape)
n = 0
for row in y_results.y_pred.values:
    for i in range(len(row)):
        lda_y_pred[n,row[i]] = 1
    n+=1
    
lda_y_true = np.zeros(Y_multi.shape)
m = 0
for row in y_results.y_true.values:
    for i in range(len(row)):
        lda_y_true[m,row[i]] = 1
    m+=1

In [109]:
calculMetric("LDA",lda_y_true,lda_y_pred)

  _warn_prf(average, modifier, msg_start, len(result))


# Regression linéaire

In [110]:
# Create train and test split (80%)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, Y_multi,
                                                    test_size=0.2)

In [111]:
param_logit = {"estimator__C": [100, 10, 1.0, 0.1],
               "estimator__penalty": ["l1", "l2"],
               "estimator__dual": [False],
               "estimator__solver": ["liblinear"]}

multi_logit_cv = GridSearchCV(OneVsRestClassifier(LogisticRegression()),
                              param_grid=param_logit,
                              n_jobs=-1,
                              cv=5,
                              scoring="f1_weighted",
                              return_train_score = True,
                              refit=True)

multi_logit_cv.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=OneVsRestClassifier(estimator=LogisticRegression()),
             n_jobs=-1,
             param_grid={'estimator__C': [100, 10, 1.0, 0.1],
                         'estimator__dual': [False],
                         'estimator__penalty': ['l1', 'l2'],
                         'estimator__solver': ['liblinear']},
             return_train_score=True, scoring='f1_weighted')

In [112]:
print(multi_logit_cv.best_params_)

{'estimator__C': 10, 'estimator__dual': False, 'estimator__penalty': 'l1', 'estimator__solver': 'liblinear'}


In [113]:
# Predict
y_pred_logreg = multi_logit_cv.predict(X_test)
# Inverse transform
y_pred_inversed = multilabel_binarizer.inverse_transform(y_pred_logreg)
y_inversed = multilabel_binarizer.inverse_transform(y_test)

print(y_pred_inversed[0:5])
print('-'*10)
print(y_inversed[0:5])

[(), (), (), (), ()]
----------
[('javascript', 'performance'), ('ruby',), ('java',), ('java',), ('android', 'java')]


In [114]:
calculMetric("RegLog",y_test,y_pred_logreg)

In [115]:
print(metricsEval)

      nom  Accuracy   Jaccard  Precision
0     LDA  0.156964  0.090159   0.184534
1  RegLog  0.389791  0.451121   0.736817


# Export pkl

In [117]:
joblib.dump(multi_logit_cv,'logit_nlp_model.pkl')
joblib.dump(vectorizer,'tfidf_vectorizer.pkl')
joblib.dump(multilabel_binarizer,'multilabel_binarizer.pkl')

['multilabel_binarizer.pkl']