# Projet de modélisation des fonctionnalités de prédiction des tags 

## Imports

In [11]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime


from nltk.corpus import wordnet
from nltk.corpus import stopwords
import nltk
import nltk.data
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import string
from langdetect import detect
import re
import spacy
from spacy.symbols import ORTH, NORM
from langdetect import detect_langs
import langid
from bs4 import BeautifulSoup

import gensim
from gensim import corpora, models
from pprint import pprint


import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)


## Import dataset

In [2]:
data = pd.read_csv('./dataset/preprocessed_dataset.csv',sep=",",nrows=1000)

In [3]:
data.head()

Unnamed: 0,body,tags
0,programmat edit rout tabl write daemon run emb platform need chang default rout devic accord interfac connect give time programat know use system rout del default rout add default gateway blah etc direct way updat solv particular problem discov patch pppd allow replacedefaultrout option patch also includ code programmat modifi rout tabl includ gentoo bug report,"c,linux,networking"
1,eras content graphic gdi+ use gdi+ c++ two problem find canvasimg black chang color white mean want white canva draw thing canva clear canva mani thank,"canvas,gdi+"
2,access state individu bit word mip write program need determin bit set know rotat word leav right shift access individu bite state use bitwis oper like xor,"assembly,bit-manipulation,mips"
3,handl close applic event java consol applic server accept sever connect client possibl listen event close applic want event tell connect client gentl disconnect applic realli close solut thank,"java,events,listener"
4,read file content client side javascript various browser attempt provid script solut read content file client machin browser solut work firefox internet explor pretti tri thing moment call write content filecont text area way browser concern safari chrome moment open suggest browser edit respons question want basic want hash file content togeth one time password client side send inform back verif,"javascript,html,file-io,sandbox"


## Bag of Words

In [4]:
def tokenize(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS:
            result.append(token)
    return result

processed_body = data['body'].map(tokenize)
processed_body[:10]

dictionary = gensim.corpora.Dictionary(processed_body)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 accord
1 add
2 allow
3 blah
4 bug
5 chang
6 code
7 connect
8 daemon
9 default
10 del


### Gensim doc2bow

On crée pour chaque post un dictionnaire qui met en perspective combien de mots il comprend et combien de fois ce mot apparait.

In [5]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_body]

In [8]:
bow_corpus[1]

[(5, 1),
 (28, 1),
 (38, 1),
 (41, 1),
 (42, 3),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 1),
 (48, 1),
 (49, 2),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 2)]

### Preview du Bag of Words

In [9]:
bow_show = bow_corpus[10]
for i in range(len(bow_show)):
    print("Word {} (\"{}\") appears {} time.".format(bow_show[i][0], 
                                               dictionary[bow_show[i][0]], 
                                                bow_show[i][1]))

Word 2 ("allow") appears 1 time.
Word 28 ("problem") appears 2 time.
Word 38 ("use") appears 4 time.
Word 39 ("way") appears 4 time.
Word 52 ("mean") appears 1 time.
Word 53 ("thank") appears 1 time.
Word 54 ("thing") appears 1 time.
Word 55 ("want") appears 2 time.
Word 57 ("access") appears 2 time.
Word 64 ("like") appears 3 time.
Word 87 ("realli") appears 1 time.
Word 90 ("solut") appears 1 time.
Word 110 ("pretti") appears 1 time.
Word 112 ("question") appears 1 time.
Word 123 ("work") appears 2 time.
Word 126 ("deriv") appears 5 time.
Word 134 ("implement") appears 3 time.
Word 135 ("list") appears 1 time.
Word 138 ("object") appears 1 time.
Word 141 ("type") appears 11 time.
Word 143 ("actual") appears 1 time.
Word 166 ("class") appears 2 time.
Word 172 ("day") appears 1 time.
Word 194 ("obvious") appears 1 time.
Word 209 ("turn") appears 1 time.
Word 219 ("come") appears 2 time.
Word 228 ("fair") appears 1 time.
Word 233 ("input") appears 1 time.
Word 276 ("store") appears 3 ti

## TF-IDF

Création d'un modèle TF-IDF sur bow_corpus

In [12]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.1061548209262387),
 (1, 0.05448144943500308),
 (2, 0.08236867615996893),
 (3, 0.14733915900622319),
 (4, 0.10450332741215089),
 (5, 0.053157571397341015),
 (6, 0.036744320464184396),
 (7, 0.07692938445334771),
 (8, 0.13997758535378826),
 (9, 0.25564457623776204),
 (10, 0.13426750521544115),
 (11, 0.11922647471702073),
 (12, 0.08841769195542001),
 (13, 0.12565742697602728),
 (14, 0.05919225548252903),
 (15, 0.1061548209262387),
 (16, 0.17545184329542562),
 (17, 0.17545184329542562),
 (18, 0.14967860685507386),
 (19, 0.09412777209376712),
 (20, 0.04299796055040209),
 (21, 0.10010580513078761),
 (22, 0.03911539211051107),
 (23, 0.08236867615996893),
 (24, 0.0867661984413322),
 (25, 0.27995517070757653),
 (26, 0.1061548209262387),
 (27, 0.17545184329542562),
 (28, 0.04590374219042576),
 (29, 0.12960203003540446),
 (30, 0.23306075248924493),
 (31, 0.10148934574620203),
 (32, 0.6480101501770223),
 (33, 0.04899883095440542),
 (34, 0.09308316713545667),
 (35, 0.12234504711571867),
 (36,

## Exécution d'un LDA en utilisant notre Bag of Words

In [17]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=50, id2word=dictionary, passes=2, workers=2)

In [18]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.015*"like" + 0.013*"work" + 0.012*"use" + 0.010*"code" + 0.010*"differ" + 0.009*"test" + 0.008*"implement" + 0.008*"set" + 0.007*"server" + 0.007*"data"
Topic: 1 
Words: 0.021*"use" + 0.015*"servic" + 0.010*"web" + 0.010*"html" + 0.009*"url" + 0.009*"want" + 0.009*"method" + 0.009*"net" + 0.008*"need" + 0.008*"code"
Topic: 2 
Words: 0.021*"use" + 0.016*"class" + 0.015*"applic" + 0.013*"problem" + 0.012*"file" + 0.010*"charact" + 0.009*"databas" + 0.009*"like" + 0.008*"code" + 0.008*"project"
Topic: 3 
Words: 0.111*"student" + 0.015*"cursor" + 0.014*"jboss" + 0.014*"control" + 0.014*"free" + 0.013*"studentlist" + 0.012*"return" + 0.012*"alloc" + 0.010*"code" + 0.010*"pattern"
Topic: 4 
Words: 0.022*"data" + 0.016*"file" + 0.015*"use" + 0.013*"row" + 0.011*"chang" + 0.011*"text" + 0.011*"tri" + 0.010*"user" + 0.010*"want" + 0.009*"way"
Topic: 5 
Words: 0.017*"process" + 0.015*"file" + 0.014*"net" + 0.014*"mx" + 0.012*"asp" + 0.012*"object" + 0.011*"client" + 0.010*"nee

## Exécution d'un LDA utilisant notre TF-IDF

In [19]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=50, id2word=dictionary, passes=2, workers=4)

In [39]:
print('\nPerplexity Score: ' + str(lda_model_tfidf.log_perplexity(corpus_tfidf)) + '\n')
#coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=data_bigrams, dictionary=id2word, coherence='c_v')
#print('\nCoherence Score: ', coherence_model_lda.get_coherence())


Perplexity Score: -18.577464167855613



In [20]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.011*"button" + 0.008*"print" + 0.007*"asp" + 0.006*"html" + 0.006*"postback" + 0.005*"linq" + 0.005*"click" + 0.005*"insert" + 0.005*"net" + 0.005*"hello"
Topic: 1 Word: 0.006*"argument" + 0.006*"compat" + 0.006*"browser" + 0.005*"test" + 0.005*"bash" + 0.005*"script" + 0.005*"return" + 0.004*"close" + 0.004*"function" + 0.004*"formview"
Topic: 2 Word: 0.009*"api" + 0.007*"method" + 0.005*"class" + 0.005*"file" + 0.005*"lectur" + 0.004*"odbc" + 0.004*"dbf" + 0.004*"function" + 0.004*"creat" + 0.004*"dataitem"
Topic: 3 Word: 0.010*"tabl" + 0.010*"page" + 0.009*"load" + 0.008*"text" + 0.007*"word" + 0.007*"javascript" + 0.007*"data" + 0.006*"read" + 0.006*"datetim" + 0.006*"databas"
Topic: 4 Word: 0.007*"declar" + 0.005*"probe" + 0.005*"implement" + 0.004*"email" + 0.004*"runtim" + 0.004*"sign" + 0.004*"interfac" + 0.004*"ilist" + 0.004*"mono" + 0.004*"viewcontrol"
Topic: 5 Word: 0.008*"cluster" + 0.008*"string" + 0.006*"sql" + 0.006*"databas" + 0.006*"newlin" + 0.006*"e

#### Création d'un dictionnaire des topics

In [37]:
topics_dictionary = []
for idx, topic in lda_model_tfidf.print_topics(-1):
    #t = set(topic.split('+'))
    #to_show = ' '.join(t)        
    #to_show = re.sub("[0-9]", "", topic)

    list_of_char = ['.', '*', '+', '"', "0",'1','2','3','4','5','6','7','8','9']
    pattern = '[' + ''.join(list_of_char) + ']'    
    to_show = re.sub(pattern, '', topic)
    
    print('Topic: {} Words: {}'.format(idx, to_show))
    topics_dictionary.append(to_show)

Topic: 0 Words: button  print  asp  html  postback  linq  click  insert  net  hello
Topic: 1 Words: argument  compat  browser  test  bash  script  return  close  function  formview
Topic: 2 Words: api  method  class  file  lectur  odbc  dbf  function  creat  dataitem
Topic: 3 Words: tabl  page  load  text  word  javascript  data  read  datetim  databas
Topic: 4 Words: declar  probe  implement  email  runtim  sign  interfac  ilist  mono  viewcontrol
Topic: 5 Words: cluster  string  sql  databas  newlin  express  port  heartbeat  line  iseri
Topic: 6 Words: page  session  free  gwt  maximum  result  mani  time  master  tabl
Topic: 7 Words: servic  dictionari  setter  duplic  code  dlls  textwrap  python  list  past
Topic: 8 Words: block  car  user  commit  pcap  asynchron  strength  comput  profil  packet
Topic: 9 Words: control  variabl  design  list  collect  ienumer  button  global  trust  helper
Topic: 10 Words: webservic  live  widget  oper  view  cursor  status  vba  hold  index
To

In [38]:
topics_dictionary[0]

'button  print  asp  html  postback  linq  click  insert  net  hello'

In [40]:
topicNames_1 = []
for idx in range(0, gcv_lda_1.best_params_["n_components"]):
    topicNames_1.append("Topic_{}".format(idx))

df_topics_pred_1 = pd.DataFrame(data=topics_pred_1,
                                index=df_quest_cleaned.index,
                               columns=topicNames_1)

NameError: name 'gcv_lda_1' is not defined

## Prédiction de topic pour un post donné

In [22]:
def find_topics(post_id, df, df_topics_pred, topic_dict, freq_min=0.1):
    print("Post n°{}:".format(post_id))
    print(df.iloc[post_id].body)
    print("\nTags : ", df.iloc[post_id].tags)
    print("")
    
    fig = plt.figure(figsize=(8, 6))
    df_topics_pred.iloc[post_id].plot()
    #for topic_id_curr in np.nonzero(
    #    df_topics_pred.iloc[quest_iloc] > freq_min)[0]:
    list_topic_labels = df_topics_pred.iloc[post_id][df_topics_pred.iloc[post_id] > freq_min].sort_values(ascending=False).index
    print(list_topic_labels)
    list_topic_id =[]
    for label_curr in list_topic_labels:
        list_topic_id.append(int(re.findall(r"\d+", label_curr)[0]))
    print(list_topic_id)
    for topic_id_curr in list_topic_id:
        print("Topic {} : {}".format(topic_id_curr, 
                                     topic_dict[topic_id_curr]))
    return list(list_topic_id)
    """

In [23]:
find_topics(0,data,topics_dictionary,data,0.1)

Post n°0:
programmat edit rout tabl write daemon run emb platform need chang default rout devic accord interfac connect give time programat know use system rout del default rout add default gateway blah etc direct way updat solv particular problem discov patch pppd allow replacedefaultrout option patch also includ code programmat modifi rout tabl includ gentoo bug report

Tags :  c,linux,networking

