**Notebook dédié à:** 
* la récupération des données à partir du fichier texte. 
* la création du dataframe. 
* l'analyse exploratoire des données textuelles. 

# Importer les librairies 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy import  sparse
from wordcloud import WordCloud 
import re
import plotly.express as px

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Création du dataframe à partir du fichier texte  

In [3]:
isAbstract=False
isAuthors=False
title,year,abstract="","",""
authors,refs=[],[]
Rows =[]

In [None]:

for line in open('/content/drive/MyDrive/Colab Notebooks/Business Intelligence/Projet /DBLP_Subset.txt','r'):

    if line.startswith('#*'): #Title
      title=line.strip('#*\n')
      #print(title)
      continue


    
    if line.startswith('#t'): #Year
      year=line.strip('#t\n')
      continue

    
    if line.startswith('#c'): #Publication venue
      venue=line.strip('#c\n')
      continue

    if line.startswith('#index'): #Index
      index=line.strip('#index\n')
      continue

    if line.startswith('#%'): #Ids
      if refs==None:
        refs=[] 
      refs.append(line.strip('#%\n'))
      continue

        
    if line == '\n' :
      row={"Title":title,
           "Authors":authors,
           "Pub_Venue":venue,
           "Year":year,
           "Abstract":abstract,
           "Index":index,
           "Id":refs}

      Rows.append(row)
      isAbstract=False
      isAuthors=False
      id,title,year,abstract=None,None,None,None
      authors,refs=None,None
      continue

    if line.startswith('#!') or isAbstract: #Abstract
      isAbstract=True
      if abstract==None:
        abstract=""
      abstract+=line.strip('#!\n')+" "
      continue

    if line.startswith('#@') or isAuthors:  #Author
      isAuthors=True
      if authors==None:
        authors=[] 
      authors.append(line.strip('#@\n'))      
      continue

    

data = pd.DataFrame(Rows)
#modification de l'ordre des colonnes 
data = data[["Pub_Venue","Year","Authors","Title","Index","Id","Abstract"]]

In [None]:
data.shape

(37966, 7)

In [None]:
data

Unnamed: 0,Pub_Venue,Year,Authors,Title,Index,Id,Abstract
0,DAC,1988,"[Chung-Kuan Cheng, David N. Deutsch]",Improved Channel Routing by Via Minimization a...,131751,"[133716, 133521, 134343]",Channel routing area improvement by means of v...
1,DAC,2006,"[Lei Cheng, Liang Deng, Deming Chen, Martin D....",A fast simultaneous input vector generation an...,131752,"[132550, 530568, 436486, 134259, 283007, 13442...",Input vector control (IVC) technique is based ...
2,DAC,1992,"[Kwang-Ting Cheng, Hi-Keung Tony Ma]",On the Over-Specification Problem in Sequentia...,131756,"[455537, 1078626, 131745]",The authors show that some ATPG (automatic tes...
3,DAC,2005,"[Lerong Cheng, Phoebe Wong, Fei Li, Yan Lin, L...",Device and architecture co-optimization for FP...,131759,"[214244, 215701, 214503, 282575, 214411, 21450...",Device optimization considering supply voltage...
4,DAC,1989,"[Wu-Tung Cheng, Meng-Lin Yu]",Differential Fault Simulation - a Fast Method ...,131760,"[131744, 806030]",A new fast fault simulator called differentia...
...,...,...,...,...,...,...,...
37961,Journal of Systems and Software,2010,[Magne Jørgensen],Selection of strategies in judgment-based effo...,1600529,"[996865, 601059, 361510, 492786, 997634, 11285...",We currently know little about the factors tha...
37962,Journal of Systems and Software,2010,"[Abbas Nayebi, Hamid Sarbazi-Azad, Gunnar Karl...",Performance analysis of opportunistic broadcas...,1600531,"[505564, 1114157, 412964, 588689, 53668, 69288...",This paper investigates a class of mobile wire...
37963,Journal of Systems and Software,2010,"[Rossella Fortuna, Luigi Alfredo Grieco, Genna...",Quality adaptive end-to-end packet scheduling ...,1600532,"[396959, 794228, 588835, 997951]",In Internet multimedia streaming the quality ...
37964,Journal of Systems and Software,2010,"[Richard Werner Nelem Pazzi, Zhenxia Zhang, Az...",Design and evaluation of a novel MAC layer han...,1600537,"[666821, 784037, 506991, 505779, 1247751]",In recent years the IEEE 802.11 wireless netw...


## Identifier les attributs qui contiennent des valeurs manquantes 

In [None]:
data.isnull().sum(axis = 0)

Pub_Venue        0
Year             3
Authors          3
Title            3
Index            0
Id           19220
Abstract     17409
dtype: int64

### Suppression des lignes contenant des valeurs manquantes 

In [None]:
### Suppression des lignes contenant des valeurs manquantes 
data.dropna(subset = ["Year"], inplace=True)
data.dropna(subset = ["Authors"], inplace=True)
data.dropna(subset = ["Title"], inplace=True)
data.dropna(subset = ["Id"], inplace=True)
data.dropna(subset = ["Abstract"], inplace=True)



In [None]:
data.shape

(18105, 7)

## Supression des revues qui ont moins de 50 articles publiés

In [None]:
# Supression des revues qui ont moins de 50 articles publiés 
data = data.groupby('Pub_Venue').filter(lambda x : len(x)>49)

In [None]:
data

Unnamed: 0,Pub_Venue,Year,Authors,Title,Index,Id,Abstract
0,DAC,1988,"[Chung-Kuan Cheng, David N. Deutsch]",Improved Channel Routing by Via Minimization a...,131751,"[133716, 133521, 134343]",Channel routing area improvement by means of v...
1,DAC,2006,"[Lei Cheng, Liang Deng, Deming Chen, Martin D....",A fast simultaneous input vector generation an...,131752,"[132550, 530568, 436486, 134259, 283007, 13442...",Input vector control (IVC) technique is based ...
2,DAC,1992,"[Kwang-Ting Cheng, Hi-Keung Tony Ma]",On the Over-Specification Problem in Sequentia...,131756,"[455537, 1078626, 131745]",The authors show that some ATPG (automatic tes...
3,DAC,2005,"[Lerong Cheng, Phoebe Wong, Fei Li, Yan Lin, L...",Device and architecture co-optimization for FP...,131759,"[214244, 215701, 214503, 282575, 214411, 21450...",Device optimization considering supply voltage...
4,DAC,1989,"[Wu-Tung Cheng, Meng-Lin Yu]",Differential Fault Simulation - a Fast Method ...,131760,"[131744, 806030]",A new fast fault simulator called differentia...
...,...,...,...,...,...,...,...
37961,Journal of Systems and Software,2010,[Magne Jørgensen],Selection of strategies in judgment-based effo...,1600529,"[996865, 601059, 361510, 492786, 997634, 11285...",We currently know little about the factors tha...
37962,Journal of Systems and Software,2010,"[Abbas Nayebi, Hamid Sarbazi-Azad, Gunnar Karl...",Performance analysis of opportunistic broadcas...,1600531,"[505564, 1114157, 412964, 588689, 53668, 69288...",This paper investigates a class of mobile wire...
37963,Journal of Systems and Software,2010,"[Rossella Fortuna, Luigi Alfredo Grieco, Genna...",Quality adaptive end-to-end packet scheduling ...,1600532,"[396959, 794228, 588835, 997951]",In Internet multimedia streaming the quality ...
37964,Journal of Systems and Software,2010,"[Richard Werner Nelem Pazzi, Zhenxia Zhang, Az...",Design and evaluation of a novel MAC layer han...,1600537,"[666821, 784037, 506991, 505779, 1247751]",In recent years the IEEE 802.11 wireless netw...


In [None]:
len(data['Pub_Venue'].unique())

28

*   28 revues scientifiques distinctes contenant plus de 50 publications d'articles 





In [None]:
data

Unnamed: 0,Pub_Venue,Year,Authors,Title,Index,Id,Abstract
0,DAC,1988,"[Chung-Kuan Cheng, David N. Deutsch]",Improved Channel Routing by Via Minimization a...,131751,"[133716, 133521, 134343]",Channel routing area improvement by means of v...
1,DAC,2006,"[Lei Cheng, Liang Deng, Deming Chen, Martin D....",A fast simultaneous input vector generation an...,131752,"[132550, 530568, 436486, 134259, 283007, 13442...",Input vector control (IVC) technique is based ...
2,DAC,1992,"[Kwang-Ting Cheng, Hi-Keung Tony Ma]",On the Over-Specification Problem in Sequentia...,131756,"[455537, 1078626, 131745]",The authors show that some ATPG (automatic tes...
3,DAC,2005,"[Lerong Cheng, Phoebe Wong, Fei Li, Yan Lin, L...",Device and architecture co-optimization for FP...,131759,"[214244, 215701, 214503, 282575, 214411, 21450...",Device optimization considering supply voltage...
4,DAC,1989,"[Wu-Tung Cheng, Meng-Lin Yu]",Differential Fault Simulation - a Fast Method ...,131760,"[131744, 806030]",A new fast fault simulator called differentia...
...,...,...,...,...,...,...,...
37961,Journal of Systems and Software,2010,[Magne Jørgensen],Selection of strategies in judgment-based effo...,1600529,"[996865, 601059, 361510, 492786, 997634, 11285...",We currently know little about the factors tha...
37962,Journal of Systems and Software,2010,"[Abbas Nayebi, Hamid Sarbazi-Azad, Gunnar Karl...",Performance analysis of opportunistic broadcas...,1600531,"[505564, 1114157, 412964, 588689, 53668, 69288...",This paper investigates a class of mobile wire...
37963,Journal of Systems and Software,2010,"[Rossella Fortuna, Luigi Alfredo Grieco, Genna...",Quality adaptive end-to-end packet scheduling ...,1600532,"[396959, 794228, 588835, 997951]",In Internet multimedia streaming the quality ...
37964,Journal of Systems and Software,2010,"[Richard Werner Nelem Pazzi, Zhenxia Zhang, Az...",Design and evaluation of a novel MAC layer han...,1600537,"[666821, 784037, 506991, 505779, 1247751]",In recent years the IEEE 802.11 wireless netw...


In [None]:
DF = data

Unnamed: 0,Pub_Venue,Year,Authors,Title,Index,Id,Abstract
0,DAC,1988,"['Chung-Kuan Cheng', 'David N. Deutsch']",Improved Channel Routing by Via Minimization a...,131751,"['133716', '133521', '134343']",Channel routing area improvement by means of v...
1,DAC,2006,"['Lei Cheng', 'Liang Deng', 'Deming Chen', 'Ma...",A fast simultaneous input vector generation an...,131752,"['132550', '530568', '436486', '134259', '2830...",Input vector control (IVC) technique is based ...
2,DAC,1992,"['Kwang-Ting Cheng', 'Hi-Keung Tony Ma']",On the Over-Specification Problem in Sequentia...,131756,"['455537', '1078626', '131745']",The authors show that some ATPG (automatic tes...
3,DAC,2005,"['Lerong Cheng', 'Phoebe Wong', 'Fei Li', 'Yan...",Device and architecture co-optimization for FP...,131759,"['214244', '215701', '214503', '282575', '2144...",Device optimization considering supply voltage...
4,DAC,1989,"['Wu-Tung Cheng', 'Meng-Lin Yu']",Differential Fault Simulation - a Fast Method ...,131760,"['131744', '806030']",A new fast fault simulator called differentia...
...,...,...,...,...,...,...,...
17628,Journal of Systems and Software,2010,['Magne Jørgensen'],Selection of strategies in judgment-based effo...,1600529,"['996865', '601059', '361510', '492786', '9976...",We currently know little about the factors tha...
17629,Journal of Systems and Software,2010,"['Abbas Nayebi', 'Hamid Sarbazi-Azad', 'Gunnar...",Performance analysis of opportunistic broadcas...,1600531,"['505564', '1114157', '412964', '588689', '536...",This paper investigates a class of mobile wire...
17630,Journal of Systems and Software,2010,"['Rossella Fortuna', 'Luigi Alfredo Grieco', '...",Quality adaptive end-to-end packet scheduling ...,1600532,"['396959', '794228', '588835', '997951']",In Internet multimedia streaming the quality ...
17631,Journal of Systems and Software,2010,"['Richard Werner Nelem Pazzi', 'Zhenxia Zhang'...",Design and evaluation of a novel MAC layer han...,1600537,"['666821', '784037', '506991', '505779', '1247...",In recent years the IEEE 802.11 wireless netw...


# Traitement de texte sur les abstracts

## Nettoyage du texte 

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
stop_words = stopwords.words('english')
stop_words.append('method')
stop_words.append('paper')
stop_words.append('model')
stop_words.append('problem')
stop_words.append('algorithm')
stop_words.append('approach')
stop_words.append('based')
stop_words.append('using')
stop_words.append('show')
stop_words.append('set')
stop_words.append('also')
stop_words.append('present')
stop_words.append('new')
stop_words.append('used')
stop_words.append('one')
stop_words.append('use')
stop_words.append('provide')
stop_words.append('given')
stop_words.append('proposed')
stop_words.append('describe')
stop_words.append('different')
stop_words.append('two')







def clean_text(string : str, punctuations = r'''!()-{}[];:'"\,<>./?@#^%^&*_~''', stop_words = stop_words) -> str:
    """
     A method to clean text
    """
 #Cleaning the urls
    string = re.sub(r'https?://\S+|www\.\S+','',string)

 #Cleaning the html elements
    string = re.sub(r'<.*?>','',string)

 #Cleaning numbers
    string= re.sub(r'[0-9]+','',string)

  #Removing puntuations
    for x in string.lower():
      if x in punctuations:
        string=string.replace(x,"")

  #Converting the text to lower
    string = string.lower()

  #Removing stop words
    string = ' '.join([word for word in string.split() if word not in stop_words])

  #Cleaning the whitespaces
    string = re.sub(r'\s+',' ',string).strip()
    return string


In [None]:
abstracts = DF['Abstract'].apply(str)
# Clean Abstracts
abstract_cleaned = [clean_text(x) for x in abstracts]

#abstract_cleaned[1]

## Lemmatisation du texte

In [None]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

abstract_lemmatized = []
for text in abstract_cleaned:
  t = lemmatize_words(text)
  abstract_lemmatized.append(t)

#abstract_lemmatized[1]

# Matrice Documents- Authors 

In [None]:
#Document_authors = DF.groupby('Authors')
Document_authors =  DF.groupby( [ "Authors", "Title"]) 

In [None]:
Document_authors

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff3adaf10d0>

# Analyse exploratoire des données 

## Les auteurs les plus productifs.




*   **Extraction des auteurs et leur productivité en fonction du nombre de documents publiés**




In [None]:
#Extraction des auteurs et leur productivité en fonction des nombre de productions faites 
authors = DF.iloc[:,2].values
authors = " ".join(map(str, authors))
corp = authors.split(",")
freq = nltk.FreqDist(corp)
dict_freq = {}
for key, value in freq.items():
  dict_freq[key] = [key, value]

data_frame_freq = pd.DataFrame(dict_freq)
data_frame_freq = data_frame_freq.transpose()
data_frame_freq.columns = ['Author', 'Number of publications']



*   **Barplot pour le top 30 des auteurs les plus productifs**




In [None]:
#Barplot pour le top 30 des auteurs les plus productifs

px.bar(data_frame_freq.sort_values(by="Number of publications", ascending=False).iloc[:30,:], x='Author', y='Number of publications', color='Number of publications',
       title='Top 30 des auteurs les plus productifs', template='plotly_white', labels={'Author': 'Auteurs', 'Number of publications': 'Nombre de publications'})

## Les articles les plus populaires en fonction du nombre des citations.




*   **Barplot pour le top 10 des articles les plus populaires**



In [None]:
#Extraction des articles les plus populaires en fonction du nombre des citations
#Barplot pour le top 10 des articles les plus populaires
citations = DF.iloc[:,5].values
nb_citations = []
for i in range(len(citations)):
  citation = str(citations[i])
  if len(citation.split()) == 1:
    nb_citations.append(0)
  else:
    nb_citations.append(len(citation.split()))
nb_citations.sort(reverse=True)
px.bar(x=DF.iloc[:,3].values[:10], y=nb_citations[:10], color=nb_citations[:10] , height=1000, title="Top 10 des articles les plus populaires", labels={"x":"Articles", "y":"Nombre de citations"})

## Les années avec une grande productivité en terme de publication d'articles 


In [None]:

px.histogram(data_frame=DF, x="Year", labels={"x":"Année", "y":"Productivité"}, color_discrete_sequence=["red"], title="Productivité par année" )

## Distribution du nombre d’articles par revue/conférence.


In [None]:
#Distribution du nombre d'articles par revue
publications = DF.Pub_Venue.value_counts()
values = publications.values
keys = publications.keys()
px.bar(x=keys, y=values, title="Distribution du nombre d'articles par revue", height=800,color=values, labels={"x":"Revues", "y":"Nombre d'articles"})

## Les termes les plus utilisés 



*   **Distribution des mots les plus utilisés sous forme de nuage de mots**


In [None]:

from wordcloud import WordCloud 
corpus = abstract_lemmatized
wordcloud = WordCloud(background_color="white")
text = " ".join(corpus)
wordcloud.generate(text)
px.imshow(wordcloud)



*   **Distribution des termes les plus fréquents sous forme de barplot**




In [None]:

corpus = abstract_lemmatized
texts = " ".join(corpus)

corp = text.split()
freq = nltk.FreqDist(corp)
for key, value in freq.items():
  dict_freq[key] = [key, value]

data_frame_freq = pd.DataFrame(dict_freq)
data_frame_freq = data_frame_freq.transpose()
data_frame_freq.columns = ['word', 'count']

px.bar(data_frame_freq.sort_values(by="count", ascending=False).iloc[:25,:], x='word', y='count',
       title="Fréquence d'apparition de mots", template='plotly_white', color_discrete_sequence=["black"], labels={'word': 'termes', 'count': 'Nombre'})



In [None]:
DF.head(4)

Unnamed: 0,Pub_Venue,Year,Authors,Title,Index,Id,Abstract
0,DAC,1988,"['Chung-Kuan Cheng', 'David N. Deutsch']",Improved Channel Routing by Via Minimization a...,131751,"['133716', '133521', '134343']",Channel routing area improvement by means of v...
1,DAC,2006,"['Lei Cheng', 'Liang Deng', 'Deming Chen', 'Ma...",A fast simultaneous input vector generation an...,131752,"['132550', '530568', '436486', '134259', '2830...",Input vector control (IVC) technique is based ...
2,DAC,1992,"['Kwang-Ting Cheng', 'Hi-Keung Tony Ma']",On the Over-Specification Problem in Sequentia...,131756,"['455537', '1078626', '131745']",The authors show that some ATPG (automatic tes...
3,DAC,2005,"['Lerong Cheng', 'Phoebe Wong', 'Fei Li', 'Yan...",Device and architecture co-optimization for FP...,131759,"['214244', '215701', '214503', '282575', '2144...",Device optimization considering supply voltage...


In [None]:
DF['Abstract'] = abstract_lemmatized

In [None]:
#save dataframe to csv 
# avec les abstracts nettoyés et lémmatisés 
DF.to_csv('/content/drive/MyDrive/Colab Notebooks/Business Intelligence/Projet /DBLP_Subset_processed.txt', index=False)