In [1]:
import pandas as pd
import csv
import numpy as np

#Load Data
dfDblp = pd.read_csv('dblp.csv', sep=',')
dfAcm = pd.read_csv('acm.csv', sep=',')
dfScholar = pd.read_csv('scholar.csv', sep=',')


#Load True Matches
dfAcmGoldStandard = pd.read_csv('goldstandard_citation_true_acm.csv', sep=',', header=None)
dfScholarGoldStandard = pd.read_csv('goldstandard_citation_true_scholar.csv', sep=',', header=None)

In [2]:
display(dfAcm.describe(include='all'))

Unnamed: 0,authors,frequent_part_title,infrequent_part_title,title,uri,venue,year
count,2258,1297,2269,2271,2271,2271,2271.0
unique,2008,80,2207,2216,2271,5,
top,karl aberer,data,editorial guest,guest editorial,acm_1664,international conference on management of data,
freq,9,271,8,8,1,797,
mean,,,,,,,1998.511669
std,,,,,,,2.829162
min,,,,,,,1994.0
25%,,,,,,,1996.0
50%,,,,,,,1999.0
75%,,,,,,,2001.0


In [3]:
dfDblp['year'] = dfDblp['year'].astype('str')
dfDblpInfo = dfDblp.filter(items = ['authors', 'title', 'venue', 'year', 'frequent_part_title', 'infrequent_part_title']).describe()

display(dfDblpInfo)
datatypes = {'authors': ['list'], 'title': ['string'], 'venue': ['string'], 
             'year': ['date'], 'frequent_part_title': ['string'], 'infrequent_part_title': ['string']}

dfDblpProfiling = pd.DataFrame(data= datatypes)
dfDblpProfiling = dfDblpProfiling.append((dfDblpInfo.loc[['count']] / len(dfDblp)).astype(np.float).round(3), sort=True)
dfDblpProfiling = dfDblpProfiling.append((dfDblpInfo.loc[['unique']] / dfDblpInfo.loc[['count']].values).astype(np.float).round(3), sort=True)

dfDblpProfiling.index = ['Data Type', 'Density', 'Uniqueness']

dfDblpProfiling.to_csv('dblp_citation_profiling.csv', sep=',', encoding='utf-8', index=True, quotechar='"', quoting=csv.QUOTE_ALL)
print(dfDblpProfiling.transpose().filter(items=['Density', 'Uniqueness']).to_latex())

Unnamed: 0,authors,title,venue,year,frequent_part_title,infrequent_part_title
count,4899,5099,4924,5099,2868,5097
unique,4389,2519,6,10,82,2503
top,richard t. snodgrass,reminiscences on influential papers,vldb,2003,data,influential papers reminiscences
freq,18,25,1737,670,641,27


\begin{tabular}{lll}
\toprule
{} & Density & Uniqueness \\
\midrule
authors               &   0.961 &      0.896 \\
frequent\_part\_title   &   0.562 &      0.029 \\
infrequent\_part\_title &       1 &      0.491 \\
title                 &       1 &      0.494 \\
venue                 &   0.966 &      0.001 \\
year                  &       1 &      0.002 \\
\bottomrule
\end{tabular}



In [4]:
dfAcm['year'] = dfAcm['year'].astype('str')
#dfAcm['frequent_part_title'] = dfAcm['frequent_part_title'].astype('str')
#dfAcm['infrequent_part_title'] = dfAcm['infrequent_part_title'].astype('str')
dfAcmInfo = dfAcm.filter(items = ['authors', 'title', 'venue', 'year', 'frequent_part_title', 'infrequent_part_title']).describe(include='all')


#display(dfDnbInfo)
datatypes = {'authors': ['list'], 'title': ['string'], 'venue': ['string'], 
             'year': ['date'], 'frequent_part_title': ['string'], 'infrequent_part_title': ['string']}

dfAcmProfiling = pd.DataFrame(data= datatypes)
dfAcmProfiling = dfAcmProfiling.append((dfAcmInfo.loc[['count']] / len(dfAcm)).astype(np.float).round(3), sort = False)
dfAcmProfiling = dfAcmProfiling.append((dfAcmInfo.loc[['unique']] / dfAcmInfo.loc[['count']].values).astype(np.float).round(3), sort = False)

dfAcmProfiling.index = ['Data Type', 'Density', 'Uniqueness']
display(dfAcmProfiling)

Unnamed: 0,authors,frequent_part_title,infrequent_part_title,title,venue,year
Data Type,list,string,string,string,string,date
Density,0.994,0.571,0.999,1,1,1
Uniqueness,0.889,0.062,0.973,0.976,0.002,0.004


In [5]:
from nltk import word_tokenize, ngrams
from nltk.metrics import jaccard_distance
import numpy as np

mapping = {"authors": "authors", "title": "title", "venue": "venue", "year": "year", "frequent_part_title": "frequent_part_title", "infrequent_part_title": "infrequent_part_title"}
jaccard_similarity_counter = {"authors" : 0, "title": 0, "venue":0, "year": 0, "frequent_part_title": 0, "infrequent_part_title": 0}

for index, row in dfAcmGoldStandard.iterrows():
    rowDblp = dfDblp[dfDblp['uri'] == row[0]]
    rowAcm = dfAcm[dfAcm['uri'] == row[1]]
    for key in mapping:
        #display(rowDblp)
        #display(rowAcm)
        if key in rowAcm and len(rowAcm[key].values) > 0 and mapping[key] in rowDblp and len(rowDblp[mapping[key]].values) > 0:
            if type(rowAcm[key].values[0]) not in[np.float64, float] and type(rowDblp[mapping[key]].values[0]) not in[np.float64, float]:
                jaccard_similarity_counter[key] += jaccard_distance(set(ngrams(str(rowDblp[mapping[key]].values[0]), 2)), set(ngrams(str(rowAcm[key].values[0]), 2)))
    
for key in jaccard_similarity_counter:
    jaccard_similarity_counter[key] = round(jaccard_similarity_counter[key] / len(dfAcmGoldStandard),3)
 
#display(jaccard_similarity_counter)
dfAcmProfiling = dfAcmProfiling.append(pd.DataFrame(data=jaccard_similarity_counter, index=['Heterogeneity']))
dfAcmProfiling.to_csv('acm_citation_profiling.csv', sep=',', encoding='utf-8', index=True, quotechar='"', quoting=csv.QUOTE_ALL)
print(dfAcmProfiling.transpose().filter(items=['Density', 'Uniqueness', 'Heterogeneity']).to_latex())

\begin{tabular}{llll}
\toprule
{} & Density & Uniqueness & Heterogeneity \\
\midrule
authors               &   0.994 &      0.889 &         0.104 \\
frequent\_part\_title   &   0.571 &      0.062 &         0.001 \\
infrequent\_part\_title &   0.999 &      0.973 &         0.016 \\
title                 &       1 &      0.976 &         0.017 \\
venue                 &       1 &      0.002 &         0.718 \\
year                  &       1 &      0.004 &             0 \\
\bottomrule
\end{tabular}



In [6]:
dfScholar['year'] = dfScholar['year'].astype('str')
dfScholarInfo = dfScholar.filter(items = ['authors', 'title', 'venue', 'year', 'frequent_part_title', 'infrequent_part_title']).describe(include='all')
display(dfScholarInfo)
datatypes = {'authors': ['list'], 'title': ['string'], 'venue': ['string'], 
             'year': ['date'], 'frequent_part_title': ['string'], 'infrequent_part_title': ['string']}

dfScholarProfiling = pd.DataFrame(data= datatypes)
dfScholarProfiling = dfScholarProfiling.append((dfScholarInfo.loc[['count']] / len(dfScholar)).astype(np.float).round(3), sort = False)
dfScholarProfiling = dfScholarProfiling.append((dfScholarInfo.loc[['unique']] / dfScholarInfo.loc[['count']].values).astype(np.float).round(3), sort = False)

dfScholarProfiling.index = ['Data Type', 'Density', 'Uniqueness']
display(dfScholarProfiling)

Unnamed: 0,authors,title,venue,year,frequent_part_title,infrequent_part_title
count,64257,64258,49261,64258.0,15091,64194
unique,55619,62704,12206,128.0,135,61718
top,acms anthology,foreword,"lecture notes in computer science ,",,data,foreword
freq,107,51,1004,34786.0,2340,51


Unnamed: 0,authors,frequent_part_title,infrequent_part_title,title,venue,year
Data Type,list,string,string,string,string,date
Density,1,0.235,0.999,1,0.767,1
Uniqueness,0.866,0.009,0.961,0.976,0.248,0.002


In [7]:
from nltk import word_tokenize, ngrams
from nltk.metrics import jaccard_distance
import numpy as np

mapping = {"authors": "authors", "title": "title", "venue": "venue", "year": "year", "frequent_part_title": "frequent_part_title", "infrequent_part_title": "infrequent_part_title"}
jaccard_similarity_counter = {"authors" : 0, "title": 0, "venue":0, "year": 0, "frequent_part_title": 0, "infrequent_part_title": 0}

for index, row in dfScholarGoldStandard.iterrows():
    rowDblp = dfDblp[dfDblp['uri'] == row[0]]
    rowScholar = dfScholar[dfScholar['uri'] == row[1]]
    for key in mapping:
        if key in rowScholar and len(rowScholar[key].values) > 0 and mapping[key] in rowDblp and len(rowDblp[mapping[key]].values) > 0:
            if type(rowScholar[key].values[0]) not in[np.float64, float] and type(rowDblp[mapping[key]].values[0]) not in[np.float64, float]:
                jaccard_similarity_counter[key] += jaccard_distance(set(ngrams(str(rowDblp[mapping[key]].values[0]), 2)), set(ngrams(str(rowScholar[key].values[0]), 2)))
    
for key in jaccard_similarity_counter:
    jaccard_similarity_counter[key] = round(jaccard_similarity_counter[key] / len(dfScholarGoldStandard), 3)
 
#display(jaccard_similarity_counter)
dfScholarProfiling = dfScholarProfiling.append(pd.DataFrame(data=jaccard_similarity_counter, index=['Heterogeneity']))
dfScholarProfiling.to_csv('scholar_citation_profiling.csv', sep=',', encoding='utf-8', index=False, quotechar='"', quoting=csv.QUOTE_ALL)

print(dfScholarProfiling.transpose().filter(items=['Density', 'Uniqueness', 'Heterogeneity']).to_latex())

\begin{tabular}{llll}
\toprule
{} & Density & Uniqueness & Heterogeneity \\
\midrule
authors               &       1 &      0.866 &         0.229 \\
frequent\_part\_title   &   0.235 &      0.009 &         0.012 \\
infrequent\_part\_title &   0.999 &      0.961 &         0.072 \\
title                 &       1 &      0.976 &         0.075 \\
venue                 &   0.767 &      0.248 &          0.49 \\
year                  &       1 &      0.002 &         0.621 \\
\bottomrule
\end{tabular}



In [8]:
dfAggregatedProfiling = dfDblpProfiling.transpose().append(dfAcmProfiling.transpose())
#dfAggregatedProfiling = dfAggregatedProfiling.append(dfWebTablesProfiling.transpose())
dfAggregatedProfiling = dfAggregatedProfiling.append(dfScholarProfiling.transpose())

dfSummedProfiling = pd.DataFrame()
for index in set(dfAggregatedProfiling.index):
    profil = {}
    dfSubset = dfAggregatedProfiling.loc[index]
    profil['Attribute'] = index
    profil['Density-Mean'] = dfSubset['Density'].mean()
    profil['Density-Std'] = dfSubset['Density'].std()
    profil['Heterogeneity-Mean'] = dfSubset['Heterogeneity'].mean()
    profil['Heterogeneity-Std'] = dfSubset['Heterogeneity'].std()
    profil['Uniqueness-Mean'] = dfSubset['Uniqueness'].mean()
    profil['Uniqueness-Std'] = dfSubset['Uniqueness'].std()
    dfProfil = pd.DataFrame(profil, index=[0])
    dfSummedProfiling = dfSummedProfiling.append(dfProfil)

display(dfSummedProfiling)
dfSummedProfiling.to_csv('citation_profiling.csv', sep=',', encoding='utf-8', index=True, quotechar='"', quoting=csv.QUOTE_ALL)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,Attribute,Density-Mean,Density-Std,Heterogeneity-Mean,Heterogeneity-Std,Uniqueness-Mean,Uniqueness-Std
0,title,1.0,0.0,0.046,0.041012,0.815333,0.278283
0,venue,0.911,0.125861,0.604,0.16122,0.083667,0.142318
0,infrequent_part_title,0.999333,0.000577,0.044,0.039598,0.808333,0.274884
0,authors,0.985,0.021,0.1665,0.088388,0.883667,0.015695
0,frequent_part_title,0.456,0.191445,0.0065,0.007778,0.033333,0.026764
0,year,1.0,0.0,0.3105,0.439113,0.002667,0.001155
