In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

In [2]:
# Closed frequent author patterns are mined in two steps: FP-growth (mlxtend)
# finds the frequent itemsets, then the closed-pattern definition filters them.
# NOTE(review): `path` is an absolute, machine-specific location; adjust it before
# running this notebook on another machine.
path = 'E:/CS_Master_Degree_UIUC/CS410_Text_Information_system/Project/Project Submission/CourseProject/Dataset/'
dblp2000 = pd.read_csv(path + "DBLP2000.csv")
# Each row is one transaction: turn its ", "-separated author string into a list of names.
dblp2000["author"] = dblp2000["author"].map(lambda names: names.split(", "))
dblp2000.head()

Unnamed: 0,author,title
0,[Lothar Breuer],Operator-Geometric Solutions for the M/G/k Que...
1,"[Christopher Lusena, Judy Goldsmith, Martin Mu...",Nonapproximability Results for Partially Obser...
2,"[János Komlós, Ali Shokoufandeh, Miklós Simono...",The Regularity Lemma and Its Applications in G...
3,[Vijay V. Vazirani],Primal-Dual Schema Based Approximation Algorit...
4,"[Isabel Fernández-Anta, Eva Millán, José-Luis ...",Adaptation and Generation in a Web-Based Lisp ...


In [3]:
# Inspect a single transaction to confirm the author string was split into a list of names.
dblp2000['author'].iloc[2]

['János Komlós', 'Ali Shokoufandeh', 'Miklós Simonovits', 'Endre Szemerédi']

In [4]:
# Mine frequent author sets with mlxtend's FP-growth implementation.
# An itemset must occur in at least MIN_SUPPORT_COUNT transactions; 4 is used
# instead of 10 because this dataset is comparatively small.
MIN_SUPPORT_COUNT = 4

dataset = dblp2000["author"].tolist()
te = TransactionEncoder()
te.fit(dataset)
te_ary = te.transform(dataset)  # one-hot matrix: one row per transaction, one column per author
df = pd.DataFrame(te_ary, columns=te.columns_)

# fpgrowth takes the threshold as a fraction of all transactions.
thresh = MIN_SUPPORT_COUNT / len(df)
freq_df = fpgrowth(df, min_support=thresh, use_colnames=True)

In [5]:
# Show the frequent itemsets returned by fpgrowth together with their relative support.
freq_df

Unnamed: 0,support,itemsets
0,0.000999,(Ralf Steinmetz)
1,0.000999,(Cheng-Wen Wu)
2,0.000999,(Thomas S. Huang)
3,0.000999,(Maja J. Mataric)
4,0.001249,(K. Suzanne Barber)
5,0.000999,(Josef Kittler)
6,0.000999,(Gerald Sommer)
7,0.001249,(Edwin R. Hancock)
8,0.000999,(Masaru Kitsuregawa)
9,0.001249,(Roberto Gorrieri)


#### Find the closed frequent itemsets among the frequent itemsets

In [6]:
# Distinct support values that occur among the frequent itemsets.
su = freq_df.support.unique()

# Dictionary mapping each support value to the itemsets (as lists of author
# names) that have exactly that support.
fredic = {
    support_value: [
        list(itemset)
        for itemset in freq_df.loc[freq_df.support == support_value, 'itemsets']
    ]
    for support_value in su
}

In [7]:
# Keep only the *closed* frequent itemsets: an itemset is closed when no proper
# superset of it has exactly the same support.
#
# The candidates that share a support value are converted to frozensets once,
# so the superset check is a set comparison that does not depend on element
# order and does not rescan lists for every member.
same_support_sets = {
    support: [frozenset(items) for items in itemsets]
    for support, itemsets in fredic.items()
}

cl = []  # closed itemsets, each stored as a list of author names
for itemset, support in zip(freq_df['itemsets'], freq_df['support']):
    candidate = frozenset(itemset)
    # `<` is the proper-subset test, so an itemset never disqualifies itself.
    isclose = not any(candidate < other for other in same_support_sets[support])
    if isclose:
        cl.append(list(itemset))

In [9]:
# Flatten every closed pattern into a single ", "-separated string of author
# names and collect the strings in a one-column frame.
closeFP_authors = list(map(", ".join, cl))
authorsFP2000 = pd.DataFrame({"author": closeFP_authors})
authorsFP2000

Unnamed: 0,author
0,Ralf Steinmetz
1,Cheng-Wen Wu
2,Thomas S. Huang
3,Maja J. Mataric
4,K. Suzanne Barber
5,Josef Kittler
6,Gerald Sommer
7,Edwin R. Hancock
8,Masaru Kitsuregawa
9,Roberto Gorrieri


### Add the indices of the supporting transactions to each author pattern

In [10]:
# For every closed pattern, collect the index labels of the transactions whose
# author list contains *all* authors of that pattern.
#
# A pattern is stored as "A, B, ..." (names joined with ", "), so it is split
# back into individual names first. Testing the joined string directly against
# a transaction's author list can never match a pattern with more than one
# author, which would leave those patterns with an empty index list.

# Build the per-transaction author sets once instead of once per pattern.
author_sets = [(idx, set(names)) for idx, names in dblp2000['author'].items()]

transaction_index = []
for author in authorsFP2000['author']:
    pattern_members = set(author.split(", "))
    transaction_index.append(
        [idx for idx, paper_authors in author_sets if pattern_members <= paper_authors]
    )

In [11]:
# Attach the collected index lists as a new column; transaction_index holds one
# list per row of authorsFP2000, in the same order.
authorsFP2000['transaction_index']  = transaction_index

In [12]:
# Show the author patterns together with the new transaction_index column.
authorsFP2000

Unnamed: 0,author,transaction_index
0,Ralf Steinmetz,"[70, 1323, 1325, 3362]"
1,Cheng-Wen Wu,"[194, 196, 288, 1909]"
2,Thomas S. Huang,"[652, 835, 1012, 1123]"
3,Maja J. Mataric,"[660, 669, 2266, 2271]"
4,K. Suzanne Barber,"[707, 1311, 2129, 2444, 2448]"
5,Josef Kittler,"[737, 1108, 1113, 1573]"
6,Gerald Sommer,"[768, 770, 1569, 1721]"
7,Edwin R. Hancock,"[839, 1119, 1127, 1204, 1576]"
8,Masaru Kitsuregawa,"[1373, 2467, 3480, 3762]"
9,Roberto Gorrieri,"[1731, 1899, 2330, 3618, 3756]"


In [None]:
# Save the annotated patterns into the same directory the input was read from;
# the frame's row index is not written to the CSV.
output_path = path
output_file = output_path + "authorsFP2000_with_index.csv"
authorsFP2000.to_csv(output_file, index=False)