In [None]:
import sys
import requests, zipfile, io
import os
from tqdm import tqdm
import pandas as pd
import json
import numpy as np

## to split dataset
from sklearn.model_selection import train_test_split


## Downloading Citation data

In [None]:
# Download the DBLP v10 citation archive into the local data directory.
filename = 'dblp.v10.zip'
url = 'http://aminer.org/lab-datasets/citation/dblp.v10.zip'

path = "Data"

if not os.path.exists(path):
    try:
        os.mkdir(path)
    except OSError:
        print ("Creation of the Data directory %s failed" % path)
        # Without the directory the download below cannot succeed, so stop here
        # instead of failing later with a less helpful error.
        raise
    else:
        print ("Successfully created the Data directory %s " % path)

else:
    print("Data directory already exist.\n")

print("Downloading Citation data into " + path + " directory...")

archive_path = path + "/" + filename
# Stream into a temporary ".part" file and rename it only after a complete,
# successful transfer, so a failed download never leaves a truncated zip behind.
partial_path = archive_path + '.part'

try:
    # The timeout stops a stalled server from hanging the notebook forever.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of saving an HTML error page as the zip.
        response.raise_for_status()

        total = response.headers.get('content-length')
        total = int(total) if total is not None else None
        # At least 1 MiB per chunk; larger for big files to limit progress redraws.
        chunk_size = 1024*1024 if total is None else max(int(total/1000), 1024*1024)

        downloaded = 0
        with open(partial_path, 'wb') as f:
            # Always stream chunk by chunk, even when the size is unknown, so the
            # whole archive is never held in memory at once.
            for data in response.iter_content(chunk_size=chunk_size):
                downloaded += len(data)
                f.write(data)
                if total:
                    # Clamp to 50: with a compressed transfer the decoded byte count
                    # can exceed the advertised content-length.
                    done = min(50, int(50*downloaded/total))
                    sys.stdout.write('\r[{}{}]'.format('█' * done, '.' * (50-done)))
                    sys.stdout.flush()

    os.replace(partial_path, archive_path)
finally:
    # Remove the partial file if the transfer did not complete.
    if os.path.exists(partial_path):
        os.remove(partial_path)

sys.stdout.write('\n Finished Downloading')

## Extracting Zip file

In [None]:
# Unpack the downloaded archive into the data directory, member by member.
filename = 'dblp.v10.zip'
path = "Data"

filepath = path + "/" + filename
print("Extracting Citation data")
failed_members = []
with zipfile.ZipFile(filepath) as zf:
    for member in tqdm(zf.infolist(), desc='Extracting '):
        try:
            zf.extract(member, path)
        except zipfile.BadZipFile as e:
            # Keep going so one corrupt member does not abort the rest, but
            # remember the failure so it is reported rather than silently lost.
            failed_members.append((member.filename, e))

for member_name, error in failed_members:
    print("WARNING: could not extract %s: %s" % (member_name, error))
print("Finished Extraction")

## Converting .json file to .csv

In [None]:
def _parse_concatenated_json(text):
    """Parse a string that holds several JSON documents written back to back.

    The documents may be separated by any amount of whitespace (including
    newlines). Each one is decoded with a real JSON parser, so braces that
    appear inside string values or nested objects are handled correctly.

    Parameters
    ----------
    text : str
        The raw file contents.

    Returns
    -------
    list
        The decoded documents, in file order.

    Raises
    ------
    json.JSONDecodeError
        If any document in `text` is not valid JSON.
    """
    decoder = json.JSONDecoder()
    records = []
    pos, end = 0, len(text)
    while pos < end:
        # raw_decode does not skip leading whitespace itself.
        while pos < end and text[pos].isspace():
            pos += 1
        if pos >= end:
            break
        record, pos = decoder.raw_decode(text, pos)
        records.append(record)
    return records


# Read explicitly as UTF-8 instead of relying on the platform default encoding.
with open('Data/dblp-ref/dblp-ref-3.json', 'r', encoding='utf-8') as json_file:
    data3_json = _parse_concatenated_json(json_file.read())

# One row per decoded record, one column per key.
data3 = pd.DataFrame.from_dict(data3_json, orient='columns')

In [None]:
# Preview the first rows of the loaded records to sanity-check the columns.
data3.head()

## Applying filters for case one experiment

In [None]:
# Case-one training split: every record with a year strictly before 2009.
train_one = data3.loc[data3['year'] < 2009]

In [None]:
# Summarise the training split (columns, non-null counts, dtypes).
train_one.info()

In [None]:
# Case-one test split: only the records whose year is exactly 2009.
test_one = data3.loc[data3['year'] == 2009]

In [None]:
# Summarise the test split (columns, non-null counts, dtypes).
test_one.info()

In [None]:
# Renumber both case-one splits from 0, discarding the row labels they
# inherited from data3.
train_one = train_one.reset_index(drop=True)
test_one = test_one.reset_index(drop=True)

In [None]:
# Re-check the training split after the index reset.
train_one.info()

In [None]:
# Persist the case-one splits as CSV. The index is written as well (to_csv
# default), so it appears as an unnamed first column in the files.
train_one.to_csv('Data/train_case_one.csv')
test_one.to_csv('Data/test_case_one.csv')

### Average citation count for venue

In [None]:
# Distinct venue values in the case-one training split, in order of first appearance.
VenueNames = pd.unique(train_one['venue'])

In [None]:
# One row per distinct venue; the single column is auto-named 0 at this point.
AvgVenueCite = pd.DataFrame(VenueNames)

In [None]:
# Name the venue column and turn the row labels into strings.
AvgVenueCite = AvgVenueCite.rename(index=str, columns={0: "venue"})

In [None]:
# Summarise the venue table.
# NOTE(review): the positional 5 is received by info()'s first parameter
# (`verbose`), not a row count — confirm whether head(5) was intended.
AvgVenueCite.info(5)

In [None]:
# Average citation count per venue, computed from the case-one training split.
# The previous loop referenced an undefined name `train` and used
# DataFrame.set_value, which no longer exists in pandas >= 1.0; a single
# groupby also avoids rescanning the whole frame once per venue.
# Note: mean() skips missing n_citation values instead of propagating NaN.
venue_mean_citations = train_one.groupby('venue')['n_citation'].mean()

# Look up each venue's mean; venues with no match (e.g. a NaN venue) stay NaN.
AvgVenueCite['avgVenueCite'] = AvgVenueCite['venue'].map(venue_mean_citations)

In [None]:
# Number of rows in the venue table (one per distinct venue value).
len(AvgVenueCite)

In [None]:
# Persist the per-venue averages for case one (index included, to_csv default).
AvgVenueCite.to_csv('Data/AvgVenueCitationFromTrainData_case_one.csv')

## Applying filters for case two experiment

In [None]:
# Summarise the full dataset before the case-two filtering starts.
data3.info()

### Loading Webscraped Aminer conference data
##### from: http://aminer.org/ranks/conf

In [None]:
# Load the scraped conference ranking table, using the CSV column named
# 'Unnamed: 0' as the index (presumably an index column saved without a
# name — verify against the scraper output).
aminer = pd.read_csv('Data/Aminer_Conf_Ranks.csv', index_col='Unnamed: 0')

In [None]:
# Summarise the raw conference ranking table.
aminer.info()

In [None]:
# Keep only the ranking columns, drop incomplete rows, and rename
# 'Conference' to 'venue' so it lines up with the paper data for merging.
# Done as one chain rather than in-place edits on a column subset, which
# would trigger pandas' SettingWithCopyWarning.
aminer = (
    aminer[['Rank', 'Conference', 'H5-Index']]
    .dropna()
    .rename(index=str, columns={"Conference": "venue"})
)
aminer.info()

In [None]:
# removed papers having NaN in any feature.
data3.dropna(inplace=True)

In [None]:
# Re-check the dataset after rows with missing values were removed.
data3.info()

In [None]:
# filter data based on known venue ranks and h5-index
train_test = pd.merge(data3, aminer, how='left', on=['venue'])
# train_test.drop(['Rank', 'H5-Index'], inplace=True, axis=1)

In [None]:
# Summarise the merged frame; the non-null counts show how many papers matched a venue.
train_test.info()

In [None]:
# dropped rows having NaN after merging with Aminer conference data
train_test.dropna(inplace=True)

In [None]:
# Re-check the merged frame after rows with missing values were removed.
train_test.info()

In [None]:
# reset_index returns a new frame; the result must be assigned back, otherwise
# train_test keeps its old row labels and the call only dumps the frame as
# cell output.
train_test = train_test.reset_index(drop=True)

In [None]:
# Remove the rank and h5-index columns so the dataset contains only the
# original features ahead of feature engineering.
train_test = train_test.drop(columns=['Rank', 'H5-Index'])

In [None]:
# Row count of the filtered case-two dataset.
len(train_test)

In [None]:
# droped duplicates papers if any
train_test[train_test['id'].duplicated(keep=False)]

In [None]:
# For papers that appear more than once, keep only the last row per 'id'.
train_test = train_test.drop_duplicates(subset=['id'], keep='last')

In [None]:
# Number of distinct paper ids (a missing id, if any, counts as one value).
train_test['id'].nunique(dropna=False)

In [None]:
# Persist the full case-two dataset before splitting (index included, to_csv default).
train_test.to_csv('Data/train_test_case_two.csv')

In [None]:
# Fixed seed so the 80/20 split (and every file derived from it) is identical
# on each Restart & Run All; without it the split is reshuffled on every run.
SPLIT_SEED = 42
train_two, test_two = train_test_split(train_test, test_size=0.2, random_state=SPLIT_SEED)

In [None]:
# Summarise the case-two training split.
train_two.info()

In [None]:
# Summarise the case-two test split.
test_two.info()

In [None]:
# Renumber both case-two splits from 0, discarding the shuffled row labels
# left over from train_test.
train_two = train_two.reset_index(drop=True)
test_two = test_two.reset_index(drop=True)

In [None]:
# Persist the case-two splits as CSV. The index is written as well (to_csv
# default), so it appears as an unnamed first column in the files.
train_two.to_csv('Data/train_case_two.csv')
test_two.to_csv('Data/test_case_two.csv')

### Average citation count for venue

In [None]:
# Distinct venue values in the case-two training split, in order of first appearance.
VenueNames = pd.unique(train_two['venue'])

In [None]:
# One row per distinct venue; the single column is auto-named 0 at this point.
AvgVenueCite = pd.DataFrame(VenueNames)

In [None]:
# Name the venue column and turn the row labels into strings.
AvgVenueCite = AvgVenueCite.rename(index=str, columns={0: "venue"})

In [None]:
# Summarise the venue table.
# NOTE(review): the positional 5 is received by info()'s first parameter
# (`verbose`), not a row count — confirm whether head(5) was intended.
AvgVenueCite.info(5)

In [None]:
# Average citation count per venue, computed from the case-two training split.
# The previous loop referenced an undefined name `train` and used
# DataFrame.set_value, which no longer exists in pandas >= 1.0; a single
# groupby also avoids rescanning the whole frame once per venue.
# Note: mean() skips missing n_citation values instead of propagating NaN.
venue_mean_citations = train_two.groupby('venue')['n_citation'].mean()

# Look up each venue's mean; venues with no match (e.g. a NaN venue) stay NaN.
AvgVenueCite['avgVenueCite'] = AvgVenueCite['venue'].map(venue_mean_citations)

In [None]:
# Number of rows in the venue table (one per distinct venue value).
len(AvgVenueCite)

In [None]:
# Persist the per-venue averages for case two (index included, to_csv default).
AvgVenueCite.to_csv('Data/AvgVenueCitationFromTrainData_case_two.csv')