In [1]:
import pandas as pd
import numpy as np
import json
import re
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordsegment import load, segment
from sklearn.preprocessing import StandardScaler, LabelEncoder, FunctionTransformer, OneHotEncoder, PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error, make_scorer
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.cluster import KMeans
from scipy.stats import uniform, randint

In [2]:
# Read the train and test splits from their JSON dumps
def _read_json_records(path):
    """Open ``path`` as UTF-8 text, parse it as JSON and build a DataFrame from it.

    Assumes the file holds a list of record objects -- TODO confirm against the data.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return pd.DataFrame.from_records(json.load(handle))


train = _read_json_records('train.json')
test = _read_json_records('test.json')

In [3]:
# Preview the first five rows of the training frame
train.head(n=5)

Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id
0,A complete method to create a panoramic video ...,"Chiou-Ting Hsu, Tzu-Hung Cheng, Rob A. Beuker,...",51,"[23daf90f-0393-4382-a964-b3c8b0bcb99e, 31364f4...",Feature-based video mosaic,international conference on image processing,2000,2cd9f8fb-7440-418c-8a96-21d3c7241ca0
1,This paper presents a novel dimension reductio...,"Senjian An, Wanquan Liu, Svetha Venkatesh, Ron...",1,"[0781e713-d8ca-4f62-89e8-3047b77dd6e6, 9584903...",A Fast Feature-based Dimension Reduction Algor...,Neural Processing Letters,2006,7bc0edcd-6a07-4205-94a2-52437bc27e49
2,The present study evaluates the internal infor...,Sanjay Dhingra,0,"[1bc37ef7-52fb-4892-bf29-d3a331031ff3, 3fa8250...",Managing the internal IT service quality in pu...,business information systems,2015,d024d697-5e3c-4a74-a319-49f5157189ca
3,Testing database application is challenging be...,"Carsten Binnig, Donald Kossmann, Eric Lo",50,"[21e89a64-c646-4b21-88c1-1485f4e5e56e, 38ad366...",Testing database applications,international conference on management of data,2006,6978167c-dd85-4e29-bd1b-c87f84c3232d
4,This paper presents an algorithm that performs...,"Phillip E. Mitchell, Hong Yan",1,"[452cd918-88f1-4dcf-8b5b-3a90777ce061, 4807e49...",Connected pattern segmentation and title group...,international conference on pattern recognition,2004,f314d1c5-c509-4352-9173-ad7fd6f35188


In [4]:
# Preview the first five rows of the test frame
test.head(n=5)

Unnamed: 0,abstract,authors,references,title,venue,year
0,QuickCheck is a previously published random te...,"Koen Claessen, John Hughes","[29e038a1-12d0-40c7-94ac-6b9e184d94d3, 497520d...",Testing monadic code with QuickCheck,symposium/workshop on haskell,2002
1,The ability that users have to interact and ch...,"Edirlei Soares de Lima, Bruno Feijó, Simone Di...","[008c6924-ef5c-4f67-8b77-e92829ee9b28, 040b125...","Multimodal, Multi-user and Adaptive Interactio...",,2011
2,Hybrid-core systems speedup applications by of...,"Mitesh R. Meswani, Laura Carrington, Didem Una...","[035d2bb9-16d0-4fc6-9492-157b5f39705c, 208229d...",Modeling and predicting performance of high pe...,International Journal of High Performance Comp...,2013
3,Discerning the prevalence of e-government syst...,"Calvin Meng Lai Chan, Shan Ling Pan","[0c629499-abbf-4e67-8113-97399512f89f, 19d9170...",Resource Enactment in e-Government Systems Imp...,international conference on information systems,2006
4,"Wikipedia, a web-based collaboratively maintai...","Marcelo Yuji Himoro, Raíza Hanada, Marco Crist...","[0b26af09-c9b9-475f-a216-baa14fe42cd7, 1d09fed...",An investigation of the relationship between t...,brazilian symposium on multimedia and the web,2013


In [5]:
# Dimensions of the training frame as a (rows, columns) tuple
train.shape

(1189209, 8)

In [6]:
# Print a per-column summary of the training frame: non-null counts, dtypes
# and the frame's memory usage
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1189209 entries, 0 to 1189208
Data columns (total 8 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   abstract    1189209 non-null  object
 1   authors     1189209 non-null  object
 2   n_citation  1189209 non-null  int64 
 3   references  1189209 non-null  object
 4   title       1189209 non-null  object
 5   venue       1189209 non-null  object
 6   year        1189209 non-null  int64 
 7   id          1189209 non-null  object
dtypes: int64(2), object(6)
memory usage: 72.6+ MB


In [8]:
# Percentage of missing entries per column of the training frame.
# NOTE(review): isna() does not treat empty strings as missing, so blank text
# fields would not be counted here -- confirm whether that matters for this data.
missing_percentage = train.isna().sum().div(len(train)).mul(100)
missing_percentage

abstract      0.0
authors       0.0
n_citation    0.0
references    0.0
title         0.0
venue         0.0
year          0.0
id            0.0
dtype: float64