In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
loaded_df = pd.read_csv('Full_arXiv_data.csv')

In [3]:
# chekcing missing values
print("Missing values:")
print(loaded_df.isnull().sum())

Missing values:
Unnamed: 0    0
id            0
title         0
summary       0
published     0
authors       0
pdf_link      0
pdf_text      0
dtype: int64


In [13]:
# checking duplicate entries
duplicates = loaded_df.duplicated()
print('No Duplicates:', duplicates.shape[0])
print('Duplicate rows True/False:')
print(duplicates)

No Duplicates: 13384
Duplicate rows True/False:
0        False
1        False
2        False
3        False
4        False
         ...  
13379    False
13380    False
13381    False
13382    False
13383    False
Length: 13384, dtype: bool


In [5]:
# drop the rows without useful information, rows do not start with 'http'
loaded_df = loaded_df[loaded_df['id'].str.strip().str.lower().str.startswith('http')]

In [6]:
print(loaded_df.head())

   Unnamed: 0                                       id   
0           0  http://arxiv.org/abs/cond-mat/0102536v1  \
1           1         http://arxiv.org/abs/0710.5276v1   
2           2         http://arxiv.org/abs/0912.3431v1   
3           3  http://arxiv.org/abs/cond-mat/0312522v1   
4           4  http://arxiv.org/abs/cond-mat/0105367v1   

                                               title   
0  Impact of Electron-Electron Cusp on Configurat...  \
1  Electronegativity in quantum electronic transport   
2  Comment on "Electron transport through correla...   
3                      Correlated Electron Transport   
4     Many-Electron Systems with Constrained Current   

                                             summary             published   
0  The effect of the electron-electron cusp on th...  2001-02-28T20:12:09Z  \
1  Electronegativity is shown to control charge t...  2007-10-28T11:27:36Z   
2  The many electron correlated scattering (MECS)...  2009-12-17T18:05:36Z   
3 

In [7]:
# checking missing value in 'published' column
print("Number of missing values in 'published' column:", loaded_df['published'].isnull().sum())

Number of missing values in 'published' column: 0


In [8]:
# check the 'published' column before cleaning it
print("Old 'published' column:")
print(loaded_df['published'].head())

Old 'published' column:
0    2001-02-28T20:12:09Z
1    2007-10-28T11:27:36Z
2    2009-12-17T18:05:36Z
3    2003-12-19T11:56:11Z
4    2001-05-18T09:02:21Z
Name: published, dtype: object


In [9]:
# cobvert the 'published' column's data type from 'object' to 'date time'
loaded_df['published'] = pd.to_datetime(loaded_df['published'], errors='coerce')

In [10]:
# Convert into long date format 'Month/Day/Year/'
loaded_df['published'] = loaded_df['published'].dt.strftime('%m/%d/%Y')

In [11]:
# check the new 'published' column
print("New 'published' column:")
print(loaded_df['published'].head())

New 'published' column:
0    02/28/2001
1    10/28/2007
2    12/17/2009
3    12/19/2003
4    05/18/2001
Name: published, dtype: object


In [12]:
# drop the first column, 'Unnamed' which only contains cardinal numbers, characters, and sentence fragments
loaded_df = loaded_df.drop(loaded_df.columns[0], axis=1)
# df after dropping the 'Unnamed' column
print(loaded_df.head())

                                        id   
0  http://arxiv.org/abs/cond-mat/0102536v1  \
1         http://arxiv.org/abs/0710.5276v1   
2         http://arxiv.org/abs/0912.3431v1   
3  http://arxiv.org/abs/cond-mat/0312522v1   
4  http://arxiv.org/abs/cond-mat/0105367v1   

                                               title   
0  Impact of Electron-Electron Cusp on Configurat...  \
1  Electronegativity in quantum electronic transport   
2  Comment on "Electron transport through correla...   
3                      Correlated Electron Transport   
4     Many-Electron Systems with Constrained Current   

                                             summary   published   
0  The effect of the electron-electron cusp on th...  02/28/2001  \
1  Electronegativity is shown to control charge t...  10/28/2007   
2  The many electron correlated scattering (MECS)...  12/17/2009   
3  Theoretical and experimental values to date fo...  12/19/2003   
4  A formulation for transport in an inhomogen

In [23]:
output_csv_path = 'index_clean_full_arXiv_data5.csv'
loaded_df.to_csv(output_csv_path, index=True)