In [None]:
import pandas as pd

file_path = "I:/Data_for_practice/SciSciNet/SciSciNet_Papers.tsv"

chunk_size = 1000000  
col_names = [
    "PaperID", "DOI", "DocType", "PaperTitle", "BookTitle", "Year", "Date", "Publisher", 
    "JournalID", "ConferenceSeriesID", "Volume", "Issue", "FirstPage", "LastPage", 
    "ReferenceCount", "CitationCount", "OriginalVenue", "FamilyID", "RetractionType"
]

error_rows = []

for chunk in pd.read_csv(file_path, sep="\t", names=col_names, dtype=str, chunksize=chunk_size, on_bad_lines="skip"):
    chunk["column_count"] = chunk.apply(lambda row: row.count(), axis=1)  
    
    error_rows.extend(chunk[chunk["column_count"] != len(col_names)].index.tolist())

if error_rows:
    print(f"⚠️ {len(error_rows)}개의 문제가 있는 행이 발견됨")
    print("첫 10개의 문제 있는 행 인덱스:", error_rows[:10])
else:
    print("✅ 모든 행이 정상적인 컬럼 개수를 가지고 있음!")


⚠️ 133368289개의 문제가 있는 행이 발견됨
첫 10개의 문제 있는 행 인덱스: [('2789336', nan, 'Journal', '2008.0', '2008-01-01', '204851967.0', nan, '1.0', '1.0'), ('9552966', nan, 'Journal', '2005.0', '2005-05-01', '2737568815.0', nan, '0.0', '0.0'), ('9694411', nan, 'Conference', '1994.0', '1994-01-01', nan, '2755927266.0', '0.0', '0.0'), ('14355006', nan, 'Repository', '2011.0', '2011-01-01', '3121261024.0', nan, '0.0', '0.0'), ('14558443', '10.1007/978-3-662-45174-8_28', 'Conference', '2014.0', '2014-10-12', nan, '1131603458.0', '14.0', nan), ('15354235', '10.1007/978-3-662-44777-2_60', 'Conference', '2014.0', '2014-09-08', nan, '1154039276.0', '9.0', nan), ('16763247', '10.22004/AG.ECON.174942', 'Repository', '2014.0', '2014-01-01', '3121261024.0', nan, '0.0', nan), ('19586282', nan, 'Journal', '2007.0', '2007-07-01', '131967590.0', nan, '0.0', '0.0'), ('21605082', nan, 'Conference', '2001.0', '2001-04-01', nan, '1190393634.0', '18.0', '14.0'), ('23032558', nan, 'Journal', '1997.0', '1997-10-01', '275531352

In [None]:
import pandas as pd

file_path = "I:/Data_for_practice/SciSciNet/SciSciNet_Papers.tsv"

df = pd.read_csv(file_path, sep="\t", dtype=str, nrows=10)

df["num_columns"] = df.apply(lambda row: len(row.values), axis=1)
print(df["num_columns"].value_counts())

print(f"파일의 총 컬럼 개수: {len(df.columns)}")

print(df.head())


num_columns
28    10
Name: count, dtype: int64
파일의 총 컬럼 개수: 29
    PaperID                           DOI     DocType    Year        Date  \
0   2789336                           NaN     Journal  2008.0  2008-01-01   
1   9552966                           NaN     Journal  2005.0  2005-05-01   
2   9694411                           NaN  Conference  1994.0  1994-01-01   
3  14355006                           NaN  Repository  2011.0  2011-01-01   
4  14558443  10.1007/978-3-662-45174-8_28  Conference  2014.0  2014-10-12   

      JournalID ConferenceSeriesID Citation_Count  C10 Reference_Count  ...  \
0   204851967.0                NaN            1.0  1.0            13.0  ...   
1  2737568815.0                NaN            0.0  0.0            37.0  ...   
2           NaN       2755927266.0            0.0  0.0             0.0  ...   
3  3121261024.0                NaN            0.0  0.0             0.0  ...   
4           NaN       1131603458.0           14.0  NaN            11.0  ...   


In [None]:
with open("I:/Data_for_practice/SciSciNet/SciSciNet_Papers.tsv", "r", encoding="utf-8") as f:
    first_line = f.readline()
    print(first_line.split("\t"))  


['PaperID', 'DOI', 'DocType', 'Year', 'Date', 'JournalID', 'ConferenceSeriesID', 'Citation_Count', 'C10', 'Reference_Count', 'C5', 'Team_Size', 'Institution_Count', 'Disruption', 'Atyp_10pct_Z', 'Atyp_Pairs', 'Atyp_Median_Z', 'SB_B', 'SB_T', 'Patent_Count', 'Newsfeed_Count', 'Tweet_Count', 'NCT_Count', 'NIH_Count', 'NSF_Count', 'WSB_mu', 'WSB_sigma', 'WSB_Cinf\n']


In [None]:
import pandas as pd

# 원본 CSV 파일 로드
df = pd.read_csv("I:/Data_for_practice/SciSciNet/SciSciNet_Papers.tsv", sep="\t")

# 50만 행씩 저장
chunk_size = 2000000  
for i, chunk in enumerate(range(0, len(df), chunk_size)):
    df.iloc[chunk:chunk + chunk_size].to_csv(f"split_{i}.csv", sep="\t", index=False)


In [1]:
import csv

input_file = 'I:/Data_for_practice/SciSciNet/SciSciNet_PaperDetails.tsv/SciSciNet_PaperDetails.tsv'
output_file = 'I:/Data_for_practice/SciSciNet/SciSciNet_PaperDetails.tsv/SciSciNet_PaperDetails_fixed.tsv'

with open(input_file, 'r', encoding='utf-8') as infile, \
     open(output_file, 'w', encoding='utf-8', newline='') as outfile:

    reader = csv.reader(infile, delimiter='\t')
    writer = csv.writer(outfile, delimiter='\t', quoting=csv.QUOTE_NONNUMERIC)

    for row in reader:
        writer.writerow(row)

print(f"수정된 TSV 파일이 '{output_file}'에 저장되었습니다.")

수정된 TSV 파일이 'I:/Data_for_practice/SciSciNet/SciSciNet_PaperDetails.tsv/SciSciNet_PaperDetails_fixed.tsv'에 저장되었습니다.
