### Setting Up

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import pandas as pd

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project folder used by the
# following %cd cell is reachable from this Colab runtime.
drive.mount('/content/drive')

In [None]:
%cd /content/drive/MyDrive/Manuscript-Matcher

### Data Pre-processing

#### Load CSV files

In [None]:
# Journal-level table. The path is relative, so it resolves against the
# working directory selected by the earlier %cd cell.
journal_df = pd.read_csv("combined_journal_ranking_data.csv")

In [None]:
# Article-level table; the cleaning step below uses its 'Journal Name',
# 'Title' and 'Abstract' columns.
articles_df = pd.read_csv("doaj_articles_data.csv")

In [None]:
# ASJC code lookup table.
# NOTE(review): asjc_df is not referenced by any later cell shown in this
# notebook section - confirm whether it is still needed.
asjc_df = pd.read_csv("asjc_codes.csv")

#### Data cleaning

In [None]:
# Keep only the journal attributes used downstream and expose the journal
# title under the same column name the articles table uses ('Journal Name'),
# so the two frames can be merged on it.
journal_columns = ['Title', 'Open Access', 'Best Quartile', 'ASJC Codes', 'ISO Language Codes']
journal_df = journal_df[journal_columns].rename(columns={'Title': 'Journal Name'})

# Normalise the merge key. The vectorised .str accessor leaves missing titles
# as NaN instead of raising AttributeError the way `x.upper()` does on a float
# NaN. Surrounding whitespace is stripped as well because the article-side key
# is stripped before the merge, and the join requires an exact string match.
journal_df['Journal Name'] = journal_df['Journal Name'].str.strip().str.upper()

In [None]:
# Keep only the columns needed for matching; .copy() gives an independent
# frame so the column assignment below never writes into a view of the raw data.
articles_df = articles_df[['Journal Name', 'Title', 'Abstract']].copy()

# Normalise the merge key:
#   * drop everything from the last '(' onwards (a trailing parenthetical),
#   * trim surrounding whitespace,
#   * upper-case, because journal_df's key is upper-cased and the merge needs
#     an exact string match.
# The .str accessor propagates NaN, so rows with a missing journal name no
# longer raise here; they are removed by the later dropna() cell.
articles_df['Journal Name'] = (
    articles_df['Journal Name']
    .str.rsplit('(', n=1)
    .str[0]
    .str.strip()
    .str.upper()
)

In [None]:
# Number of article rows that exactly repeat an earlier row (all columns equal).
articles_df.duplicated().sum()

In [None]:
# Keep the first occurrence of every fully identical article row.
articles_df = articles_df.drop_duplicates()

In [None]:
# Missing-value count per column.
articles_df.isna().sum()

In [None]:
# Discard every article row that has a missing value in any column.
articles_df = articles_df.dropna()

In [None]:
# (rows, columns) left after de-duplication and null removal.
articles_df.shape

#### Merge two datasets

In [None]:
# Left join: every article row is kept and journal attributes are attached
# where 'Journal Name' matches; unmatched articles get NaN in those columns.
df = pd.merge(articles_df, journal_df, how='left', on='Journal Name')

In [None]:
# Shapes before and after the join. With a left join, df has at least as many
# rows as articles_df; it has more only when a journal name matches several
# journal_df rows.
articles_df.shape, journal_df.shape, df.shape

#### Final data cleaning

In [None]:
# Fully duplicated rows in the merged frame.
df.duplicated().sum()

In [None]:
# Rows whose abstract text repeats an earlier row's abstract.
df['Abstract'].duplicated().sum()

In [None]:
# Rows whose title repeats an earlier row's title.
df['Title'].duplicated().sum()

In [None]:
# De-duplicate in three passes, keeping the first occurrence each time:
# identical rows, then repeated abstracts, then repeated titles.
df = (
    df.drop_duplicates()
      .drop_duplicates(subset=['Abstract'])
      .drop_duplicates(subset=['Title'])
)

In [None]:
# Missing values per column. Articles whose journal found no match in
# journal_df appear here as nulls in the journal attribute columns.
df.isna().sum()

In [None]:
# Remove every merged row that still has a missing value in any column.
df = df.dropna()

In [None]:
# How many rows have 'Open Access' equal to False (displayed only).
df['Open Access'].eq(False).sum()

In [None]:
# How many rows have a language code other than exactly 'ENG' (displayed only).
df['ISO Language Codes'].ne('ENG').sum()

In [None]:
# Remove the journal attributes that are not carried into the saved dataset.
unused_columns = ['Open Access', 'Best Quartile', 'ISO Language Codes']
df = df.drop(columns=unused_columns)

In [None]:
# Switch the remaining columns to short lower-case names.
column_names = {
    'Journal Name': 'journal',
    'Title': 'title',
    'Abstract': 'abstract',
    'ASJC Codes': 'asjc_codes',
}
df = df.rename(columns=column_names)

In [None]:
# Articles per journal, most frequent first.
df['journal'].value_counts()

In [None]:
# Cap this journal: keep its first 1001 rows (positions 0-1000) and drop the
# rest, then renumber the index.
# NOTE(review): if the intended cap is 1000 rows the slice should start at 1000 - confirm.
heliyon_index = df.index[df['journal'] == 'HELIYON']
df = df.drop(index=heliyon_index[1001:]).reset_index(drop=True)

In [None]:
# Cap this journal: keep its first 1001 rows (positions 0-1000) and drop the
# rest, then renumber the index.
# NOTE(review): if the intended cap is 1000 rows the slice should start at 1000 - confirm.
sustainability_index = df.index[df['journal'] == 'SUSTAINABILITY']
df = df.drop(index=sustainability_index[1001:]).reset_index(drop=True)

In [None]:
# Cap this journal: keep its first 1001 rows (positions 0-1000) and drop the
# rest, then renumber the index.
# NOTE(review): if the intended cap is 1000 rows the slice should start at 1000 - confirm.
erl_index = df.index[df['journal'] == 'ENVIRONMENTAL RESEARCH LETTERS']
df = df.drop(index=erl_index[1001:]).reset_index(drop=True)

In [None]:
# (rows, columns) after capping the three journals above.
df.shape

### Save data

In [None]:
# Column dtypes, non-null counts and memory usage of the final frame.
df.info()

In [None]:
# Summary statistics for the final frame's columns.
df.describe()

In [None]:
# Preview the first five rows before writing them out.
df.head()

In [None]:
# Write the cleaned table to Drive without the index column. The destination
# is spelled as an absolute path, unlike the relative paths used for the inputs.
output_path = '/content/drive/MyDrive/Manuscript-Matcher/data.csv'
df.to_csv(output_path, index=False)